# Source code for pywrangler.pandas.util

"""This module contains utility functions (e.g. validation) commonly used by
pandas wranglers.

"""

import numpy as np
import pandas as pd
from pandas.core.groupby.generic import DataFrameGroupBy

from pywrangler.util.sanitizer import ensure_iterable
from pywrangler.util.types import TYPE_ASCENDING, TYPE_COLUMNS


def validate_empty_df(df: pd.DataFrame) -> None:
    """Check for empty dataframe. By definition, wranglers operate on non
    empty dataframes. Therefore, raise an error if the dataframe is empty.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to check against.

    Raises
    ------
    ValueError
        If `df.empty` is true.

    """

    if df.empty:
        raise ValueError('Dataframe is empty.')


def validate_columns(df: pd.DataFrame, columns: TYPE_COLUMNS) -> None:
    """Check that columns exist in dataframe and raise error if otherwise.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to check against.
    columns: iterable[str]
        Columns to be validated. The value is passed through
        `ensure_iterable` before being iterated (presumably so that a single
        column name is accepted as well -- confirm against that helper).

    Raises
    ------
    ValueError
        For the first column that is not contained in `df.columns`.

    """

    # The set of available columns does not change while validating, so
    # look it up once instead of once per requested column.
    available = df.columns

    for column in ensure_iterable(columns):
        if column not in available:
            raise ValueError('Column with name `{}` does not exist. '
                             'Please check parameter settings.'
                             .format(column))


def sort_values(df: pd.DataFrame,
                order_columns: TYPE_COLUMNS,
                ascending: TYPE_ASCENDING) -> pd.DataFrame:
    """Convenient function to return sorted dataframe while taking care of
    optional order columns and order (ascending/descending).

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to be sorted.
    order_columns: TYPE_COLUMNS
        Columns to sort by. If falsy (e.g. `None` or an empty list), no
        sorting is performed.
    ascending: TYPE_ASCENDING
        Sort direction, forwarded unchanged as the `ascending` argument of
        `pd.DataFrame.sort_values`.

    Returns
    -------
    df_sorted: pd.DataFrame
        The result of `df.sort_values` if order columns are given. Otherwise
        the very same `df` object that was passed in (not a copy).

    """

    # Nothing to sort by: hand the input back untouched.
    if not order_columns:
        return df

    return df.sort_values(order_columns, ascending=ascending)


def groupby(df: pd.DataFrame,
            groupby_columns: TYPE_COLUMNS) -> DataFrameGroupBy:
    """Convenient function to group by a dataframe while taking care of
    optional groupby columns. Always returns a `DataFrameGroupBy` object.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to be grouped.
    groupby_columns: TYPE_COLUMNS
        Columns to be grouped by. If falsy (e.g. `None` or an empty list),
        all rows are put into one single group.

    Returns
    -------
    groupby: DataFrameGroupBy

    """

    if groupby_columns:
        return df.groupby(groupby_columns)

    # No groupby columns given: group by an array holding one zero per row,
    # so every row shares the same key and callers still get a groupby
    # object to work with.
    return df.groupby(np.zeros(df.shape[0]))