Source code for pywrangler.pandas.base

"""This module contains the pandas base wrangler.

"""

import pandas as pd

from pywrangler.base import BaseWrangler


class PandasWrangler(BaseWrangler):
    """Pandas wrangler base class.

    Reports ``"pandas"`` as its computation engine and offers a helper to
    verify that a transformation did not change the number of rows.

    """

    @property
    def computation_engine(self):
        """Name of the computation engine used by this wrangler."""
        return "pandas"

    def _validate_output_shape(self, df_in: pd.DataFrame,
                               df_out: pd.DataFrame):
        """If wrangler implementation preserves sample size, assert equal
        sample sizes between input and output dataframe.

        Using pandas, all data is in memory. Hence, getting shape information
        is cheap and this check can be done regularly (in contrast to pyspark
        where `df.count()` can be very expensive).

        Parameters
        ----------
        df_in: pd.DataFrame
            Input dataframe.
        df_out: pd.DataFrame
            Output dataframe.

        Raises
        ------
        ValueError
            If the wrangler is flagged as preserving the sample size but the
            row counts of `df_in` and `df_out` differ.

        """

        # NOTE(review): `preserves_sample_size` is not defined in this class;
        # presumably it is provided by `BaseWrangler` -- confirm.
        if not self.preserves_sample_size:
            return

        # Only the first axis (number of rows) is compared.
        shape_in = df_in.shape[0]
        shape_out = df_out.shape[0]

        if shape_in != shape_out:
            raise ValueError('Number of input samples ({}) does not match '
                             'number of output samples ({}) which should '
                             'be the case because wrangler is supposed to '
                             'preserve the number of samples.'
                             .format(shape_in, shape_out))
class PandasSingleNoFit(PandasWrangler):
    """Supply `fit` and `fit_transform` for wranglers that take a single
    data frame as input, return a single data frame as output and do not
    require any fitting.

    """

    def fit(self, df: pd.DataFrame):
        """Return the wrangler itself without doing any work.

        Nothing is learned from `df`; the method exists only so that the
        usual fit/transform API is available, e.g. for use in pipelines.

        Parameters
        ----------
        df: pd.DataFrame

        Returns
        -------
        self
            The unchanged wrangler instance.

        """

        return self

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run `fit` followed by `transform` on the same data frame.

        Parameters
        ----------
        df: pd.DataFrame

        Returns
        -------
        result: pd.DataFrame

        """

        # NOTE(review): `transform` is not defined in this class; presumably
        # concrete wranglers implement it -- confirm.
        fitted = self.fit(df)
        return fitted.transform(df)