Source code for pywrangler.util.testing.plainframe

"""This module contains the PlainFrame and PlainColumn classes.

"""
import collections
import copy
import functools
import numbers
from datetime import datetime
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, \
    Union, NamedTuple

import numpy as np
import pandas as pd
import tabulate
from numpy.testing import assert_equal
from pandas.api import types as pd_types
from pywrangler.util.dependencies import is_available, requires


@functools.total_ordering
class NullValue:
    """Represents null values. Provides operator comparison functions to
    allow sorting which is required to determine row order of data tables.

    """

    def __repr__(self):
        return "NULL"

    def __lt__(self, other):
        return self

    def __eq__(self, other):
        return isinstance(other, NullValue)

    def __hash__(self):
        return hash(repr(self))

NaN = np.NaN
NULL = NullValue()

TYPE_ROW = List[Union[bool, int, float, str, datetime, NullValue]]
TYPE_DSTR = Dict[str, str]
TYPE_DTYPE_INPUT = Union[List[str], TYPE_DSTR]
TYPE_ANY_PF = Union['PlainFrame', dict, tuple, pd.DataFrame,
                    'pyspark.sql.DataFrame']

PRIMITIVE_TYPES = {"bool": (bool, NullValue),
                   "int": (int, NullValue),
                   "float": (float, int, NullValue),
                   "str": (str, NullValue),
                   "datetime": (datetime, NullValue)}

TYPE_ABBR = {"i": "int",
             "b": "bool",
             "f": "float",
             "s": "str",
             "d": "datetime"}

_ImmutablePlainColumn = NamedTuple("_ImmutablePlainColumn",
                                   [("name", str),
                                    ("dtype", str),
                                    ("values", tuple)])

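# A minimal sketch of the NULL sentinel semantics defined above: NULL compares
# equal to any other NullValue instance, has a stable repr and provides
# comparison operators so that rows containing missing values can still be
# sorted (see `EqualityAsserter._get_row_order`).
#
#   >>> NULL == NullValue()
#   True
#   >>> repr(NULL)
#   'NULL'
#   >>> NaN is np.NaN
#   True
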
class PlainColumn(_ImmutablePlainColumn):
    """Represents an immutable column of a PlainFrame consisting of a name,
    dtype and values. Ensures type validity.

    Instantiation should be performed via the `from_plain` factory method
    which adds preprocessing steps to ensure type correctness.

    In addition, it contains conversion methods for all supported computation
    engines.

    """

    def __init__(self, *args, **kwargs):
        self._check_dtype()

    @property
    def typed_column(self) -> str:
        """Return typed column annotation of PlainColumn.

        """

        return "{}:{}".format(self.name, self.dtype)

    @property
    def has_null(self) -> bool:
        """Signals presence of NULL values.

        """

        return any([x is NULL for x in self.values])

    @property
    def has_nan(self) -> bool:
        """Signals presence of NaN values.

        """

        return any([x is np.NaN for x in self.values])

    @property
    def to_pandas(self) -> 'ConverterToPandas':
        """Composite for conversion functionality to pandas.

        """

        return ConverterToPandas(self)

    @property
    @requires("pyspark")
    def to_pyspark(self) -> 'ConverterToPySpark':
        """Composite for conversion functionality to pyspark.

        """

        return ConverterToPySpark(self)

    @staticmethod
    def _preprocess_datetime(values: Sequence) \
            -> Tuple[Union[datetime, NullValue]]:
        """Convenience method to allow timestamps of various formats.

        """

        processed = [pd.Timestamp(x).to_pydatetime()
                     if not isinstance(x, NullValue)
                     else x
                     for x in values]

        return tuple(processed)

    @staticmethod
    def _preprocess_float(values: Sequence) -> Tuple[Union[float, NullValue]]:
        """Convenience method to ensure numeric values are cast to float.

        """

        processed = [float(x)
                     if isinstance(x, numbers.Number)
                     else x
                     for x in values]

        return tuple(processed)

    def _check_dtype(self):
        """Ensures correct type of all values. Raises TypeError otherwise.

        """

        # assert valid dtype
        if self.dtype not in PRIMITIVE_TYPES:
            raise ValueError("Type '{}' is invalid. Following types are "
                             "allowed: {}"
                             .format(self.dtype, PRIMITIVE_TYPES.keys()))

        # assert valid dtypes for values
        allowed_types = PRIMITIVE_TYPES[self.dtype]

        for value in self.values:
            if not isinstance(value, allowed_types):
                raise TypeError("Column '{}' has invalid value '{}' with "
                                "invalid type '{}'. Allowed types are: {}."
                                .format(self.name, value, type(value),
                                        allowed_types))

    def modify(self, modifications: Dict[int, Any]) -> 'PlainColumn':
        """Modifies PlainColumn and returns new instance. Modification does
        not change dtype, name or the number of values. One or more values
        will be modified.

        Parameters
        ----------
        modifications: dict
            Dictionary containing modifications with keys representing row
            indices and values representing new values.

        Returns
        -------
        modified: PlainColumn

        """

        n_rows = len(self.values)

        values = [modifications.get(idx, self.values[idx])
                  for idx in range(n_rows)]

        return PlainColumn.from_plain(name=self.name,
                                      dtype=self.dtype,
                                      values=values)

    @classmethod
    def from_plain(cls, name: str, dtype: str, values: Sequence) \
            -> 'PlainColumn':
        """Factory method to instantiate PlainColumn from plain objects. Adds
        preprocessing steps for float and datetime types.

        Parameters
        ----------
        name: str
            Name of the column.
        dtype: str
            Data type of the column. Must be one of bool, int, float, str or
            datetime.
        values: sequence
            Sequence of values.

        Returns
        -------
        plaincolumn: PlainColumn

        """

        # preprocess
        if dtype == "float":
            values = cls._preprocess_float(values)
        elif dtype == "datetime":
            values = cls._preprocess_datetime(values)

        values = tuple(values)

        return cls(name=name, dtype=dtype, values=values)

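# A minimal usage sketch for `PlainColumn.from_plain`: integers are upcast to
# float by the float preprocessing step and NULL values are preserved.
#
#   >>> col = PlainColumn.from_plain(name="amount", dtype="float",
#   ...                              values=[1, 2, NULL])
#   >>> col.values
#   (1.0, 2.0, NULL)
#   >>> col.typed_column
#   'amount:float'
#   >>> col.has_null, col.has_nan
#   (True, False)
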
_ImmutablePlainFrame = NamedTuple("_ImmutablePlainFrame",
                                  [("plaincolumns", Tuple[PlainColumn])]
                                  )

class PlainFrame(_ImmutablePlainFrame):
    """Resembles an immutable dataframe in plain python. Its main purpose is
    to represent test data that is independent of any computation engine
    specific characteristics. It serves as a common baseline format. However,
    in order to be usable for all engines, it can be converted to and from any
    computation engine's data representation.

    This allows test data to be formulated in an engine-independent way only
    once and to be employed for all computation engines simultaneously.

    The main focus lies on simple but correct data representation. This
    includes explicit values for NULL and NaN. Each column needs to be typed.
    Available types are integer, boolean, string, float and datetime. For
    simplicity, all values will be represented as plain python types (no 3rd
    party). Hence, it is not intended to be used for large amounts of data due
    to its representation in plain python objects.

    There are several limitations. No index column is supported (as in
    pandas). Mixed dtypes are not supported (like dtype object in pandas). No
    distinction is made between int32/int64 or single/double floats. Only
    primitive/atomic types are supported (pyspark's ArrayType or MapType are
    currently not supported).

    Essentially, a PlainFrame consists of only 3 attributes: column names,
    column types and column values. In addition, it provides conversion
    methods for all computation engines. It does not offer any computation
    methods itself because it only represents data.

    """

    def __init__(self, *args, **kwargs):
        self._validate_plaincolumns()

    @property
    def columns(self) -> List[str]:
        """Return column names of PlainFrame.

        """

        return [column.name for column in self.plaincolumns]

    @property
    def dtypes(self) -> List[str]:
        """Return dtypes of columns of PlainFrame.

        """

        return [column.dtype for column in self.plaincolumns]

    @property
    def data(self) -> List[List[Any]]:
        """Return data of PlainFrame row wise.

        """

        column_wise = [column.values for column in self.plaincolumns]
        row_wise = [list(row) for row in zip(*column_wise)]

        return row_wise

    @property
    def n_rows(self) -> int:
        """Return the number of rows.

        """

        return len(self.plaincolumns[0].values)

    @property
    def n_cols(self) -> int:
        """Return the number of columns.

        """

        return len(self.plaincolumns)

    @property
    def assert_equal(self) -> 'EqualityAsserter':
        """Return equality assertion composite.

        """

        return EqualityAsserter(self)

    def modify(self, modifications: Dict[str, Dict[int, Any]]) \
            -> 'PlainFrame':
        """Modifies PlainFrame and returns new instance. Modification does
        not change dtype, name or the number of values of defined columns.
        One or more values of one or more columns will be modified.

        Parameters
        ----------
        modifications: dict
            Contains modifications. Keys represent column names and values
            represent column specific modifications.

        Returns
        -------
        modified: PlainFrame

        """

        modified = []
        for plaincolumn in self.plaincolumns:
            try:
                modification = modifications[plaincolumn.name]
                modified.append(plaincolumn.modify(modification))
            except KeyError:
                modified.append(plaincolumn)

        return PlainFrame(plaincolumns=tuple(modified))

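    # A minimal sketch of `modify`: keys of the outer dict are column names,
    # keys of the inner dict are row indices.
    #
    #   >>> pf = PlainFrame.from_plain(data=[[1, "a"], [2, "b"]],
    #   ...                            columns=["x:int", "y:str"])
    #   >>> pf.modify({"x": {0: 99}}).get_column("x").values
    #   (99, 2)
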
    def to_pandas(self) -> pd.DataFrame:
        """Converts test data table into a pandas dataframe.

        """

        data = {column.name: column.to_pandas()
                for column in self.plaincolumns}

        return pd.DataFrame(data, columns=self.columns)

    @requires("pyspark")
    def to_pyspark(self):
        """Converts test data table into a pyspark dataframe.

        """

        from pyspark.sql import types

        converted = [column.to_pyspark() for column in self.plaincolumns]
        fields, values = zip(*converted)

        data = list(zip(*values))
        schema = types.StructType(fields)

        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()

        return spark.createDataFrame(data=data, schema=schema)

    def _validate_plaincolumns(self):
        """Check plaincolumns in regard to validity constraints. Raises
        ValueError in case of invalidity.

        """

        # assert tuple for plaincolumns and PlainColumn for its elements
        if not isinstance(self.plaincolumns, tuple):
            raise ValueError("PlainFrame was instantiated incorrectly. "
                             "`plaincolumns` needs to be of type `tuple`. "
                             "However, {} was encountered. Please use "
                             "`PlainFrame.from_plain` instead for convenient "
                             "instantiation and proper type casts."
                             .format(type(self.plaincolumns)))

        not_plaincolumn = [type(column)
                           for column in self.plaincolumns
                           if not isinstance(column, PlainColumn)]

        if not_plaincolumn:
            raise ValueError("PlainFrame was instantiated incorrectly. "
                             "Elements of `plaincolumns` need to be of type "
                             "`PlainColumn`. However, {} was encountered. "
                             "Please use `PlainFrame.from_plain` instead for "
                             "convenient instantiation and proper type casts."
                             .format(not_plaincolumn))

        # assert equal number of values per column
        row_lengths = {len(column.values) for column in self.plaincolumns}
        if len(row_lengths) > 1:
            raise ValueError("Input data has varying number of values per "
                             "column. Please check provided input data.")

        # assert unique column names
        duplicates = {x for x in self.columns if self.columns.count(x) > 1}
        if duplicates:
            raise ValueError("Duplicated column names encountered: {}. "
                             "Please use unique column names."
                             .format(duplicates))

    @staticmethod
    def _validate_from_plain(data: Sequence[Sequence],
                             columns: Sequence[str],
                             dtypes: Sequence[str],
                             row_wise: bool):
        """Validates input given to the `from_plain` factory. Raises
        ValueError in case of invalid input.

        Parameters
        ----------
        data: list
            List of iterables representing the input data.
        columns: list
            List of strings representing the column names. Typed annotations
            are allowed to be used here and will be checked if `dtypes` is
            not provided.
        dtypes: list, optional
            List of column types.
        row_wise: bool, optional
            By default, assumes `data` is provided in row wise format. All
            values belonging to the same row are stored in the same array. In
            contrast, if `row_wise` is False, column wise alignment is
            assumed. In this case, all values belonging to the same column
            are stored in the same array.

        """

        if row_wise:
            # assert equal number of elements across rows
            row_lengths = {len(row) for row in data}
            if len(row_lengths) > 1:
                raise ValueError("Input data has varying number of values "
                                 "per row. Please check provided input data.")

            # assert equal number of columns and elements per row
            row_lengths.add(len(columns))
            if len(row_lengths) > 1:
                raise ValueError("Number of columns has to equal the number "
                                 "of values per row. Please check column "
                                 "names and provided input data.")

            # assert equal number of dtypes and elements per row
            row_lengths.add(len(dtypes))
            if len(row_lengths) > 1:
                raise ValueError("Number of dtypes has to equal the number "
                                 "of values per row. Please check dtypes and "
                                 "provided input data.")

        else:
            # assert equal number of elements across columns
            col_lengths = {len(col) for col in data}
            if len(col_lengths) > 1:
                raise ValueError("Input data has varying number of values "
                                 "per column. Please check provided input "
                                 "data.")

            # assert equal number of columns in data, column names and dtypes
            col_count = len(columns)
            if col_count != len(data):
                raise ValueError("Input data and column names have different "
                                 "amount of columns. Please check provided "
                                 "input data.")

            if col_count != len(dtypes):
                raise ValueError("Input data and dtypes have different "
                                 "amount of columns. Please check provided "
                                 "input data.")

    @classmethod
    def from_pandas(cls, df: pd.DataFrame,
                    dtypes: TYPE_DTYPE_INPUT = None) -> 'PlainFrame':
        """Instantiate `PlainFrame` from pandas DataFrame.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to be converted.
        dtypes: list, dict, optional
            If a list is provided, each value represents a dtype and maps to
            one column of the dataframe in given order. If a dict is
            provided, keys refer to column names and values represent dtypes.

        Returns
        -------
        datatable: PlainFrame
            Converted dataframe.

        """

        converter = ConverterFromPandas(df)

        return converter(cls, dtypes=dtypes)

    @classmethod
    def from_pyspark(cls, df: 'pyspark.sql.DataFrame') -> 'PlainFrame':
        """Instantiate `PlainFrame` from pyspark DataFrame.

        Parameters
        ----------
        df: pyspark.sql.DataFrame
            Dataframe to be converted.

        Returns
        -------
        datatable: PlainFrame
            Converted dataframe.

        """

        converter = ConverterFromPySpark(df)

        return converter(cls)

    @classmethod
    def from_plain(cls,
                   data: Sequence[Sequence],
                   columns: Sequence[str],
                   dtypes: Optional[Sequence[str]] = None,
                   row_wise: bool = True):
        """Instantiate `PlainFrame` from plain python objects. Dtypes have to
        be provided either via `columns` as typed column annotations or
        directly via `dtypes`. Typed column annotations are a convenient way
        to omit the `dtypes` parameter while specifying dtypes directly with
        the `columns` parameter.

        An example of a typed column annotation is as follows:

        >>> columns = ["col_a:int", "col_b:str", "col_c:float"]

        Abbreviations may also be used like:

        >>> columns = ["col_a:i", "col_b:s", "col_c:f"]

        For a complete abbreviation mapping, please see `TYPE_ABBR`.

        Parameters
        ----------
        data: list
            List of iterables representing the input data.
        columns: list
            List of strings representing the column names. Typed annotations
            are allowed to be used here and will be checked if `dtypes` is
            not provided.
        dtypes: list, optional
            List of column types.
        row_wise: bool, optional
            By default, assumes `data` is provided in row wise format. All
            values belonging to the same row are stored in the same array. In
            contrast, if `row_wise` is False, column wise alignment is
            assumed. In this case, all values belonging to the same column
            are stored in the same array.

        Returns
        -------
        plainframe: PlainFrame

        """

        # check for typed columns
        if dtypes is None:
            columns, dtypes = cls._parse_typed_columns(columns)

        # validate input data
        cls._validate_from_plain(data=data,
                                 columns=columns,
                                 dtypes=dtypes,
                                 row_wise=row_wise)

        # transpose data if row wise
        if row_wise:
            data = list(zip(*data))

        # handle empty data GH#29
        if not data:
            data = [[]] * len(columns)

        # instantiate PlainColumns
        zipped = zip(columns, dtypes, data)
        plaincolumns = [PlainColumn.from_plain(column, dtype, values)
                        for column, dtype, values in zipped]

        return cls(plaincolumns=tuple(plaincolumns))

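    # A minimal sketch of `from_plain` with typed column annotations and
    # column wise input:
    #
    #   >>> pf = PlainFrame.from_plain(data=[[1, 2], ["a", NULL]],
    #   ...                            columns=["x:int", "y:str"],
    #   ...                            row_wise=False)
    #   >>> pf.columns, pf.dtypes, pf.n_rows
    #   (['x', 'y'], ['int', 'str'], 2)
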
    def to_plain(self) -> Tuple[List[List], List[str], List[str]]:
        """Converts PlainFrame into a tuple with 3 values (data, columns,
        dtypes).

        Returns
        -------
        data, columns, dtypes

        """

        return self.data, self.columns, self.dtypes

    @classmethod
    def from_dict(cls, data: 'collections.OrderedDict[str, Sequence]') \
            -> 'PlainFrame':
        """Instantiate `PlainFrame` from an ordered dict. Assumes keys to be
        column names with type annotations and values to be values.

        Parameters
        ----------
        data: dict
            Keys represent typed column annotations and values represent
            data values.

        Returns
        -------
        plainframe: PlainFrame

        """

        typed_columns, values = zip(*data.items())
        columns, dtypes = cls._parse_typed_columns(typed_columns)

        return cls.from_plain(data=values,
                              columns=columns,
                              dtypes=dtypes,
                              row_wise=False)

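    # A minimal sketch of `from_dict`: keys carry the typed column
    # annotations, values carry the column data. `to_dict` (below) is the
    # inverse operation.
    #
    #   >>> pf = PlainFrame.from_dict(
    #   ...     collections.OrderedDict([("x:int", [1, 2]),
    #   ...                              ("y:str", ["a", "b"])]))
    #   >>> pf.to_dict()
    #   OrderedDict([('x:int', (1, 2)), ('y:str', ('a', 'b'))])
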
    def to_dict(self) -> 'collections.OrderedDict[str, tuple]':
        """Converts PlainFrame into a dictionary with typed columns as keys
        and data as values.

        Returns
        -------
        table_dict: OrderedDict

        """

        columns = [(column.typed_column, column.values)
                   for column in self.plaincolumns]

        return collections.OrderedDict(columns)

    @classmethod
    def from_any(cls, raw: TYPE_ANY_PF) -> 'PlainFrame':
        """Instantiate `PlainFrame` from any supported type. Checks the
        following scenarios: If a PlainFrame is given, simply pass it
        through. If a dict is given, call `from_dict`. If a tuple is given,
        call `from_plain`. If a pandas dataframe is given, call
        `from_pandas`. If a spark dataframe is given, call `from_pyspark`.

        Parameters
        ----------
        raw: TYPE_ANY_PF
            Input to be converted.

        Returns
        -------
        plainframe: PlainFrame

        """

        if isinstance(raw, cls):
            return raw

        elif isinstance(raw, dict):
            return cls.from_dict(raw)

        elif isinstance(raw, tuple):
            return cls.from_plain(*raw)

        elif isinstance(raw, pd.DataFrame):
            return cls.from_pandas(raw)

        if is_available("pyspark"):
            from pyspark.sql.dataframe import DataFrame
            if isinstance(raw, DataFrame):
                return cls.from_pyspark(raw)

        raise ValueError("Unsupported data encountered. Data needs to be a "
                         "PlainFrame, a dict, a tuple, a pandas DataFrame or "
                         "a pyspark DataFrame. Provided type is {}."
                         .format(type(raw)))

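    # A minimal sketch of `from_any` dispatch, reusing `pf` from the sketch
    # above: existing PlainFrames pass through unchanged, dicts and tuples
    # are routed to `from_dict` and `from_plain`, dataframes to the
    # respective converter.
    #
    #   >>> PlainFrame.from_any(pf) is pf
    #   True
    #   >>> PlainFrame.from_any({"x:int": [1, 2]}).dtypes
    #   ['int']
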
    @staticmethod
    def _parse_typed_columns(typed_columns: Sequence[str]) \
            -> Tuple[Tuple[str], Tuple[str]]:
        """Separates column names and corresponding type annotations from
        column names with type annotation strings.

        For example, ["a:int", "b:str"] will be separated into ("a", "b") and
        ("int", "str").

        """

        splitted = [x.split(":") for x in typed_columns]

        # assert correct split
        invalid = [x for x in splitted if len(x) != 2]
        if invalid:
            raise ValueError("Invalid typed column format encountered: {}. "
                             "Typed columns should be formulated like "
                             "'col_name:type_name', e.g. 'col1:int'. Please "
                             "be aware that this error may occur if you omit "
                             "dtypes when instantiating `PlainFrame`."
                             .format(invalid))

        # get column names and corresponding types
        cols, types = zip(*splitted)

        # complete type abbreviations
        types = tuple([TYPE_ABBR.get(x, x) for x in types])

        # check valid types
        invalid_types = set(types).difference(TYPE_ABBR.values())
        if invalid_types:
            raise ValueError("Invalid types encountered: {}. Valid types "
                             "are: {}."
                             .format(invalid_types, TYPE_ABBR.items()))

        return cols, types

    def get_column(self, name: str) -> PlainColumn:
        """Convenient access to a PlainColumn via its column name.

        Parameters
        ----------
        name: str
            Label identifier for columns.

        Returns
        -------
        column: PlainColumn

        """

        try:
            idx = self.columns.index(name)
            column = self.plaincolumns[idx]
            return column
        except ValueError:
            raise ValueError("Column '{}' does not exist. Available columns "
                             "are: {}"
                             .format(name, self.columns))

    def __getitem__(self, subset: Union[str, Sequence[str], slice]) \
            -> 'PlainFrame':
        """Get label-based subset of PlainFrame. Supports single columns,
        lists and slices of columns.

        Parameters
        ----------
        subset: str, list, slice
            Defines column subset to be returned. If a single str is passed,
            returns a single column. If a list of strings is passed, returns
            the corresponding columns. If a slice is passed, returns all
            columns included within the slice (start and stop inclusive).

        Returns
        -------
        table: PlainFrame

        """

        # handle different input types
        if isinstance(subset, int):
            return tuple.__getitem__(self, subset)

        elif isinstance(subset, str):
            columns = [subset]

        elif isinstance(subset, (list, tuple)):
            columns = subset

        elif isinstance(subset, slice):
            idx_start = subset.start
            idx_stop = subset.stop

            if idx_start is None:
                idx_start = 0
            elif not isinstance(idx_start, int):
                idx_start = self.columns.index(idx_start)

            if idx_stop is None:
                idx_stop = len(self.columns)
            elif not isinstance(idx_stop, int):
                idx_stop = self.columns.index(idx_stop) + 1

            columns = self.columns[idx_start:idx_stop]

        else:
            raise ValueError("Subsetting requires str, list, tuple or slice. "
                             "However, {} was encountered."
                             .format(type(subset)))

        # check column names
        invalid = [column for column in columns
                   if column not in self.columns]
        if invalid:
            raise ValueError("Columns '{}' do not exist. Available column "
                             "names are: {}"
                             .format(invalid, self.columns))

        plaincolumns = tuple([self.get_column(column) for column in columns])

        return PlainFrame(plaincolumns=plaincolumns)

    def __repr__(self):
        """Get table as ASCII representation.

        """

        headers = ["{}\n({})".format(column, dtype)
                   for column, dtype in zip(self.columns, self.dtypes)]

        preserve = copy.copy(tabulate.MIN_PADDING)
        tabulate.MIN_PADDING = 0

        _repr = tabulate.tabulate(tabular_data=self.data,
                                  headers=headers,
                                  numalign="center",
                                  stralign="center",
                                  tablefmt="psql",
                                  showindex="always")

        tabulate.MIN_PADDING = preserve

        return _repr

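# A minimal sketch of label-based subsetting via `__getitem__`: note that
# label slices include the stop column (unlike positional python slices).
#
#   >>> pf = PlainFrame.from_plain(data=[[1, "a", 1.0]],
#   ...                            columns=["x:int", "y:str", "z:float"])
#   >>> pf["y"].columns
#   ['y']
#   >>> pf["x":"y"].columns
#   ['x', 'y']
#   >>> pf[["z", "x"]].columns
#   ['z', 'x']
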
class ConverterFromPySpark:
    """Convert pyspark dataframe into PlainFrame.

    """

    TYPE_MAPPING = {"smallint": "int",
                    "int": "int",
                    "bigint": "int",
                    "boolean": "bool",
                    "float": "float",
                    "double": "float",
                    "string": "str",
                    "timestamp": "datetime",
                    "date": "datetime"}

    def __init__(self, df: 'pyspark.sql.DataFrame'):
        self.df = df

    def __call__(self, cls) -> 'PlainFrame':
        """Converts pyspark dataframe to PlainFrame. Several types are not
        supported including BinaryType, DecimalType, ByteType, ArrayType and
        MapType.

        Parameters
        ----------
        cls: type
            Class used for instantiation.

        Returns
        -------
        datatable: pywrangler.util.testing.plainframe.PlainFrame
            Converted dataframe.

        """

        data = list(map(self.convert_null, self.df.collect()))
        columns, dtypes = self.get_column_dtypes()

        return cls.from_plain(data=data, columns=columns, dtypes=dtypes)

    def get_column_dtypes(self) -> Tuple[List[str], List[str]]:
        """Get column names and corresponding dtypes.

        """

        columns, pyspark_dtypes = zip(*self.df.dtypes)

        # check unsupported pyspark dtypes
        unsupported = set(pyspark_dtypes).difference(self.TYPE_MAPPING.keys())
        if unsupported:
            raise ValueError("Unsupported dtype encountered: {}. Supported "
                             "dtypes are: {}."
                             .format(unsupported, self.TYPE_MAPPING.keys()))

        dtypes = [self.TYPE_MAPPING[dtype] for dtype in pyspark_dtypes]

        return columns, dtypes

    @staticmethod
    def convert_null(values: Iterable) -> list:
        """Substitutes python `None` with NULL values.

        Parameters
        ----------
        values: iterable

        """

        return [x if x is not None else NULL
                for x in values]

class ConverterToPySpark:
    """Collection of pyspark conversion methods as a composite of
    PlainColumn. It handles spark specifics like NULL as None and proper
    type matching.

    """

    def __init__(self, parent: 'PlainColumn'):
        self.parent = parent

    @property
    def sanitized(self) -> list:
        """Replaces NULL values with None to conform to pyspark's missing
        value convention.

        """

        return [None if x is NULL else x
                for x in self.parent.values]

    def __call__(self) -> Tuple['pyspark.sql.types.StructField', list]:
        """Main entry point for composite which returns the appropriate
        `StructField` with corresponding values.

        """

        from pyspark.sql import types

        mapping = {"bool": types.BooleanType(),
                   "int": types.IntegerType(),
                   "float": types.DoubleType(),
                   "str": types.StringType(),
                   "datetime": types.TimestampType()}

        pyspark_type = mapping[self.parent.dtype]
        field = types.StructField(self.parent.name, pyspark_type)

        return field, self.sanitized

class ConverterFromPandas:
    """Convert pandas dataframe into PlainFrame.

    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __call__(self, cls: PlainFrame,
                 dtypes: Optional[TYPE_DTYPE_INPUT] = None) \
            -> 'PlainFrame':
        """Converts pandas dataframe to PlainFrame. Dtypes will be inferred
        from the pandas dataframe. However, dtypes may be provided explicitly
        to overwrite inferred dtypes because pandas missing values (np.NaN)
        always force a cast to float (e.g. bool or int columns with missings
        will be cast to float).

        Parameters
        ----------
        cls: type
            Class used for instantiation.
        dtypes: list, dict, optional
            If a list is provided, each value represents a dtype and maps to
            one column of the dataframe in given order. If a dict is
            provided, keys refer to column names and values represent dtypes.

        Returns
        -------
        datatable: PlainFrame
            Converted dataframe.

        """

        dtypes_validated = self.get_forced_dtypes(dtypes)
        dtypes_validated.update(self.get_object_dtypes(dtypes_validated))
        dtypes_validated.update(self.get_inferred_dtypes(dtypes_validated))

        columns = self.df.columns.tolist()
        dtypes = [dtypes_validated[column] for column in columns]
        data = [self.convert_series(column, dtypes_validated[column])
                for column in columns]

        data = list(zip(*data))

        return cls.from_plain(data=data,
                              columns=self.df.columns.tolist(),
                              dtypes=dtypes)

    def get_forced_dtypes(self, dtypes: TYPE_DTYPE_INPUT) -> TYPE_DSTR:
        """Validate user provided `dtypes` parameter.

        Parameters
        ----------
        dtypes: list, dict
            If a list is provided, each value represents a dtype and maps to
            one column of the dataframe in order. If a dict is provided, keys
            refer to column names and values represent dtypes.

        Returns
        -------
        dtypes_forced: dict
            Keys refer to column names and values represent dtypes.

        """

        if isinstance(dtypes, list):
            if len(dtypes) != self.df.shape[1]:
                raise ValueError("Length mismatch: Length of `dtypes` ({}) "
                                 "has to equal the number of columns ({})."
                                 .format(len(dtypes), self.df.shape[1]))

            dtypes_forced = dict(zip(self.df.columns, dtypes))

        elif isinstance(dtypes, dict):
            dtypes_forced = dtypes

        elif dtypes is not None:
            raise ValueError("Parameter `dtypes` has to be of type `list` or "
                             "`dict`. However, type `{}` is given."
                             .format(type(dtypes)))

        else:
            dtypes_forced = {}

        if dtypes_forced:
            for column, dtype in dtypes_forced.items():
                if column not in self.df.columns:
                    raise ValueError("Column `{}` does not exist. Available "
                                     "columns are: `{}`"
                                     .format(column, self.df.columns))

                if dtype not in PRIMITIVE_TYPES:
                    raise ValueError("Dtype `{}` is invalid. Valid dtypes "
                                     "are: {}."
                                     .format(dtype, PRIMITIVE_TYPES.keys()))

        return dtypes_forced

    def get_object_dtypes(self, dtypes_validated: TYPE_DSTR) -> TYPE_DSTR:
        """Inspect all columns of dtype object and ensure no mixed dtypes are
        present. Raises TypeError otherwise. Ignores columns for which dtypes
        are already explicitly set.

        Parameters
        ----------
        dtypes_validated: dict
            Represents already given column/dtype pairs. Keys refer to column
            names and values represent dtypes.

        Returns
        -------
        dtypes_object: dict
            Keys refer to column names and values represent dtypes.

        """

        dtypes_object = {}

        for column in self.df.columns:
            if column in dtypes_validated:
                continue

            if pd_types.is_object_dtype(self.df[column]):
                dtypes_object[column] = self.inspect_dtype_object(column)

        return dtypes_object

    def get_inferred_dtypes(self, dtypes_validated: TYPE_DSTR) -> TYPE_DSTR:
        """Get all dtypes for columns which have not been provided yet.
        Assumes that columns of dtype object are not present. Raises
        TypeError otherwise.

        Parameters
        ----------
        dtypes_validated: dict
            Represents already given column/dtype pairs. Keys refer to column
            names and values represent dtypes.

        Returns
        -------
        dtypes_inferred: dict
            Keys refer to column names and values represent dtypes.

        """

        dtypes_inferred = {}

        for column in self.df.columns:
            if column in dtypes_validated:
                continue

            dtypes_inferred[column] = self.inspect_dtype(self.df[column])

        return dtypes_inferred

    def convert_series(self, column: str, dtype: str) -> TYPE_ROW:
        """Converts a column of a pandas dataframe into PlainFrame readable
        format with specified dtype (np.NaN to NULL, timestamps to
        datetime.datetime).

        Parameters
        ----------
        column: str
            Identifier for column.
        dtype: str
            Dtype identifier.

        Returns
        -------
        values: list
            Converted pandas series as plain python objects.

        """

        series = self.df[column]

        if dtype != "float":
            series = series.fillna(NULL)

        values = self.force_dtype(series, dtype)

        return values

    def inspect_dtype_object(self, column: str) -> str:
        """Inspect series of dtype object and ensure no mixed dtypes are
        present. Try to infer the actual dtype after removing np.NaN,
        distinguishing dtypes bool and str.

        Parameters
        ----------
        column: str
            Identifier for column.

        Returns
        -------
        dtype: str
            Inferred dtype as string.

        """

        series = self.df[column].dropna()

        # check for bool
        try:
            conv = pd.to_numeric(series)
            return self.inspect_dtype(conv)
        except ValueError:
            pass

        # check for mixed dtypes
        dtypes = {type(x) for x in series}
        if len(dtypes) > 1:
            raise TypeError("Column `{}` has mixed dtypes: {}. Currently, "
                            "this is not supported."
                            .format(column, dtypes))

        # check for string
        if isinstance(series[0], str):
            return "str"

        # raise if unsupported dtype is encountered
        raise TypeError("Column `{}` has dtype `{}` which is currently "
                        "not supported."
                        .format(column, type(series[0])))

    @staticmethod
    def inspect_dtype(series: pd.Series) -> str:
        """Get appropriate dtype of a pandas series. Checks against bool,
        int, float and datetime. If dtype object is encountered, raises
        TypeError.

        Parameters
        ----------
        series: pd.Series
            pandas series column identifier.

        Returns
        -------
        dtype: str
            Inferred dtype as string.

        """

        mapping = {pd_types.is_bool_dtype: "bool",
                   pd_types.is_integer_dtype: "int",
                   pd_types.is_float_dtype: "float",
                   pd_types.is_datetime64_any_dtype: "datetime"}

        for check, result in mapping.items():
            if check(series):
                return result

        raise TypeError("Type is not understood for column '{}'. Allowed "
                        "types are bool, int, float, str and datetime."
                        .format(series.name))

    @staticmethod
    def force_dtype(series: pd.Series, dtype: str) -> TYPE_ROW:
        """Attempts to convert values to provided type.

        Parameters
        ----------
        series: pd.Series
            Values in pandas representation.
        dtype: str
            Dtype identifier.

        Returns
        -------
        values: list
            Converted pandas series as plain python objects.

        """

        conv_funcs = {"bool": bool,
                      "int": int,
                      "float": float,
                      "str": str,
                      "datetime": lambda x: pd.to_datetime(x).to_pydatetime()}

        conv_func = conv_funcs[dtype]

        return [conv_func(x) if not isinstance(x, NullValue) else NULL
                for x in series]

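# A minimal sketch of the dtype override handled by `ConverterFromPandas`:
# pandas represents missing ints as NaN in a float column, so the original
# dtype can be restored explicitly via `dtypes`.
#
#   >>> df = pd.DataFrame({"x": [1.0, np.NaN]})
#   >>> PlainFrame.from_pandas(df).get_column("x").dtype
#   'float'
#   >>> PlainFrame.from_pandas(df, dtypes={"x": "int"}).get_column("x").values
#   (1, NULL)
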
class ConverterToPandas:
    """Collection of pandas conversion methods as a composite of PlainColumn.
    It handles pandas specifics like the missing distinction between NULL
    and NaN.

    """

    def __init__(self, parent: 'PlainColumn'):
        self.parent = parent
        self.requires_nan = parent.has_nan or parent.has_null

    @property
    def sanitized(self) -> list:
        """Replaces any NULL values with np.NaN to conform to pandas'
        missing value convention.

        """

        return [np.NaN if x is NULL else x
                for x in self.parent.values]

    def __call__(self) -> pd.Series:
        """Main entry point of composite which calls the appropriate
        converter method corresponding to the parent's dtype.

        """

        converter = {"datetime": self._convert_datetime,
                     "int": self._convert_int,
                     "bool": self._convert_bool}

        func = converter.get(self.parent.dtype, self._convert)

        return func()

    def _convert(self, dtype=None) -> pd.Series:
        """Generic converter for non-special dtypes.

        """

        dtype = dtype or self.parent.dtype
        return pd.Series(self.sanitized, dtype=dtype, name=self.parent.name)

    def _convert_bool(self) -> pd.Series:
        """Handle dtype float upcast if missings are present.

        """

        if self.requires_nan:
            dtype = "float"
        else:
            dtype = "bool"

        return self._convert(dtype=dtype)

    def _convert_int(self) -> pd.Series:
        """Since pandas 0.24.0 there is `arrays.IntegerArray` which could be
        used as a nullable integer dtype. However, this is still experimental
        (0.25.3) and hence is not used yet.

        """

        if self.requires_nan:
            dtype = "float"
        else:
            dtype = "int"

        return self._convert(dtype=dtype)

    def _convert_datetime(self) -> pd.Series:
        """Pandas timestamp values have to be created via `pd.to_datetime`
        and can't be cast via `astype`.

        """

        series = pd.to_datetime(self.sanitized)
        series.name = self.parent.name

        return series

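# A minimal sketch of `PlainColumn.to_pandas`: an int column containing NULL
# is upcast to float64 because plain pandas int dtypes cannot hold missings.
#
#   >>> col = PlainColumn.from_plain(name="x", dtype="int", values=[1, NULL])
#   >>> col.to_pandas()
#   0    1.0
#   1    NaN
#   Name: x, dtype: float64
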
class EqualityAsserter:
    """Collection of equality assertions as a composite of PlainFrame. It
    contains equality tests in regard to number of rows, columns, dtypes etc.

    """

    def __init__(self, parent: 'PlainFrame'):
        self.parent = parent

    def __call__(self,
                 other: 'PlainFrame',
                 assert_column_order: bool = False,
                 assert_row_order: bool = False):
        """Main entry point for equality assertion. By default, neither
        strict column order nor strict row order is assumed, but both may be
        enabled.

        Parameters
        ----------
        other: PlainFrame
            Instance to assert equality against.
        assert_column_order: bool, optional
            If enabled, column order will be tested. Otherwise, column order
            does not matter for equality assertion.
        assert_row_order: bool, optional
            If enabled, row order will be tested. Otherwise, row order does
            not matter for equality assertion.

        """

        self._assert_shape(other)
        self._assert_column_names(other, assert_column_order)
        self._assert_dtypes(other)

        if not assert_row_order:
            order_left = self._get_row_order(self.parent)
            order_right = self._get_row_order(other)

        for column in self.parent.columns:
            left = self.parent.get_column(column).values
            right = other.get_column(column).values

            if not assert_row_order:
                left = [left[idx] for idx in order_left]
                right = [right[idx] for idx in order_right]

            msg = "\nDifference for column: {} \n\n".format(column)
            msg += tabulate.tabulate(zip(*[left, right]),
                                     headers=["ACTUAL", "DESIRED"])
            msg += "\n"

            assert_equal(left, right, err_msg=msg)

    def _assert_shape(self, other: 'PlainFrame'):
        """Check for identical shape.

        """

        if self.parent.n_rows != other.n_rows:
            raise AssertionError("Unequal number of rows: "
                                 "left {} vs. right {}"
                                 .format(self.parent.n_rows, other.n_rows))

        if self.parent.n_cols != other.n_cols:
            raise AssertionError("Unequal number of columns: "
                                 "left {} vs. right {}"
                                 .format(self.parent.n_cols, other.n_cols))

    def _assert_column_names(self,
                             other: 'PlainFrame',
                             assert_column_order: bool):
        """Check for matching column names. Take column order into account
        if required.

        """

        if assert_column_order:
            enum = enumerate(zip(self.parent.columns, other.columns))
            for idx, (left, right) in enum:
                if left != right:
                    raise AssertionError(
                        "Mismatching column names at index {}: "
                        "left '{}' vs. right '{}'"
                        .format(idx + 1, left, right)
                    )
        else:
            left = set(self.parent.columns)
            right = set(other.columns)

            if left != right:
                left_exclusive = left.difference(right)
                right_exclusive = right.difference(left)
                msg = "Mismatching column names: "

                if left_exclusive:
                    msg += ("Right does not have columns: {}. "
                            .format(left_exclusive))

                if right_exclusive:
                    msg += ("Left does not have columns: {}. "
                            .format(right_exclusive))

                raise AssertionError(msg)

    def _assert_dtypes(self, other: 'PlainFrame'):
        """Check for matching dtypes.

        """

        left_dtypes = {column.name: column.dtype
                       for column in self.parent.plaincolumns}

        right_dtypes = {column.name: column.dtype
                        for column in other.plaincolumns}

        if left_dtypes != right_dtypes:
            msg = "Mismatching types: "
            for column, left_dtype in left_dtypes.items():
                right_dtype = right_dtypes[column]
                if left_dtype != right_dtype:
                    msg += ("{} (left '{}' vs. right '{}')"
                            .format(column, left_dtype, right_dtype))

            raise AssertionError(msg)

    @staticmethod
    def _get_row_order(table: 'PlainFrame') -> List[int]:
        """Helper function to get index order of sorted data.

        """

        indices = range(table.n_rows)
        return sorted(indices, key=lambda k: table.data[k])

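# A minimal sketch of `PlainFrame.assert_equal`: shapes, column names and
# dtypes are always compared, while row and column order are only enforced on
# request.
#
#   >>> left = PlainFrame.from_plain(data=[[1, "a"], [2, "b"]],
#   ...                              columns=["x:int", "y:str"])
#   >>> right = PlainFrame.from_plain(data=[[2, "b"], [1, "a"]],
#   ...                               columns=["x:int", "y:str"])
#   >>> left.assert_equal(right)  # passes, row order ignored by default
#   >>> left.assert_equal(right, assert_row_order=True)  # raises AssertionError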