Source code for pywrangler.util.testing.mutants

"""This module contains the data mutants and mutation classes.

"""
import itertools
from datetime import datetime
from collections import Counter, defaultdict
from typing import Any, List, NamedTuple, Sequence, Callable, \
    Optional, Union, Dict
import random
from string import ascii_letters

from pywrangler.util.helper import get_param_names
from pywrangler.util.testing.plainframe import PlainFrame

# Everything accepted as a "raw" mutant definition: nothing at all, a dict in
# the `{(column, row): value}` notation, a single mutant instance or a list of
# mutant instances.
TYPE_RAW_MUTANTS = Union[dict, 'BaseMutant', List['BaseMutant'], None]

# Immutable record describing one cell modification: the column label, the
# row index and the replacement value.
ImmutableMutation = NamedTuple(
    "ImmutableMutation",
    [("column", str), ("row", int), ("value", Any)]
)


class Mutation(ImmutableMutation):
    """Describes the modification of exactly one cell of a dataframe.

    A mutation consists of three values: the column, the row and the new
    value. The column is referenced by its label (string). The row is
    referenced by its position (integer, starting with 0) because plainframe
    does not have labeled indices. The new value may be of any type.

    """

    @property
    def key(self):
        """Cell identifier of the mutation as a `(column, row)` tuple."""

        # column and row are the first two fields of the underlying tuple
        return self[:2]


class BaseMutant:
    """Base class for all mutants. A mutant produces one or more mutations.

    Subclasses implement `generate_mutations`. The shared `mutate` method
    validates the generated mutations and applies them to a PlainFrame.

    """

    def generate_mutations(self, df: PlainFrame) -> List[Mutation]:
        """Returns all mutations produced by a mutant given a PlainFrame. Needs
        to be implemented by every Mutant. This is essentially the core of
        every mutant.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.

        Returns
        -------
        mutations: list
            List of Mutation instances.

        Raises
        ------
        NotImplementedError
            Always, unless overridden by a subclass.

        """

        raise NotImplementedError

    def mutate(self, df: PlainFrame) -> PlainFrame:
        """Modifies given PlainFrame with inherent mutations and returns the
        result of `df.modify`.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to be modified.

        Returns
        -------
        modified: PlainFrame

        Raises
        ------
        ValueError
            If several mutations target the same cell or if a mutation
            refers to a column or row which does not exist in `df`.

        """

        # Materialize once: the mutations are iterated several times below
        # and a one-shot iterable (e.g. a generator) would be exhausted by
        # the first validation, silently dropping all modifications.
        mutations = list(self.generate_mutations(df))
        self._check_duplicated_mutations(mutations)
        self._check_valid_mutations(mutations, df)

        # group new values as {column: {row: value}}
        modifications = defaultdict(dict)
        for mutation in mutations:
            modifications[mutation.column][mutation.row] = mutation.value

        return df.modify(modifications)

    @classmethod
    def from_dict(cls, raw: dict) -> Union['ValueMutant', 'MutantCollection']:
        """Factory method to conveniently convert a raw value into a Mutant
        instance. This is used for easy Mutant creation in dict format to
        avoid boilerplate code. Essentially, the dict format understands
        value mutations only. The key consists of a tuple of column and row and
        the value represents the actual new value, as follows:

        >>> {("col1", 1): 0}

        is identical to

        >>> ValueMutant(column="col1", row=1, value=0)

        Moreover, multiple mutations may be provided:

        >>> {("col1", 1): 0, ("col1", 2): 1}

        will result into

        >>> MutantCollection([ValueMutant(column="col1", row=1, value=0),
        >>>                   ValueMutant(column="col1", row=2, value=1)])

        Parameters
        ----------
        raw: dict
            Raw value mutant definitions.

        Returns
        -------
        mutant: ValueMutant, MutantCollection
            A single ValueMutant if `raw` has exactly one entry, otherwise a
            MutantCollection wrapping one ValueMutant per entry.

        Raises
        ------
        ValueError
            If `raw` is not a dict.

        """

        if not isinstance(raw, dict):
            raise ValueError("Parameter `raw` needs to be of type dict. "
                             "However, {} was encountered."
                             .format(type(raw)))

        value_mutants = [ValueMutant(column=column, row=row, value=value)
                         for (column, row), value in raw.items()]

        if len(value_mutants) == 1:
            return value_mutants[0]

        return MutantCollection(mutants=value_mutants)

    @classmethod
    def from_multiple_any(cls, raw: TYPE_RAW_MUTANTS) -> List['BaseMutant']:
        """Factory method to conveniently convert raw values into a list of
        Mutant objects.

        Mutants can be defined in various formats. You can provide a single
        mutant like:
        >>> return ValueMutant(column="col1", row=0, value=3)

        This is identical to the dictionary notation:
        >>> return {("col1", 0): 3}

        If you want to provide multiple mutations within one mutant at once,
        you can use the `MutantCollection` or simply rely on the dictionary
        notation:
        >>> return {("col1", 2): 5, ("col2", 1): "asd"}

        If you want to provide multiple mutants at once, you may provide
        multiple dictionaries within a list:
        >>>  [{("col1", 2): 5}, {("col1", 2): 3}]

        Overall, all subclasses of `BaseMutant` are allowed to be used. You may
        also mix a specialized mutant with the dictionary notation:
        >>> [RandomMutant(), {("col1", 0): 1}]

        Parameters
        ----------
        raw: TYPE_RAW_MUTANTS
            Raw mutant definition. Any falsy value (e.g. None, an empty dict
            or an empty list) results in an empty list.

        Returns
        -------
        mutants: list
            List of converted mutant instances. Nested lists are flattened.

        Raises
        ------
        ValueError
            If `raw` is neither falsy, a dict, a BaseMutant nor a list.

        """

        if not raw:
            return []

        if isinstance(raw, dict):
            return [cls.from_dict(raw)]

        if isinstance(raw, BaseMutant):
            return [raw]

        if isinstance(raw, list):
            # every item is converted recursively and yields a list itself
            mutants = [cls.from_multiple_any(x) for x in raw]
            return list(itertools.chain.from_iterable(mutants))

        raise ValueError(
            "DataTestCase: Invalid mutant definition provided. "
            "It has to be a dict, list or a subclasses of "
            "BaseMutant. However, {} was provided."
            .format(type(raw)))

    def get_params(self) -> Dict[str, Any]:
        """Retrieve all parameters set within the __init__ method.

        The parameter names are taken from the signature of `__init__`
        (excluding `self`) and looked up as attributes of the same name on
        the instance.

        Returns
        -------
        param_dict: dictionary
            Parameter names as keys and corresponding values as values

        """

        param_names = get_param_names(self.__class__.__init__, ["self"])
        param_dict = {x: getattr(self, x) for x in param_names}

        return param_dict

    def __repr__(self):
        """Provide simple string representation for readability.

        """

        param_dict = self.get_params()
        repr_dict = ", ".join(["{}={}".format(key, value)
                               for key, value in param_dict.items()])

        return "{}({})".format(self.__class__.__name__, repr_dict)

    def __eq__(self, other: 'BaseMutant') -> bool:
        """Enable comparison for testing purposes on init attributes.

        Objects which are not mutants are never equal. `NotImplemented` is
        returned for them so that Python falls back to its default
        comparison instead of failing on a missing `get_params`.

        """

        if not isinstance(other, BaseMutant):
            return NotImplemented

        return self.get_params() == other.get_params()

    @staticmethod
    def _check_duplicated_mutations(mutations: Sequence[Mutation]):
        """Validate unique mutations to prevent overwriting data modifications.

        Parameters
        ----------
        mutations: sequence
            Mutations to be checked. Two mutations are duplicates if their
            `key` (column and row) is identical.

        Raises
        ------
        ValueError
            If at least one column/row combination occurs more than once.

        """

        keys = [mutation.key for mutation in mutations]
        counter = Counter(keys)

        duplicated = [key for key, count in counter.items() if count > 1]
        if duplicated:
            raise ValueError("Duplicated mutations found: following "
                             "mutations have identical column/row "
                             "specifications which causes unpredictable "
                             "modifications: {}"
                             .format(duplicated))

    @staticmethod
    def _check_valid_mutations(mutations: Sequence[Mutation], df: PlainFrame):
        """Validate that mutations are applicable to plainframe.

        Parameters
        ----------
        mutations: sequence
            Mutations to be checked.
        df: PlainFrame
            PlainFrame the mutations are supposed to be applied to.

        Raises
        ------
        ValueError
            If a mutation refers to a column which is not part of
            `df.columns` or to a row outside of `0 <= row < df.n_rows`.

        """

        def has_column(column: str) -> bool:
            return column in df.columns

        def has_row(row: int) -> bool:
            # Row indices start with 0. Negative indices are rejected because
            # they would address a cell under a second key and thereby
            # bypass the duplicate check.
            return 0 <= row < df.n_rows

        for mutation in mutations:
            if not has_column(mutation.column):
                raise ValueError("Mutation ({}) is not applicable to given "
                                 "PlainFrame. Column '{}' does not exist."
                                 .format(mutation, mutation.column))

            if not has_row(mutation.row):
                raise ValueError("Mutation ({}) is not applicable to given "
                                 "PlainFrame. Row '{}' does not exist."
                                 .format(mutation, mutation.row))


class ValueMutant(BaseMutant):
    """Mutant which replaces the value of exactly one cell.

    Attributes
    ----------
    column: str
        Name of the column.
    row: int
        Index of the row.
    value: Any
        The new value to be used.

    """

    def __init__(self, column: str, row: int, value: Any):
        self.column, self.row, self.value = column, row, value

    def generate_mutations(self, df: PlainFrame) -> List[Mutation]:
        """Wraps the stored column, row and value into a single mutation.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from. It is not inspected here.

        Returns
        -------
        mutations: list
            List containing exactly one Mutation instance.

        """

        return [Mutation(self.column, self.row, self.value)]


class FunctionMutant(BaseMutant):
    """Mutant whose mutations are produced by a user supplied function.

    Attributes
    ----------
    func: callable
        A function to be used as a mutation generation method. It is called
        with the PlainFrame as its only argument.

    """

    def __init__(self, func: Callable):
        self.func = func

    def generate_mutations(self, df: PlainFrame) -> List[Mutation]:
        """Hands the PlainFrame over to the wrapped function and passes its
        result through unchanged. This allows arbitrary mutation generation.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.

        Returns
        -------
        mutations: list
            Whatever the wrapped function returns, expected to be a list of
            Mutation instances.

        """

        generate = self.func
        mutations = generate(df)

        return mutations


class RandomMutant(BaseMutant):
    """Creates random mutations with naive values for supported dtypes of
    PlainFrame. Randomness is controlled via an explicit seed to allow
    reproducibility. Mutation generation may be narrowed to given rows or
    columns. The number of distinct mutations may also be specified.

    Attributes
    ----------
    count: int, optional
        The number of mutations to be executed.
    columns: sequence, optional
        Restrict mutations to provided columns, if given.
    rows: sequence, optional
        Restrict mutations to provided rows, if given.
    seed: int, optional
        Set the seed for the random generator.

    """

    def __init__(self, count: int = 1, columns: Sequence[str] = None,
                 rows: Sequence[int] = None, seed: int = 1):
        self.count = count
        self.columns = columns
        self.rows = rows
        self.seed = seed

    def generate_mutations(self, df: PlainFrame) -> List[Mutation]:
        """Generates population of all possible mutations and draws a sample of
        it.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.

        Returns
        -------
        mutations: list
            List of Mutation instances. Its length is `count`, limited to the
            number of available column/row combinations.

        Raises
        ------
        ValueError
            If configured columns or rows are not present in `df`.

        """

        # Use a private generator instead of reseeding the module level one.
        # The drawn values are the same as with `random.seed(self.seed)` but
        # the process-wide random state of the caller is left untouched.
        rng = random.Random(self.seed)

        # validate columns and rows
        columns = self._get_validated_columns(df)
        rows = self._get_validated_rows(df)

        # generate candidates, limit count to population size and draw sample
        candidates = list(itertools.product(columns, rows))
        count = min(self.count, len(candidates))
        sample = rng.sample(candidates, count)

        return [self.generate_mutation(df, column, row, rng=rng)
                for column, row in sample]

    def generate_mutation(self, df: PlainFrame, column: str, row: int,
                          rng: Optional[random.Random] = None) -> Mutation:
        """Generates single mutation from given PlainFrame for a given
        candidate. A candidate is specified via column name and row index.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.
        column: str
            Identifies relevant column of mutation.
        row: int
            Identifies relevant row of mutation.
        rng: random.Random, optional
            Random generator to draw the new value from. If omitted, the
            module level functions of `random` are used.

        Returns
        -------
        mutation: Mutation

        """

        plaincolumn = df.get_column(column)
        value = plaincolumn.values[row]
        new_value = self._random_value(plaincolumn.dtype, value, rng=rng)

        return Mutation(column=column, row=row, value=new_value)

    @staticmethod
    def _random_value(dtype: str, original_value: Any,
                      rng: Optional[random.Random] = None) -> Any:
        """Helper function to generate a random value given original value
        and dtype.

        Parameters
        ----------
        dtype: str
            Defines the dtype of the new value. One of "bool", "int",
            "float", "str" or "datetime".
        original_value: Any
            Represents original value
        rng: random.Random, optional
            Random generator to draw from. If omitted, the module level
            functions of `random` are used.

        Returns
        -------
        new_value: Any
            Generated new random value which differs from the original value.

        Raises
        ------
        KeyError
            If `dtype` is not one of the supported dtypes.

        """

        # the `random` module offers the same functions as a Random instance
        source = random if rng is None else rng

        def _bool():
            return source.choice([True, False])

        def _int():
            return source.randint(-10, 10)

        def _float():
            return source.random()

        def _str():
            return source.choice(list(ascii_letters))

        def _datetime():
            year = source.randint(datetime.min.year, datetime.max.year)
            return datetime(year=year, month=1, day=1)

        func = {"bool": _bool,
                "int": _int,
                "float": _float,
                "str": _str,
                "datetime": _datetime}[dtype]

        # redraw until the value actually differs from the original one
        candidate = func()
        while candidate == original_value:
            candidate = func()

        return candidate

    def _get_validated_rows(self, df: PlainFrame) -> List[int]:
        """Provide validated rows. Provided rows which are not present in given
        PlainFrame will raise a ValueError.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.

        Returns
        -------
        rows: list
            List of validated rows (integers). All rows of `df` if no rows
            were configured.

        Raises
        ------
        ValueError
            If a configured row is outside of `range(df.n_rows)`.

        """

        valid_rows = range(df.n_rows)
        rows = self.rows or list(valid_rows)

        invalid_rows = set(rows).difference(valid_rows)
        if invalid_rows:
            raise ValueError("RandomMutant: Invalid rows provided: {}. "
                             "Valid rows are: {}"
                             .format(invalid_rows, valid_rows))

        return rows

    def _get_validated_columns(self, df: PlainFrame) -> List[str]:
        """Provide validated columns. Provided columns which are not present in
        given PlainFrame will raise a ValueError.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.

        Returns
        -------
        columns: list
            List of validated columns (strings). All columns of `df` if no
            columns were configured.

        Raises
        ------
        ValueError
            If a configured column is not part of `df.columns`.

        """

        columns = self.columns or df.columns
        invalid_columns = set(columns).difference(df.columns)
        if invalid_columns:
            raise ValueError("RandomMutant: Invalid columns provided: {}. "
                             "Valid columns are: {}"
                             .format(invalid_columns, df.columns))

        return columns


class MutantCollection(BaseMutant):
    """Bundles several Mutant instances so that they act like a single one.

    Attributes
    ----------
    mutants: sequence
        List of mutants.

    """

    def __init__(self, mutants: Sequence):
        self.mutants = mutants

    def generate_mutations(self, df: PlainFrame) -> List[Mutation]:
        """Asks every contained mutant for its mutations and concatenates the
        results in the order of the mutants.

        Parameters
        ----------
        df: PlainFrame
            PlainFrame to generate mutations from.

        Returns
        -------
        mutations: list
            List of Mutation instances.

        """

        # first collect the result of each mutant, then flatten them
        per_mutant = []
        for mutant in self.mutants:
            per_mutant.append(mutant.generate_mutations(df))

        collected = []
        for mutations in per_mutant:
            collected.extend(mutations)

        return collected