# Source code for pywrangler.benchmark

"""This module contains benchmarking utilities.

"""

import gc
import numbers
import sys
import timeit
from typing import Callable, Iterable, List, Union

import numpy as np

from pywrangler.exceptions import NotProfiledError
from pywrangler.util._pprint import (
    enumeration,
    header,
    pretty_file_size,
    pretty_time_duration
)
from pywrangler.util.helper import get_param_names


def allocate_memory(size: float) -> Union[None, np.ndarray]:
    """Helper function to approximately allocate memory by creating numpy array
    with given size in MiB.

    Numpy is used deliberately to define the used memory via dtype.

    Parameters
    ----------
    size: float
        Size in MiB to be occupied.

    Returns
    -------
    memory_holder: np.ndarray or None
        Array of dtype `int8` whose total object size (as reported by
        `sys.getsizeof`) approximates `size` MiB. `None` is returned if
        `size` is not positive. If `size` is smaller than the overhead of an
        empty array, an empty array is returned.

    """

    if size <= 0:
        return None

    # fixed overhead of the array object itself, excluding any data
    empty_size = sys.getsizeof(np.ones(0, dtype=np.int8))

    # one int8 element occupies exactly one byte, hence the number of
    # elements equals the number of data bytes; clamp at zero because
    # requests smaller than the object overhead would otherwise result in a
    # negative shape
    requested_bytes = int(np.ceil(size * (2 ** 20)))
    size_in_bytes = max(requested_bytes - empty_size, 0)

    memory_holder = np.ones(size_in_bytes, dtype=np.int8)

    return memory_holder


class BaseProfiler:
    """Base class defining the interface for all profilers.

    Subclasses have to implement `profile` (the actual profiling method) and
    `less_is_better` (defining the ranking of profiling measurements).

    The private attribute `_measurements` is assumed to be set by `profile`.

    Attributes
    ----------
    measurements: list
        The actual profiling measurements.
    best: float
        The best measurement.
    median: float
        The median of measurements.
    worst: float
        The worst measurement.
    std: float
        The standard deviation of measurements.
    runs: int
        The number of measurements.

    Methods
    -------
    profile
        Contains the actual profiling implementation.
    report
        Print simple report consisting of best, median, worst, standard
        deviation and the number of measurements.
    profile_report
        Calls profile and report in sequence.

    """

    @property
    def measurements(self) -> List[float]:
        """Return measurements of profiling.

        Raises
        ------
        NotProfiledError
            If `profile` has not been called yet.

        """

        self._check_is_profiled(["_measurements"])

        return self._measurements

    @property
    def best(self) -> float:
        """Returns the best measurement, which is the minimum if
        `less_is_better` and the maximum otherwise.

        """

        if self.less_is_better:
            return np.min(self.measurements)
        else:
            return np.max(self.measurements)

    @property
    def median(self) -> float:
        """Returns the median of measurements.

        """

        return np.median(self.measurements)

    @property
    def worst(self) -> float:
        """Returns the worst measurement, which is the maximum if
        `less_is_better` and the minimum otherwise.

        """

        if self.less_is_better:
            return np.max(self.measurements)
        else:
            return np.min(self.measurements)

    @property
    def std(self) -> float:
        """Returns the standard deviation of measurements.

        """

        return np.std(self.measurements)

    @property
    def runs(self) -> int:
        """Return number of measurements.

        """

        return len(self.measurements)

    @property
    def less_is_better(self) -> bool:
        """Defines ranking of measurements. Has to be implemented by
        subclasses.

        """

        raise NotImplementedError

    def profile(self, *args, **kwargs):
        """Contains the actual profiling implementation and has to set
        `self._measurements`. Always returns self.

        """

        raise NotImplementedError

    def report(self):
        """Print simple report consisting of best, median, worst, standard
        deviation and the number of measurements.

        """

        tpl = "{best} {sign} {median} {sign} {worst} ± {std} ({runs} runs)"

        fmt = self._pretty_formatter
        values = {"best": fmt(self.best),
                  "median": fmt(self.median),
                  "worst": fmt(self.worst),
                  "std": fmt(self.std),
                  "runs": self.runs,
                  "sign": "<" if self.less_is_better else ">"}

        print(tpl.format(**values))

    def profile_report(self, *args, **kwargs):
        """Calls profile and report in sequence. All arguments are passed to
        `profile`.

        """

        self.profile(*args, **kwargs).report()

    @staticmethod
    def _pretty_formatter(value: float) -> str:
        """String formatter for human readable output of given input `value`.
        Should be replaced with sensible formatters for file size or time
        duration.

        Parameters
        ----------
        value: float
            Numeric value to be formatted.

        Returns
        -------
        pretty_string: str
            Human readable representation of `value`.

        """

        return str(value)

    def _check_is_profiled(self, attributes: Iterable[str]) -> None:
        """Check if `profile` was already called by ensuring passed attributes
        are not `None`. Missing attributes are treated like `None`.

        Parameters
        ----------
        attributes:
            Attribute name(s) given as string or a list/tuple of strings

        Returns
        -------
        None

        Raises
        ------
        NotProfiledError

        Notes
        -----
        Inspired by sklearns `check_is_fitted`.

        """

        # a single attribute name has to be wrapped because iterating a
        # string would check each character as a separate attribute name
        if isinstance(attributes, str):
            attributes = [attributes]

        if any(getattr(self, x, None) is None for x in attributes):
            msg = ("This {}'s instance is not profiled yet. Call 'profile' "
                   "with appropriate arguments before using this method."
                   .format(self.__class__.__name__))

            raise NotProfiledError(msg)

    def __repr__(self):
        """Print representation of profiler instance consisting of the
        profiler's class name and its `__init__` parameters with their
        current values.

        """

        # get name of profiler
        profiler_name = self.__class__.__name__

        # get parameter names; values are read from equally named attributes
        param_names = get_param_names(self.__class__.__init__, ["self"])
        param_dict = {x: getattr(self, x) for x in param_names}

        return header(profiler_name) + enumeration(param_dict)


class MemoryProfiler(BaseProfiler):
    """Approximate the increase in memory usage when calling a given function.
    Memory increase is defined as the difference between the maximum memory
    usage during function execution and the baseline memory usage before
    function execution.

    In addition, compute the median change in baseline memory usage between
    repetitions which might indicate memory leakage.

    Parameters
    ----------
    func: callable
        Callable object to be memory profiled.
    repetitions: int, optional
        Number of repetitions.
    interval: float, optional
        Defines interval duration between consecutive memory usage
        measurements in seconds.

    Attributes
    ----------
    measurements: list
        The actual profiling measurements in bytes.
    best: float
        The best measurement in bytes.
    median: float
        The median of measurements in bytes.
    worst: float
        The worst measurement in bytes.
    std: float
        The standard deviation of measurements in bytes.
    runs: int
        The number of measurements.
    baseline_change: float
        The median change in baseline memory usage across all runs in bytes.

    Methods
    -------
    profile
        Contains the actual profiling implementation.
    report
        Print simple report consisting of best, median, worst, standard
        deviation and the number of measurements.
    profile_report
        Calls profile and report in sequence.

    Notes
    -----
    The implementation is based on `memory_profiler` and is inspired by the
    IPython `%memit` magic which additionally calls `gc.collect()` before
    executing the function to get more stable results.

    """

    def __init__(self, func: Callable, repetitions: int = 5,
                 interval: float = 0.01):
        self.func = func
        self.repetitions = repetitions
        self.interval = interval

    def profile(self, *args, **kwargs):
        """Executes the actual memory profiling.

        Parameters
        ----------
        args: iterable, optional
            Optional positional arguments passed to `func`.
        kwargs: mapping, optional
            Optional keyword arguments passed to `func`.

        Returns
        -------
        self: MemoryProfiler
            The profiler instance itself with `_baselines`, `_max_usages`
            and `_measurements` being set.

        """

        # NOTE(review): imported locally, presumably to keep
        # `memory_profiler` an optional dependency - confirm
        from memory_profiler import memory_usage

        baselines = []
        max_usages = []

        func_args = (self.func, args, kwargs)
        mem_args = dict(interval=self.interval,
                        multiprocess=True,
                        max_usage=True)

        for _ in range(self.repetitions):
            # collect garbage first to get more stable results
            gc.collect()

            # memory usage before function execution
            baseline = memory_usage(**mem_args)

            # API change in memoryprofiler 0.57: the result is either a
            # plain number or a sequence holding the number as first item
            max_usage = memory_usage(func_args, **mem_args)
            if not isinstance(max_usage, numbers.Number):
                max_usage = max_usage[0]

            baselines.append(self._mb_to_bytes(baseline))
            max_usages.append(self._mb_to_bytes(max_usage))

        self._max_usages = max_usages
        self._baselines = baselines
        self._measurements = np.subtract(max_usages, baselines).tolist()

        return self

    @property
    def less_is_better(self) -> bool:
        """Less memory consumption is better.

        """

        return True

    @property
    def max_usages(self) -> List[int]:
        """Returns the absolute, maximum memory usages for each run in
        bytes.

        """

        self._check_is_profiled(['_max_usages'])

        return self._max_usages

    @property
    def baselines(self) -> List[int]:
        """Returns the absolute, baseline memory usages for each run in
        bytes. The baseline memory usage is defined as the memory usage before
        function execution.

        """

        self._check_is_profiled(['_baselines'])

        return self._baselines

    @property
    def baseline_change(self) -> float:
        """Returns the median change in baseline memory usage across all
        runs in bytes. The baseline memory usage is defined as the memory
        usage before function execution.

        If fewer than two runs are available, there is no change to compute
        and NaN is returned.

        """

        changes = np.diff(self.baselines)

        # median of an empty array is NaN but additionally emits a
        # RuntimeWarning; return NaN directly instead
        if changes.size == 0:
            return float("nan")

        return float(np.median(changes))

    @staticmethod
    def _pretty_formatter(value: float) -> str:
        """String formatter for human readable output of given input `value`.

        Parameters
        ----------
        value: float
            Numeric value to be formatted.

        Returns
        -------
        pretty_string: str
            Human readable representation of `value`.

        """

        return pretty_file_size(value)

    @staticmethod
    def _mb_to_bytes(size_mib: float) -> int:
        """Helper method to convert MiB to Bytes.

        Parameters
        ----------
        size_mib: float
            Size in MiB

        Returns
        -------
        size_bytes: int
            Size in bytes, truncated towards zero.

        """

        return int(size_mib * (2 ** 20))


class TimeProfiler(BaseProfiler):
    """Approximate the time required to execute a function call.

    By default, the number of repetitions is estimated if not set explicitly.

    Parameters
    ----------
    func: callable
        Callable object to be time profiled.
    repetitions: None, int, optional
        Number of repetitions. If `None`, `timeit.Timer.autorange` will
        determine a sensible default.

    Attributes
    ----------
    measurements: list
        The actual profiling measurements in seconds.
    best: float
        The best measurement in seconds.
    median: float
        The median of measurements in seconds.
    worst: float
        The worst measurement in seconds.
    std: float
        The standard deviation of measurements in seconds.
    runs: int
        The number of measurements.

    Methods
    -------
    profile
        Contains the actual profiling implementation.
    report
        Print simple report consisting of best, median, worst, standard
        deviation and the number of measurements.
    profile_report
        Calls profile and report in sequence.

    Notes
    -----
    The implementation is based on standard library's `timeit` module.

    """

    def __init__(self, func: Callable, repetitions: Union[None, int] = None):
        self.func = func
        self.repetitions = repetitions

    def profile(self, *args, **kwargs):
        """Executes the actual time profiling.

        Parameters
        ----------
        args: iterable, optional
            Optional positional arguments passed to `func`.
        kwargs: mapping, optional
            Optional keyword arguments passed to `func`.

        Returns
        -------
        self: TimeProfiler
            The profiler instance itself with `_measurements` being set.

        """

        def wrapper():
            """Helper function without arguments which is passed to `repeat`
            which only calls given function with provided args and kwargs.

            """

            self.func(*args, **kwargs)

        timer = timeit.Timer(stmt=wrapper)

        if self.repetitions is None:
            # `autorange` returns the number of loops and the time taken;
            # the number of loops is reused as number of repetitions
            repeat, _ = timer.autorange()
        else:
            repeat = self.repetitions

        # each measurement covers exactly one function call
        self._measurements = timer.repeat(number=1, repeat=repeat)

        return self

    @property
    def less_is_better(self) -> bool:
        """Less time required is better.

        """

        return True

    @staticmethod
    def _pretty_formatter(value: float) -> str:
        """String formatter for human readable output of given input `value`.

        Parameters
        ----------
        value: float
            Numeric value to be formatted.

        Returns
        -------
        pretty_string: str
            Human readable representation of `value`.

        """

        return pretty_time_duration(value)