Source code for pytest_notebook.nb_regression

"""Jupyter Notebook Regression Test Class."""
import copy
import logging
import os
import sys
from typing import Any, List, TextIO, Tuple, Union

import attr
from attr.validators import instance_of
from nbdime.diff_format import DiffEntry
import nbformat
from nbformat import NotebookNode

try:
    # coverage is an optional dependency
    from coverage import Coverage as CoverageType
except ImportError:
    CoverageType = Any

from pytest_notebook.diffing import diff_notebooks, diff_to_string, filter_diff
from pytest_notebook.execution import (
    HELP_COVERAGE,
    HELP_COVERAGE_CONFIG,
    HELP_COVERAGE_SOURCE,
    execute_notebook,
)
from pytest_notebook.notebook import (
    load_notebook_with_config,
    regex_replace_nb,
    validate_regex_replace,
)
from pytest_notebook.post_processors import (
    ENTRY_POINT_NAME,
    list_processor_names,
    load_processor,
)
from pytest_notebook.utils import autodoc

logger = logging.getLogger(__name__)

HELP_EXEC_NOTEBOOK = (
    "Create a new notebook by executing all cells in the original notebook."
)
HELP_EXEC_CWD = (
    "Path to the directory which the notebook will run in "
    "(defaults to directory of notebook)."
)
HELP_EXEC_TIMEOUT = "The maximum time to wait (in seconds) for execution of each cell."
HELP_EXEC_ALLOW_ERRORS = (
    "Do not stop execution after the first unexpected exception "
    "(where cell is not tagged ``raises-exception``)."
)
HELP_DIFF_REPLACE = (
    "A list of regex replacements to apply before diffing, "
    r"e.g. ``[('/cells/*/outputs', '\d{2,4}-\d{1,2}-\d{1,2}', 'DATE-STAMP')]``."
)
HELP_DIFF_IGNORE = (
    "List of diff paths to ignore, e.g. '/cells/1/outputs' or '/cells/\\*/metadata'."
)
HELP_DIFF_USE_COLOR = "Use ANSI color code escapes for text output."
HELP_DIFF_COLOR_WORDS = "Highlight changed words using only colors."
HELP_FORCE_REGEN = (
    "Re-generate notebook files, if no unexpected execution errors, "
    "and an output path has been supplied."
)
HELP_POST_PROCS = (
    "post-processors to apply to the new notebook, "
    f"relating to entry points in the '{ENTRY_POINT_NAME}' group"
)
HELP_COVERAGE_MERGE = "A coverage.Coverage instance, to merge coverage results with."


class NBRegressionError(Exception):
    """Exception to signal a regression test fail."""


@autodoc
@attr.s(frozen=True, slots=True, repr=False)
class NBRegressionResult:
    """A class to store the result of ``NBRegressionFixture.check``."""

    nb_initial: NotebookNode = attr.ib(
        validator=instance_of(NotebookNode), metadata={"help": "Initial notebook."}
    )
    nb_final: NotebookNode = attr.ib(
        validator=instance_of(NotebookNode),
        metadata={"help": "Notebook after execution and post-processing."},
    )
    diff_full: List[DiffEntry] = attr.ib(
        metadata={"help": "Full diff of initial/final notebooks."}
    )
    diff_filtered: List[DiffEntry] = attr.ib(
        metadata={
            "help": (
                "Diff of initial/final notebooks, "
                "filtered according to the parsed configuration."
            )
        }
    )
    diff_string: str = attr.ib(
        validator=instance_of(str),
        metadata={"help": "The formatted string of diff_filtered."},
    )
    process_resources: dict = attr.ib(
        attr.Factory(dict),
        instance_of(dict),
        metadata={"help": "Resources returned from notebook processors."},
    )

    def __repr__(self):
        """Represent the class instance."""
        return (
            f"NBRegressionResult(diff_full_length={len(self.diff_full)},"
            f"diff_filtered_length={len(self.diff_filtered)})"
        )
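

# Usage note (illustrative sketch, not part of the module's API surface):
# one way a returned ``NBRegressionResult`` could be inspected. The ``fixture``
# variable and the notebook file name are hypothetical.
#
#     result = fixture.check("notebook.ipynb", raise_errors=False)
#     result.nb_final          # notebook after execution and post-processing
#     len(result.diff_full)    # number of unfiltered diff entries
#     result.diff_string       # formatted diff of the filtered entries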


@autodoc
@attr.s
class NBRegressionFixture:
    """Class to perform Jupyter Notebook Regression tests."""

    exec_notebook: bool = attr.ib(
        True, instance_of(bool), metadata={"help": HELP_EXEC_NOTEBOOK}
    )
    exec_cwd: Union[str, None] = attr.ib(
        None, instance_of((type(None), str)), metadata={"help": HELP_EXEC_CWD}
    )

    @exec_cwd.validator
    def _validate_exec_cwd(self, attribute, value):
        if value is None:
            return
        if not isinstance(value, str):
            raise TypeError("exec_cwd must be None or a string")
        if not os.path.isdir(value):
            raise OSError(f"exec_cwd='{value}' is not an existing directory")

    exec_allow_errors: bool = attr.ib(
        False, instance_of(bool), metadata={"help": HELP_EXEC_ALLOW_ERRORS}
    )
    exec_timeout: int = attr.ib(
        120, instance_of((int, float)), metadata={"help": HELP_EXEC_TIMEOUT}
    )

    @exec_timeout.validator
    def _validate_exec_timeout(self, attribute, value):
        if not isinstance(value, int):
            raise TypeError("exec_timeout must be an integer")
        if value <= 0:
            raise ValueError("exec_timeout must be larger than 0")

    coverage: bool = attr.ib(False, metadata={"help": HELP_COVERAGE})

    @coverage.validator
    def _validate_coverage(self, attribute, value):
        if not isinstance(value, bool):
            raise TypeError("coverage must be a boolean")
        if value:
            try:
                import coverage  # noqa: F401
            except ImportError:
                raise ImportError("The 'coverage' package must be installed.")

    cov_config: Union[str, None] = attr.ib(
        None, instance_of((type(None), str)), metadata={"help": HELP_COVERAGE_CONFIG}
    )
    cov_source: Union[str, Tuple[str]] = attr.ib(
        None, instance_of((type(None), tuple)), metadata={"help": HELP_COVERAGE_SOURCE}
    )
    cov_merge: Union[CoverageType, None] = attr.ib(
        None, metadata={"help": HELP_COVERAGE_MERGE}, hash=True
    )

    @cov_merge.validator
    def _validate_cov_merge(self, attribute, value):
        if value is None:
            return
        try:
            from coverage import Coverage
        except ImportError:
            raise ImportError("The 'coverage' package must be installed")
        if not isinstance(value, Coverage):
            raise TypeError("cov_merge must be an instance of coverage.Coverage")

    post_processors: tuple = attr.ib(
        ("coalesce_streams",), metadata={"help": HELP_POST_PROCS}
    )

    @post_processors.validator
    def _validate_post_processors(self, attribute, values):
        if not isinstance(values, tuple):
            raise TypeError(f"post_processors must be a tuple: {values}")
        for name in values:
            if name not in list_processor_names():
                raise TypeError(
                    f"name '{name}' not found in entry points: {list_processor_names()}"
                )

    process_resources: dict = attr.ib(
        attr.Factory(dict),
        instance_of(dict),
        metadata={"help": "Resources to pass to processor functions."},
    )
    diff_replace: tuple = attr.ib((), metadata={"help": HELP_DIFF_REPLACE})

    @diff_replace.validator
    def _validate_diff_replace(self, attribute, values):
        if not isinstance(values, tuple):
            raise TypeError(f"diff_replace must be a tuple: {values}")
        for i, args in enumerate(values):
            validate_regex_replace(args, i)

    diff_ignore: tuple = attr.ib(
        # TODO replace this default with a diff_replace one?
        ("/cells/*/outputs/*/traceback",),
        metadata={"help": HELP_DIFF_IGNORE},
    )

    @diff_ignore.validator
    def _validate_diff_ignore(self, attribute, values):
        if not isinstance(values, tuple):
            raise TypeError(f"diff_ignore must be a tuple: {values}")
        for item in values:
            if not isinstance(item, str):
                raise TypeError(f"diff_ignore item '{item}' must be a string")
            if not item.startswith("/"):
                raise ValueError(f"diff_ignore item '{item}' must start with '/'")

    diff_use_color: bool = attr.ib(
        True, instance_of(bool), metadata={"help": HELP_DIFF_USE_COLOR}
    )
    diff_color_words: bool = attr.ib(
        False, instance_of(bool), metadata={"help": HELP_DIFF_COLOR_WORDS}
    )
    force_regen: bool = attr.ib(
        False, instance_of(bool), metadata={"help": HELP_FORCE_REGEN}
    )

    def __setattr__(self, key, value):
        """Add validation when setting attributes."""
        x_attr = getattr(attr.fields(self.__class__), key)
        if x_attr.validator:
            x_attr.validator(self, x_attr, value)
        super().__setattr__(key, value)

    def check(
        self, path: Union[TextIO, str], raise_errors: bool = True
    ) -> NBRegressionResult:
        """Execute the Notebook and compare its initial vs. final contents.

        If ``force_regen`` is True, the new notebook will be written to ``path``.

        If ``raise_errors`` is True:

        :raise nbconvert.preprocessors.CellExecutionError: if error in execution
        :raise NBConfigValidationError: if the notebook metadata is invalid
        :raise NBRegressionError: if diffs present

        :rtype: NBRegressionResult
        """
        __tracebackhide__ = True
        if hasattr(path, "name"):
            abspath = os.path.abspath(path.name)
        else:
            abspath = os.path.abspath(str(path))
        logger.debug(f"Checking file: {abspath}")

        nb_initial, nb_config = load_notebook_with_config(path)

        resources = copy.deepcopy(self.process_resources)
        if not self.exec_cwd:
            self.exec_cwd = os.path.dirname(abspath)
        if self.exec_notebook:
            logger.debug("Executing notebook.")
            exec_results = execute_notebook(
                nb_initial,
                resources=resources,
                cwd=self.exec_cwd,
                timeout=self.exec_timeout,
                allow_errors=self.exec_allow_errors,
                with_coverage=self.coverage,
                cov_config_file=self.cov_config,
                cov_source=self.cov_source,
            )
            exec_error = exec_results.exec_error
            nb_final = exec_results.notebook
            resources = exec_results.resources
        else:
            exec_error = None
            nb_final = nb_initial

        # TODO merge on fail option (using pytest-cov --no-cov-on-fail)
        if self.cov_merge and exec_results.has_coverage:
            logger.info("Merging coverage.")
            self.cov_merge.get_data().update(
                exec_results.coverage_data(debug=self.cov_merge._debug),
                aliases=_get_coverage_aliases(self.cov_merge),
            )
            # we also take this opportunity to remove ''
            # from the unmatched source packages, which is caused by using `--cov=`
            self.cov_merge._inorout.source_pkgs_unmatched = [
                p for p in self.cov_merge._inorout.source_pkgs_unmatched if p
            ]

        for proc_name in self.post_processors:
            logger.debug(f"Applying post processor: {proc_name}")
            post_proc = load_processor(proc_name)
            nb_final, resources = post_proc(nb_final, resources)

        regex_replace = list(self.diff_replace) + list(nb_config.diff_replace)

        if regex_replace:
            logger.debug(f"Applying replacements: {regex_replace}")
            nb_initial_replace = regex_replace_nb(nb_initial, regex_replace)
            nb_final_replace = regex_replace_nb(nb_final, regex_replace)
        else:
            nb_initial_replace = nb_initial
            nb_final_replace = nb_final

        full_diff = diff_notebooks(nb_initial_replace, nb_final_replace)

        diff_ignore = copy.deepcopy(nb_config.diff_ignore)
        diff_ignore.update(self.diff_ignore)
        logger.debug(f"filtering diff by ignoring: {diff_ignore}")
        filtered_diff = filter_diff(full_diff, diff_ignore)

        diff_string = diff_to_string(
            nb_initial_replace,
            filtered_diff,
            use_color=self.diff_use_color,
            color_words=self.diff_color_words,
        )
        # TODO optionally write diff to file

        regen_exc = None
        if filtered_diff and self.force_regen and not exec_error:
            if hasattr(path, "close") and hasattr(path, "name"):
                path.close()
                with open(path.name, "w") as handle:
                    nbformat.write(nb_final, handle)
            else:
                nbformat.write(nb_final, str(path))
            regen_exc = NBRegressionError(
                f"Files differ and --nb-force-regen set, "
                f"regenerating file at:\n- {abspath}"
            )

        if not raise_errors:
            pass
        elif exec_error:
            print("Diff up to exception:\n" + diff_string, file=sys.stderr)
            raise exec_error
        elif regen_exc:
            print("Diff before regeneration:\n" + diff_string, file=sys.stderr)
            raise regen_exc
        elif filtered_diff:
            raise NBRegressionError(diff_string)

        return NBRegressionResult(
            nb_initial, nb_final, full_diff, filtered_diff, diff_string, resources
        )
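

# Usage note (illustrative sketch only): one way this fixture could be driven
# directly from a test function. The test name, notebook file name, and ignore
# path below are hypothetical examples, not values defined by this module.
#
#     def test_notebook_regression():
#         fixture = NBRegressionFixture(
#             exec_timeout=50,
#             diff_ignore=("/metadata/language_info/version",),
#         )
#         fixture.check("test_nb.ipynb")  # raises NBRegressionError if diffs remain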


def _get_coverage_aliases(cov):
    """Retrieve path aliases from coverage.Coverage object."""
    from coverage.files import PathAliases

    aliases = None
    if cov.config.paths:
        aliases = PathAliases()
        for paths in cov.config.paths.values():
            result = paths[0]
            for pattern in paths[1:]:
                aliases.add(pattern, result)

    return aliases