diff --git a/dir_content_diff/__init__.py b/dir_content_diff/__init__.py index fa32604..17f6967 100644 --- a/dir_content_diff/__init__.py +++ b/dir_content_diff/__init__.py @@ -1,24 +1,25 @@ """Module containing the base functions of the dir-content-diff package.""" import copy -import filecmp import logging from pathlib import Path -from dir_content_diff.base_comparators import compare_json_files -from dir_content_diff.base_comparators import compare_pdf_files -from dir_content_diff.base_comparators import compare_yaml_files +from dir_content_diff.base_comparators import DefaultComparator +from dir_content_diff.base_comparators import JsonComparator +from dir_content_diff.base_comparators import PdfComparator +from dir_content_diff.base_comparators import YamlComparator from dir_content_diff.util import diff_msg_formatter from dir_content_diff.util import format_ext -from dir_content_diff.version import VERSION as __version__ +from dir_content_diff.version import VERSION as __version__ # noqa L = logging.getLogger(__name__) _DEFAULT_COMPARATORS = { - ".json": compare_json_files, - ".pdf": compare_pdf_files, - ".yaml": compare_yaml_files, - ".yml": compare_yaml_files, + None: DefaultComparator(), + ".json": JsonComparator(), + ".pdf": PdfComparator(), + ".yaml": YamlComparator(), + ".yml": YamlComparator(), } _COMPARATORS = {} @@ -43,9 +44,24 @@ def register_comparator(ext, comparator, force=False): Args: ext (str): The extension to register. - comparator (str): The comparator that should be associated with the given extension. + comparator (callable): The comparator that should be associated with the given extension. force (bool): If set to `True`, no exception is raised if the given `ext` is already registered. + + .. note:: + The given comparator should have the following signature: + + .. 
code-block:: python + + comparator( + ref_file: str, + comp_file: str, + *diff_args: Sequence[Any], + return_raw_diffs: bool=False, + **diff_kwargs: Mapping[str, Any], + ) -> Union[False, str] + + The return type can be Any when used with `return_raw_diffs == True`. """ ext = format_ext(ext) if not force and ext in _COMPARATORS: @@ -73,60 +89,45 @@ def unregister_comparator(ext, quiet=False): return _COMPARATORS.pop(ext, None) -def compare_files(ref_file, comp_file, comparator, specific_args=None): +def compare_files(ref_file, comp_file, comparator, *args, return_raw_diffs=False, **kwargs): """Compare 2 files and return the difference. Args: ref_file (str): Path to the reference file. comp_file (str): Path to the compared file. - comparator (callable): The comparator to use. - specific_args (dict): A dict with the args/kwargs that should be given to the comparator. - This dict should be like the following: - - .. code-block:: Python - - { - args: [arg1, arg2, ...], - kwargs: { - kwarg_name_1: kwarg_value_1, - kwarg_name_2: kwarg_value_2, - } - } + comparator (callable): The comparator to use (see in :func:`register_comparator` for the + comparator signature). + return_raw_diffs (bool): If set to True, only the raw differences are returned instead of a + formatted report. + *args: passed to the comparator. + **kwargs: passed to the comparator. Returns: - bool or str: True if the files are equal or a string with a message explaining the + bool or str: False if the files are equal or a string with a message explaining the differences if they are different. 
""" # Get the compared file L.debug("Compare: %s and %s", ref_file, comp_file) - # Get specific args and kwargs - if specific_args is None: - specific_args = {} - args = specific_args.get("args", []) - kwargs = specific_args.get("kwargs", {}) - - if comparator is not None: - # If the suffix has an associated comparator, use this comparator - try: - return comparator(ref_file, comp_file, *args, **kwargs) - except Exception as exception: # pylint: disable=broad-except - return diff_msg_formatter( - ref_file, - comp_file, - reason="\n".join(exception.args), - args=args, - kwargs=kwargs, - ) - else: - # If no comparator is known for this suffix, test with standard filecmp library - if not filecmp.cmp(ref_file, comp_file): - msg = diff_msg_formatter(ref_file, comp_file) - return msg - return True + if comparator is None: + # If the suffix has no associated comparator, use the default comparator + comparator = _COMPARATORS.get(None) + + try: + return comparator(ref_file, comp_file, *args, return_raw_diffs=return_raw_diffs, **kwargs) + except Exception as exception: # pylint: disable=broad-except + return diff_msg_formatter( + ref_file, + comp_file, + reason="Exception raised: " + "\n".join(exception.args), + args=args, + kwargs=kwargs, + ) -def compare_trees(ref_path, comp_path, comparators=None, specific_args=None): +def compare_trees( + ref_path, comp_path, comparators=None, specific_args=None, return_raw_diffs=False +): """Compare all files from 2 different directory trees and return the differences. .. note:: @@ -154,6 +155,8 @@ def compare_trees(ref_path, comp_path, comparators=None, specific_args=None): }, : {...} } + return_raw_diffs (bool): If set to True, only the raw differences are returned instead of a + formatted report. 
Returns: dict: A dict in which the keys are the relative file paths and the values are the @@ -179,13 +182,16 @@ def compare_trees(ref_path, comp_path, comparators=None, specific_args=None): comp_file = comp_path / relative_path if comp_file.exists(): + specific_file_args = specific_args.get(relative_path, {}) res = compare_files( ref_file, comp_file, - comparator=comparators.get(ref_file.suffix), - specific_args=specific_args.get(relative_path), + comparators.get(ref_file.suffix), + *specific_file_args.get("args", []), + return_raw_diffs=return_raw_diffs, + **specific_file_args.get("kwargs", {}), ) - if res is not True: + if res is not False: different_files[relative_path] = res else: msg = f"The file '{relative_path}' does not exist in '{comp_path}'." diff --git a/dir_content_diff/base_comparators.py b/dir_content_diff/base_comparators.py index 32c9eee..137c0a0 100644 --- a/dir_content_diff/base_comparators.py +++ b/dir_content_diff/base_comparators.py @@ -1,5 +1,8 @@ """Module containing the base comparators.""" +import filecmp import json +from abc import ABC +from abc import abstractmethod import dictdiffer import yaml @@ -13,108 +16,269 @@ "remove": "Removed the value(s) '{value}' from '{key}' key.", } -_MAX_COMPARE_LENGHT = 50 +class BaseComparator(ABC): + """Base Comparator class.""" -def _format_key(key): - if isinstance(key, str): - key = key.split(".") - if key == [""]: - key = [] - return "".join(f"[{k}]" for k in key) + # pylint: disable=no-self-use + def __init__( + self, + default_load_kwargs=None, + default_filter_kwargs=None, + default_format_kwargs=None, + default_report_kwargs=None, + ): + self._default_load_kwargs = default_load_kwargs or {} + self._default_filter_kwargs = default_filter_kwargs or {} + self._default_format_kwargs = default_format_kwargs or {} + self._default_report_kwargs = default_report_kwargs or {} -def _format_add_value(value): - return json.dumps(dict(sorted(value))) + def load(self, path, **kwargs): + """Load a file.""" + 
return path + @abstractmethod + def diff(self, ref, comp, *args, **kwargs): + """Perform the comparison between the reference data and the compared data. -def _format_remove_value(value): - return json.dumps(dict(sorted(value))) + .. note:: + This function must return either of the following: + * an iterable of differences between each data element (the iterable can be empty). + * a mapping of differences between each data element in which the keys can be an + element ID or a column name (the mapping can be empty). + * a boolean indicating whether the files are different (`True`) or not (`False`). + """ -def _format_change_value(value): - value = list(value) - for num, i in enumerate(value): - if isinstance(i, str): - value[num] = f"'{i}'" + def filter(self, differences, **kwargs): + """Define a filter to remove specific elements from the result differences.""" + return differences + + def format(self, difference, **kwargs): + """Format one element difference.""" + return difference + + def sort(self, differences, **kwargs): + """Sort the element differences.""" + return sorted(differences) + + def concatenate(self, differences): + """Concatenate the differences.""" + return "\n".join(differences) + + def report( + self, + ref_file, + comp_file, + formatted_differences, + diff_args, + diff_kwargs, + **kwargs, + ): + """Create a report from the formatted differences. + + .. note:: + This function must return a formatted report of the differences (usually as a string + but it can be any type). If the passed differences are `None`, the report should state + that the files are equal. 
+ """ + return diff_msg_formatter( + ref_file, + comp_file, + formatted_differences, + diff_args, + diff_kwargs, + ) + + def __call__( + self, + ref_file, + comp_file, + *diff_args, + return_raw_diffs=False, + load_kwargs=None, + filter_kwargs=None, + format_kwargs=None, + report_kwargs=None, + **diff_kwargs, + ): + """Perform the comparison between the reference file and the compared file. + + .. note:: + The workflow is the following: + * call `self.load()` to load the reference file. + * call `self.load()` to load the compared file. + * call `self.diff()` to compute the differences. + * if `return_raw_diffs`, the diffs are returned at this step. + * if the diffs are not just a boolean, the collection is: + * filtered by calling `self.filter()`. + * formatted by calling `self.format()` on each element. + * sorted by calling `self.sort()`. + * concatenated into one string by calling `self.concatenate()`. + * a report is generated by calling `self.report()`. + """ + if load_kwargs is None: + load_kwargs = self._default_load_kwargs + if filter_kwargs is None: + filter_kwargs = self._default_filter_kwargs + if format_kwargs is None: + format_kwargs = self._default_format_kwargs + if report_kwargs is None: + report_kwargs = self._default_report_kwargs + + ref = self.load(ref_file, **load_kwargs) + comp = self.load(comp_file, **load_kwargs) + diffs = self.diff(ref, comp, *diff_args, **diff_kwargs) + + if return_raw_diffs: + return diffs + + if not diffs: + formatted_diffs = False + elif diffs is True: + formatted_diffs = diffs else: - value[num] = str(i) - return value + filtered_diffs = self.filter(diffs, **filter_kwargs) + if hasattr(filtered_diffs, "items"): + formatted_diffs = self.concatenate( + self.sort([self.format(i, **format_kwargs) for i in filtered_diffs.items()]) + ) + else: + formatted_diffs = self.concatenate( + self.sort([self.format(i, **format_kwargs) for i in filtered_diffs]) + ) + return self.report( + ref_file, + comp_file, + formatted_diffs, + 
diff_args, + diff_kwargs, + **report_kwargs, + ) -def compare_dicts(ref, comp, *args, **kwargs): - """Compare two dictionaries. + def __eq__(self, other): + """Compare 2 BaseComparator instances.""" + if type(self) is not type(other) or self.__dict__.keys() != other.__dict__.keys(): + return False - This function call :func:`dictdiffer.diff` and format its output, read the doc of this - function for details on args and kwargs. + for k, v in self.__dict__.items(): + if other.__dict__[k] != v: + return False + + return True - Args: - ref (dict): The reference dictionary. - comp (dict): The compared dictionary. - Returns: - bool or str: ``True`` if the dictionaries are considered as equal or a string explaining - why they are not considered as equal. +class DefaultComparator(BaseComparator): + """The comparator used by default when none is registered for a given extension. + + This comparator only performs a binary comparison of the files. """ - format_mapping = { - "add": _format_add_value, - "remove": _format_remove_value, - "change": _format_change_value, + + def diff(self, ref, comp, *args, **kwargs): + """Compare binary data. + + This function calls :func:`filecmp.cmp`, read the doc of this function for details on + args and kwargs. 
+ """ + return not filecmp.cmp(ref, comp) + + +class DictComparator(BaseComparator): + """Comparator for dictionaries.""" + + _ACTION_MAPPING = { + "add": "Added the value(s) '{value}' in the '{key}' key.", + "change": "Changed the value of '{key}' from {value[0]} to {value[1]}.", + "remove": "Removed the value(s) '{value}' from '{key}' key.", } - if len(args) > 5: - dot_notation = args[5] - args = args[:5] + args[6:] - else: - dot_notation = kwargs.pop("dot_notation", False) - kwargs["dot_notation"] = dot_notation - res = list(dictdiffer.diff(ref, comp, *args, **kwargs)) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._format_mapping = { + "add": self._format_add_value, + "remove": self._format_remove_value, + "change": self._format_change_value, + } - if not res: - return True + @staticmethod + def _format_key(key): + if isinstance(key, str): + key = key.split(".") + if key == [""]: + key = [] + return "".join(f"[{k}]" for k in key) - res_formatted = sorted( - _ACTION_MAPPING[action].format(key=_format_key(key), value=format_mapping[action](value)) - for action, key, value in res[:_MAX_COMPARE_LENGHT] - ) - res_str = "\n".join(res_formatted) - return res_str + @staticmethod + def _format_add_value(value): + return json.dumps(dict(sorted(value))) + @staticmethod + def _format_remove_value(value): + return json.dumps(dict(sorted(value))) -def compare_json_files(ref_path, comp_path, *args, **kwargs): - """Compare data from two JSON files. + @staticmethod + def _format_change_value(value): + value = list(value) + for num, i in enumerate(value): + if isinstance(i, str): + value[num] = f"'{i}'" + else: + value[num] = str(i) + return value - This function calls :func:`compare_dicts`, read the doc of this function for details on args - and kwargs.
- """ - with open(ref_path) as file: - ref = json.load(file) - with open(comp_path) as file: - comp = json.load(file) - res = compare_dicts(ref, comp, *args, **kwargs) - return diff_msg_formatter(ref_path, comp_path, res, args, kwargs) + def diff(self, ref, comp, *args, **kwargs): + """Compare 2 dictionaries. + This function calls :func:`dictdiffer.diff`, read the doc of this function for details on + args and kwargs. + """ + if len(args) > 5: + dot_notation = args[5] + args = args[:5] + args[6:] + else: + dot_notation = kwargs.pop("dot_notation", False) + kwargs["dot_notation"] = dot_notation + return list(dictdiffer.diff(ref, comp, *args, **kwargs)) -def compare_yaml_files(ref_path, comp_path, *args, **kwargs): - """Compare data from two YAML files. + def format(self, difference): + """Format one element difference.""" + action, key, value = difference + return self._ACTION_MAPPING[action].format( + key=self._format_key(key), + value=self._format_mapping[action](value), + ) - This function calls :func:`compare_dicts`, read the doc of this function for details on args - and kwargs. - """ - with open(ref_path) as file: - ref = yaml.load(file) - with open(comp_path) as file: - comp = yaml.load(file) - res = compare_dicts(ref, comp, *args, **kwargs) - return diff_msg_formatter(ref_path, comp_path, res, args, kwargs) +class JsonComparator(DictComparator): + """Comparator for JSON files.""" -def compare_pdf_files(ref_path, comp_path, *args, **kwargs): - """Compare two PDF files.
+ def load(self, path): + """Open a JSON file.""" + with open(path) as file: # pylint: disable=unspecified-encoding + data = json.load(file) + return data - This function calls :func:`diff_pdf_visually.pdfdiff`, read the doc of this function for - details on args and kwargs here: - https://github.com/bgeron/diff-pdf-visually/blob/main/diff_pdf_visually/diff.py - """ - res = pdfdiff(ref_path, comp_path, *args, **kwargs) - return diff_msg_formatter(ref_path, comp_path, res, args, kwargs) + +class YamlComparator(DictComparator): + """Comparator for YAML files.""" + + def load(self, path): + """Open a YAML file.""" + with open(path) as file: # pylint: disable=unspecified-encoding + data = yaml.full_load(file) + return data + + +class PdfComparator(BaseComparator): + """Comparator for PDF files.""" + + def diff(self, ref, comp, *args, **kwargs): + """Compare data from two PDF files. + + This function calls :func:`diff_pdf_visually.pdfdiff`, read the doc of this function for + details on args and kwargs here: + https://github.com/bgeron/diff-pdf-visually/blob/main/diff_pdf_visually/diff.py + """ + return not pdfdiff(ref, comp, *args, **kwargs) diff --git a/dir_content_diff/pandas.py b/dir_content_diff/pandas.py index 742d8f4..55a02a9 100644 --- a/dir_content_diff/pandas.py +++ b/dir_content_diff/pandas.py @@ -5,6 +5,7 @@ raise ImportError("Could not import pandas package, please install it.") from exception from dir_content_diff import register_comparator +from dir_content_diff.base_comparators import BaseComparator from dir_content_diff.util import diff_msg_formatter @@ -39,126 +40,96 @@ def format_dataframe(comp, replace_pattern=None, ref=None): elif hasattr(comp[col], "str"): # If all values are NaN, Pandas casts the column dtype to float, so the str # attribute is not available.
- comp[col] = comp[col].str.replace(pattern, new_value, flags=flags) + comp[col] = comp[col].str.replace(pattern, new_value, flags=flags, regex=True) return res -def compare_dataframes(ref, comp, *args, ignore_columns=None, replace_pattern=None, **kwargs): - """Compare two :class:`Pandas.DataFrames`. - - This function calls :func:`pandas.testing.assert_series_equal`, read the doc of this function - for details on args and kwargs. - - Args: - ref (pandas.DataFrame): The reference DataFrame. - comp (pandas.DataFrame): The compared DataFrame. - ignore_columns (list(str)): The columns that should not be checked. - replace_pattern (dict): The columns that contain a given pattern which must be made - replaced. The dictionary must be as the following: - - .. code-block:: python - - { - (, , ): [col1, col2] - } - - Returns: - bool or str: ``True`` if the DataFrames are considered as equal or a string explaining why - they are not considered as equal. - """ - res = format_dataframe(comp, replace_pattern, ref=ref) - - if ignore_columns is not None: - ref.drop(columns=ignore_columns, inplace=True, errors="ignore") - comp.drop(columns=ignore_columns, inplace=True, errors="ignore") - - if replace_pattern is not None: - for pat, cols in replace_pattern.items(): - pattern = pat[0] - new_value = pat[1] - if len(pat) > 2: - flags = pat[2] - else: - flags = 0 - for col in cols: - if col not in ref.columns: - res[col] = ( - "The column is missing in the reference DataFrame, please fix the " - "'replace_pattern' argument." - ) - elif col not in comp.columns: - res[col] = ( - "The column is missing in the compared DataFrame, please fix the " - "'replace_pattern' argument." - ) - elif hasattr(comp[col], "str"): - # If all values are NaN, Pandas casts the column dtype to float, so the str - # attribute is not available. 
- comp[col] = comp[col].str.replace(pattern, new_value, flags=flags) - - for col in ref.columns: - if col in res: - continue - try: - if col not in comp.columns: - res[col] = "The column is missing in the compared DataFrame." - else: - pd.testing.assert_series_equal(ref[col], comp[col], *args, **kwargs) - res[col] = True - except AssertionError as e: - res[col] = e.args[0] - - for col in comp.columns: - if col not in res and col not in ref.columns: - res[col] = "The column is missing in the reference DataFrame." - - not_equals = {k: v for k, v in res.items() if v is not True} - if len(not_equals) == 0: - return True - return "\n".join([f"\nColumn '{k}': {v}" for k, v in not_equals.items()]) - - -def compare_csv_files( - ref_path, - comp_path, - *args, - ignore_columns=None, - replace_pattern=None, - read_csv_kwargs=None, - **kwargs, -): - """Compare data from two CSV / TSV / DAT files. - - This function calls :func:`compare_dataframes`, read the doc of this function for details on - args and kwargs. - - Args: - ref_path (str): The path to the reference CSV file. - comp_path (str): The path to the compared CSV file. - ignore_columns (list(str)): See :func:`compare_dataframes`. - replace_pattern (list(str)): See :func:`compare_dataframes`. - read_csv_kwargs (dict): The kwargs that should be passed to :func:`pandas.read_csv`. - - Returns: - bool or str: ``True`` if the DataFrames are considered as equal or a string explaining why - they are not considered as equal. 
- """ - if read_csv_kwargs is None: - read_csv_kwargs = {} - ref = pd.read_csv(ref_path, **read_csv_kwargs) - comp = pd.read_csv(comp_path, **read_csv_kwargs) - - res = compare_dataframes( - ref, - comp, - *args, - ignore_columns=ignore_columns, - replace_pattern=replace_pattern, - **kwargs, - ) - - return diff_msg_formatter(ref_path, comp_path, res, args, kwargs) +class DataframeComparator(BaseComparator): + """Comparator for :class:`Pandas.DataFrames` objects.""" + + def diff(self, ref, comp, *args, **kwargs): + """Compare two :class:`Pandas.DataFrames`. + + This function calls :func:`pandas.testing.assert_series_equal`, read the doc of this + function for details on args and kwargs. + + Args: + ref (pandas.DataFrame): The reference DataFrame. + comp (pandas.DataFrame): The compared DataFrame. + **ignore_columns (list(str)): (Optional) The columns that should not be checked. + **replace_pattern (dict): (Optional) The columns that contain a given pattern which + must be made replaced. + The dictionary must have the following format: + + .. code-block:: python + + { + (, , ): [col1, col2] + } + + Returns: + bool or str: ``False`` if the DataFrames are considered as equal or a string explaining + why they are not considered equal. + """ + ignore_columns = kwargs.pop("ignore_columns", None) + replace_pattern = kwargs.pop("replace_pattern", None) + + res = format_dataframe(comp, replace_pattern, ref=ref) + + if ignore_columns is not None: + ref.drop(columns=ignore_columns, inplace=True, errors="ignore") + comp.drop(columns=ignore_columns, inplace=True, errors="ignore") + + for col in ref.columns: + if col in res: + continue + try: + if col not in comp.columns: + res[col] = "The column is missing in the compared DataFrame." 
+ else: + pd.testing.assert_series_equal(ref[col], comp[col], *args, **kwargs) + res[col] = True + except AssertionError as e: + res[col] = e.args[0] + + for col in comp.columns: + if col not in res and col not in ref.columns: + res[col] = "The column is missing in the reference DataFrame." + + not_equals = {k: v for k, v in res.items() if v is not True} + if len(not_equals) == 0: + return False + return not_equals + + def format(self, difference): + """Format one element difference.""" + k, v = difference + return f"\nColumn '{k}': {v}" + + def sort(self, differences): + """Do not sort the differences to keep the column order.""" + return differences + + def report(self, ref_file, comp_file, formatted_differences, diff_args, diff_kwargs): + """Create a report from the formatted differences.""" + # if not isinstance(formatted_differences, bool): + # formatted_differences = "\n".join(formatted_differences) + return diff_msg_formatter( + ref_file, + comp_file, + formatted_differences, + diff_args, + diff_kwargs, + ) + + +class CsvComparator(DataframeComparator): + """Comparator for CSV files.""" + + def load(self, path, **kwargs): + """Load a CSV file into a :class:`Pandas.DataFrames`.""" + return pd.read_csv(path, **kwargs) def save_csv_file( @@ -174,7 +145,7 @@ def save_csv_file( Args: file_path (str): The path to the CSV file. file_dest (str): The path to the CSV file in which the formatted data will be exported. - replace_pattern (list(str)): See :func:`compare_dataframes`. + replace_pattern (list(str)): See :class:`DataframeComparator`. read_csv_kwargs (dict): The kwargs that should be passed to :func:`pandas.read_csv`. ref_path (str): The path to the reference CSV file if the formatting function needs it. to_csv_kwargs (dict): The kwargs that should be passed to :meth:`pandas.DataFrame.to_csv`. 
@@ -199,5 +170,5 @@ def save_csv_file( def register_pandas(): """Register Pandas extensions.""" - register_comparator(".csv", compare_csv_files) - register_comparator(".tsv", compare_csv_files) + register_comparator(".csv", CsvComparator()) + register_comparator(".tsv", CsvComparator()) diff --git a/dir_content_diff/util.py b/dir_content_diff/util.py index 78d0308..77146aa 100644 --- a/dir_content_diff/util.py +++ b/dir_content_diff/util.py @@ -23,18 +23,18 @@ def diff_msg_formatter(ref, comp, reason=None, args=None, kwargs=None): Args: ref (str): The path to the reference file. comp (str): The path to the compared file. - reason (bool or str): If the reason is True, True is returned. If it is a str, a formatted + reason (bool or str): If the reason is False, False is returned. If it is a str, a formatted message is returned. args (list): (optional) The args used for the comparison. kwargs (list): (optional) The kwargs used for the comparison. Returns: - True or the diff message. + False or the diff message. """ - if reason is True: - return True + if not reason: + return False - if reason is not None and reason is not False: + if reason is not None and reason is not True: reason_used = f"{reason}" else: reason_used = "" diff --git a/dir_content_diff/voxcell.py b/dir_content_diff/voxcell.py index 41503c2..c180e22 100644 --- a/dir_content_diff/voxcell.py +++ b/dir_content_diff/voxcell.py @@ -8,63 +8,77 @@ raise ImportError("Could not import voxcell package, please install it.") from e from dir_content_diff import register_comparator -from dir_content_diff.pandas import compare_dataframes -from dir_content_diff.util import diff_msg_formatter +from dir_content_diff.base_comparators import BaseComparator +from dir_content_diff.pandas import DataframeComparator -def compare_nrrd_files(ref_path, comp_path, precision=None): - """Compare data from two NRRD files. 
+class NrrdComparator(BaseComparator): + """Comparator for NRRD files.""" - Note: NRRD files can contain their creation date, so their hashes are depends on - this creation date, even if the data are the same. + def load(self, path, **kwargs): + """Load a NRRD file into a :class:`numpy.ndarray`.""" + return VoxelData.load_nrrd(path, **kwargs).raw - Args: - ref_path (str): The path to the reference CSV file. - comp_path (str): The path to the compared CSV file. - precision (int): The desired precision, default is 6. + def diff(self, ref, comp, precision=None): + """Compare data from two NRRD files. - Returns: - bool or str: ``True`` if the DataFrames are considered as equal or a string explaining why - they are not considered as equal. - """ - ref = VoxelData.load_nrrd(ref_path).raw - comp = VoxelData.load_nrrd(comp_path).raw - try: - if precision is not None: - np.testing.assert_array_almost_equal(ref, comp, decimal=precision) - else: - np.testing.assert_array_equal(ref, comp) - return True - except AssertionError as exception: - return diff_msg_formatter( - ref_path, comp_path, exception.args[0], kwargs={"precision": precision} + Note: NRRD files can contain their creation date, so their hashes are depends on + this creation date, even if the actual data are the same. This comparator only compares the + actual data in the files. + + Args: + ref_path (str): The path to the reference CSV file. + comp_path (str): The path to the compared CSV file. + precision (int): The desired precision, default is exact precision. + + Returns: + bool or str: ``False`` if the DataFrames are considered as equal or a string explaining + why they are not considered as equal. 
+ """ + try: + if precision is not None: + np.testing.assert_array_almost_equal(ref, comp, decimal=precision) + else: + np.testing.assert_array_equal(ref, comp) + return False + except AssertionError as exception: + return exception.args + + def format(self, difference): + """Format one element difference.""" + return difference + + def report(self, ref_file, comp_file, formatted_differences, diff_args, diff_kwargs, **kwargs): + """Create a report from the formatted differences.""" + if "precision" not in diff_kwargs: + diff_kwargs["precision"] = None + return super().report( + ref_file, + comp_file, + formatted_differences, + diff_args, + diff_kwargs, + **kwargs, ) -def compare_mvd3_files(ref_path, comp_path, *args, **kwargs): - """Compare data from two MVD3 files. +class Mvd3Comparator(DataframeComparator): + """Comparator for MVD3 files. Note: MVD3 files can contain their creation date, so their hashes are depends on this creation date, even if the data are the same. - This function calls :func:`dir_content_diff.pandas.compare_dataframes`, read the doc of this - function for details on args and kwargs. - - Args: - ref_path (str): The path to the reference CSV file. - comp_path (str): The path to the compared CSV file. - - Returns: - bool or str: ``True`` if the DataFrames are considered as equal or a string explaining why - they are not considered as equal. + The ``diff`` function of this comparator calls + :func:`dir_content_diff.pandas.compare_dataframes`, read the doc of this function for details + on args and kwargs. 
""" - ref = CellCollection.load_mvd3(ref_path).as_dataframe() - comp = CellCollection.load_mvd3(comp_path).as_dataframe() - res = compare_dataframes(ref, comp, *args, **kwargs) - return diff_msg_formatter(ref_path, comp_path, res, args, kwargs) + + def load(self, path, **kwargs): + """Load a MVD3 file into a :class:`Pandas.DataFrames`.""" + return CellCollection.load_mvd3(path, **kwargs).as_dataframe() def register_voxcell(): """Register Voxcell extensions.""" - register_comparator(".nrrd", compare_nrrd_files) - register_comparator(".mvd3", compare_mvd3_files) + register_comparator(".nrrd", NrrdComparator()) + register_comparator(".mvd3", Mvd3Comparator()) diff --git a/pyproject.toml b/pyproject.toml index c06b40f..d878f71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ target-version = ['py36'] # PYLINT [tool.pylint.messages-control] disable = [ + "arguments-differ", "invalid-name", "similarities", ] diff --git a/tests/generate_test_files.py b/tests/generate_test_files.py index 60478c2..46ba6ce 100644 --- a/tests/generate_test_files.py +++ b/tests/generate_test_files.py @@ -108,7 +108,7 @@ def create_json(filename, diff=False): data = copy.deepcopy(DIFF_DICT) else: data = copy.deepcopy(REF_DICT) - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: json.dump(data, f) @@ -118,7 +118,7 @@ def create_yaml(filename, diff=False): data = copy.deepcopy(DIFF_DICT) else: data = copy.deepcopy(REF_DICT) - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: yaml.dump(data, f) @@ -130,6 +130,6 @@ def create_pdf(filename, diff=False): data = copy.deepcopy(REF_RST) with tempfile.TemporaryDirectory() as tmp_dir: rst_file = Path(tmp_dir) / Path(filename.name).with_suffix(".rst") - with open(rst_file, "w") as f: + with open(rst_file, "w", encoding="utf-8") as f: f.write(data) rst2pdf.createpdf.main([str(rst_file), "-o", str(filename)]) diff --git a/tests/test_base.py b/tests/test_base.py index 
8cffa81..12c0d6a 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -3,8 +3,10 @@ # pylint: disable=no-self-use # pylint: disable=redefined-outer-name # pylint: disable=unused-argument +import json import re +import dictdiffer import pytest import dir_content_diff @@ -12,43 +14,292 @@ from dir_content_diff import compare_trees +class TestBaseComparator: + """Test the base comparator.""" + + def test_equal(self): + assert dir_content_diff.JsonComparator() != dir_content_diff.PdfComparator() + assert dir_content_diff.JsonComparator() == dir_content_diff.JsonComparator() + + class ComparatorWithAttributes(dir_content_diff.base_comparators.BaseComparator): + """Compare data from two JSON files.""" + + def __init__(self, arg1, arg2): + super().__init__() + self.arg1 = arg1 + if arg2: + self.arg2 = arg2 + + def diff(self, ref, comp, *args, **kwargs): + return False + + assert ComparatorWithAttributes(1, 2) == ComparatorWithAttributes(1, 2) + assert ComparatorWithAttributes(1, 2) != ComparatorWithAttributes(3, 4) + assert ComparatorWithAttributes(1, 2) != ComparatorWithAttributes(1, None) + + def test_load_kwargs(self, ref_tree, res_tree_diff): + class ComparatorWithLoader(dir_content_diff.base_comparators.JsonComparator): + """Compare data from two JSON files.""" + + def load(self, path, load_empty=False): + if load_empty: + return {} + return super().load(path) + + ref_file = ref_tree / "file.json" + res_file = res_tree_diff / "file.json" + + diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithLoader(), + ) + + no_load_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithLoader(), + load_kwargs={"load_empty": False}, + ) + + no_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithLoader(), + load_kwargs={"load_empty": True}, + ) + + no_diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithLoader(default_load_kwargs={"load_empty": True}), + ) + + 
diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithLoader(default_load_kwargs={"load_empty": True}), + load_kwargs={"load_empty": False}, + ) + + assert diff == no_load_diff + assert diff is not False + assert no_diff is False + assert no_diff_default is False + assert diff_default == diff + + def test_filter_kwargs(self, ref_tree, res_tree_diff): + class ComparatorWithFilter(dir_content_diff.base_comparators.JsonComparator): + """Compare data from two JSON files.""" + + def filter(self, differences, remove_all=False): + if remove_all: + return [] + return differences + + ref_file = ref_tree / "file.json" + res_file = res_tree_diff / "file.json" + + diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFilter(), + ) + + no_filter_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFilter(), + filter_kwargs={"remove_all": False}, + ) + + no_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFilter(), + filter_kwargs={"remove_all": True}, + ) + + no_diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFilter(default_filter_kwargs={"remove_all": True}), + ) + + diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFilter(default_filter_kwargs={"remove_all": True}), + filter_kwargs={"remove_all": False}, + ) + + assert diff == no_filter_diff + assert diff is not False + assert no_diff is False + assert no_diff_default is False + assert diff_default == diff + + def test_format_kwargs(self, ref_tree, res_tree_diff): + class ComparatorWithFormat(dir_content_diff.base_comparators.JsonComparator): + """Compare data from two JSON files.""" + + def format(self, difference, mark_formatted=False): + """Format one element difference.""" + difference = super().format(difference) + if mark_formatted: + difference += "### FORMATTED" + return difference + + ref_file = ref_tree / "file.json" + res_file = 
res_tree_diff / "file.json" + + diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFormat(), + ) + + no_format_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFormat(), + format_kwargs={"mark_formatted": False}, + ) + + formatted_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFormat(), + format_kwargs={"mark_formatted": True}, + ) + + formatted_diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFormat(default_format_kwargs={"mark_formatted": True}), + ) + + diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithFormat(default_format_kwargs={"mark_formatted": True}), + format_kwargs={"mark_formatted": False}, + ) + + assert diff == no_format_diff + assert len(re.findall("### FORMATTED", diff)) == 0 + assert len(re.findall("### FORMATTED", formatted_diff)) == 25 + assert len(re.findall("### FORMATTED", formatted_diff_default)) == 25 + assert diff_default == diff + + def test_report_kwargs(self, ref_tree, res_tree_diff): + class ComparatorWithReport(dir_content_diff.base_comparators.JsonComparator): + """Compare data from two JSON files.""" + + def report( + self, + ref_file, + comp_file, + formatted_differences, + diff_args, + diff_kwargs, + mark_report=False, + ): + report = super().report( + ref_file, + comp_file, + formatted_differences, + diff_args, + diff_kwargs, + ) + if mark_report: + report += "### REPORTED" + return report + + ref_file = ref_tree / "file.json" + res_file = res_tree_diff / "file.json" + + diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithReport(), + ) + + no_report_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithReport(), + report_kwargs={"mark_report": False}, + ) + + reported_diff = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithReport(), + report_kwargs={"mark_report": True}, + ) + + 
reported_diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithReport(default_report_kwargs={"mark_report": True}), + ) + + no_report_diff_default = dir_content_diff.compare_files( + ref_file, + res_file, + ComparatorWithReport(default_report_kwargs={"mark_report": True}), + report_kwargs={"mark_report": False}, + ) + + assert diff == no_report_diff + assert len(re.findall("### REPORTED", diff)) == 0 + assert len(re.findall("### REPORTED", reported_diff)) == 1 + assert len(re.findall("### REPORTED", reported_diff_default)) == 1 + assert no_report_diff_default == diff + + class TestRegistry: """Test the internal registry.""" def test_init_register(self, registry_reseter): """Test the initial registry with the get_comparators() function.""" assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), } def test_update_register(self, registry_reseter): """Test the functions to update the registry.""" - dir_content_diff.register_comparator(".test_ext", dir_content_diff.compare_json_files) + dir_content_diff.register_comparator(".test_ext", dir_content_diff.JsonComparator()) assert dir_content_diff.get_comparators() == { - ".test_ext": dir_content_diff.compare_json_files, - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, + None: dir_content_diff.DefaultComparator(), + ".test_ext": dir_content_diff.JsonComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + 
".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), } dir_content_diff.unregister_comparator(".yaml") dir_content_diff.unregister_comparator("json") # Test suffix without dot assert dir_content_diff.get_comparators() == { - ".test_ext": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yml": dir_content_diff.compare_yaml_files, + None: dir_content_diff.DefaultComparator(), + ".test_ext": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yml": dir_content_diff.YamlComparator(), } dir_content_diff.reset_comparators() assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), } with pytest.raises( @@ -58,29 +309,31 @@ def test_update_register(self, registry_reseter): "replaced." 
), ): - dir_content_diff.register_comparator(".pdf", dir_content_diff.compare_json_files) + dir_content_diff.register_comparator(".pdf", dir_content_diff.JsonComparator()) with pytest.raises(ValueError, match=("The '.unknown_ext' extension is not registered.")): dir_content_diff.unregister_comparator(".unknown_ext") dir_content_diff.unregister_comparator(".unknown_ext", quiet=True) - dir_content_diff.register_comparator(".new_ext", dir_content_diff.compare_json_files) + dir_content_diff.register_comparator(".new_ext", dir_content_diff.JsonComparator()) assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, - ".new_ext": dir_content_diff.compare_json_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), + ".new_ext": dir_content_diff.JsonComparator(), } dir_content_diff.register_comparator( - ".new_ext", dir_content_diff.compare_pdf_files, force=True + ".new_ext", dir_content_diff.PdfComparator(), force=True ) assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, - ".new_ext": dir_content_diff.compare_pdf_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), + ".new_ext": dir_content_diff.PdfComparator(), } @@ -205,7 +458,7 @@ def bad_comparator(ref_path, test_path, *args, **kwargs): assert len(res) == 1 match = re.match( r"The files '\S*/ref/file.yaml' and 
'\S*/res/file.yaml' are different:\n" - r"Bad\ncomparator", + r"Exception raised: Bad\ncomparator", res["file.yaml"], ) assert match is not None @@ -265,3 +518,41 @@ def test_fix_dot_notation(self, ref_tree, res_tree_diff, pdf_diff, dict_diff): for match_i in [match_res_0, match_res_1, match_res_2]: assert match_i is not None + + def test_format_inside_diff(self, ref_tree, res_tree_diff, dict_diff): + class JsonComparator(dir_content_diff.base_comparators.BaseComparator): + """Compare data from two JSON files.""" + + def load(self, path, *args, **kwargs): + with open(path) as file: # pylint: disable=unspecified-encoding + data = json.load(file) + return data + + def diff(self, ref, comp, *args, **kwargs): + diffs = list(dictdiffer.diff(ref, comp, *args, dot_notation=False, **kwargs)) + + # Format here instead of overriding the default format method + comparator = dir_content_diff.base_comparators.JsonComparator() + formatted = [comparator.format(i) for i in diffs] + + return formatted + + res = compare_trees(ref_tree, res_tree_diff, comparators={".json": JsonComparator()}) + + match = re.match(dict_diff, res["file.json"]) + + assert match is not None + + +class TestProgrammaticUse: + """Test specific comparators that could be used programmatically.""" + + def test_diff_tree(self, ref_tree, res_tree_diff, pdf_diff, dict_diff): + res = compare_trees(ref_tree, res_tree_diff, return_raw_diffs=True) + + res_json = res["file.json"] + + assert len(res_json) == 25 + assert len(list(filter(lambda x: x[0] == "change", res_json))) == 17 + assert len(list(filter(lambda x: x[0] == "add", res_json))) == 4 + assert len(list(filter(lambda x: x[0] == "remove", res_json))) == 4 diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 32787af..3d162c4 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -19,20 +19,22 @@ class TestRegistry: def test_pandas_register(self, registry_reseter): assert dir_content_diff.get_comparators() == { - ".json": 
dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), } dir_content_diff.pandas.register_pandas() assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, - ".csv": dir_content_diff.pandas.compare_csv_files, - ".tsv": dir_content_diff.pandas.compare_csv_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), + ".csv": dir_content_diff.pandas.CsvComparator(), + ".tsv": dir_content_diff.pandas.CsvComparator(), } @@ -155,7 +157,8 @@ def test_replace_pattern( assert len(res) == 1 res_csv = res["file.csv"] match_res = re.match( - r"The files '\S*/ref/file.csv' and '\S*/res/file.csv' are different:\n\n" + r"The files '\S*/ref/file.csv' and '\S*/res/file.csv' are different:\n" + r"Kwargs used: {'replace_pattern': {.*}}\n\n" r"Column 'test_path_only_in_ref': The column is missing in the compared DataFrame, " r"please fix the 'replace_pattern' argument.\n\n" r"Column 'test_path_only_in_res': The column is missing in the reference DataFrame, " @@ -184,7 +187,8 @@ def test_replace_pattern( assert len(res) == 1 res_csv = res["file.csv"] match_res = re.match( - r"The files '\S*/ref/file.csv' and '\S*/res/file.csv' are different:\n\n" + r"The files '\S*/ref/file.csv' and '\S*/res/file.csv' are different:\n" + r"Kwargs used: {'replace_pattern': {.*}}\n\n" r"Column 
'test_path_only_in_ref': The column is missing in the compared DataFrame, " r"please fix the 'replace_pattern' argument.\n\n" r"Column 'test_path_only_in_res': The column is missing in the reference DataFrame, " @@ -204,7 +208,8 @@ def test_replace_pattern( assert len(res) == 1 res_csv = res["file.csv"] match_res = re.match( - r"The files '\S*/ref/file.csv' and '\S*/res/file.csv' are different:\n\n" + r"The files '\S*/ref/file.csv' and '\S*/res/file.csv' are different:\n" + r"Kwargs used: {'replace_pattern': {.*}}\n\n" r"Column 'test_path_only_in_ref': The column is missing in the compared DataFrame, " r"please fix the 'replace_pattern' argument.\n\n" r"Column 'test_path_only_in_res': The column is missing in the reference DataFrame, " @@ -291,7 +296,7 @@ def test_read_csv_kwargs( ): specific_args = { "file.csv": { - "kwargs": {"read_csv_kwargs": {"header": None, "skiprows": 1, "prefix": "col_"}} + "kwargs": {"load_kwargs": {"header": None, "skiprows": 1, "prefix": "col_"}} } } res = compare_trees(ref_tree, res_tree_diff, specific_args=specific_args) diff --git a/tests/test_voxcell.py b/tests/test_voxcell.py index f122eb5..1426567 100644 --- a/tests/test_voxcell.py +++ b/tests/test_voxcell.py @@ -20,20 +20,22 @@ class TestRegistry: def test_voxcell_register(self, registry_reseter): assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), } dir_content_diff.voxcell.register_voxcell() assert dir_content_diff.get_comparators() == { - ".json": dir_content_diff.compare_json_files, - ".pdf": dir_content_diff.compare_pdf_files, - ".yaml": 
dir_content_diff.compare_yaml_files, - ".yml": dir_content_diff.compare_yaml_files, - ".nrrd": dir_content_diff.voxcell.compare_nrrd_files, - ".mvd3": dir_content_diff.voxcell.compare_mvd3_files, + None: dir_content_diff.DefaultComparator(), + ".json": dir_content_diff.JsonComparator(), + ".pdf": dir_content_diff.PdfComparator(), + ".yaml": dir_content_diff.YamlComparator(), + ".yml": dir_content_diff.YamlComparator(), + ".nrrd": dir_content_diff.voxcell.NrrdComparator(), + ".mvd3": dir_content_diff.voxcell.Mvd3Comparator(), }