diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index a0b09bf1..eca4bc58 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -200,77 +200,95 @@ def tables(self) -> typing.List[str]: """ return self._df[self._df["type"] == define.DependType.META].index.tolist() - def archive(self, file: str) -> str: - r"""Name of archive the file belongs to. + def archive( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[str, typing.List[str]]: + r"""Name of archive a file belongs to. Args: - file: relative file path + files: relative file path(s) Returns: - archive name + archive name(s) """ - return self._df.archive[file] + return self._column_loc("archive", files) - def bit_depth(self, file: str) -> int: + def bit_depth( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[int, typing.List[int]]: r"""Bit depth of media file. Args: - file: relative file path + files: relative file path(s) Returns: - bit depth + bit depth(s) """ - return self._column_loc("bit_depth", file, int) + return self._column_loc("bit_depth", files, int) - def channels(self, file: str) -> int: + def channels( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[int, typing.List[int]]: r"""Number of channels of media file. Args: - file: relative file path + files: relative file path(s) Returns: - number of channels + number(s) of channels """ - return self._column_loc("channels", file, int) + return self._column_loc("channels", files, int) - def checksum(self, file: str) -> str: + def checksum( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[str, typing.List[str]]: r"""Checksum of file. 
Args: - file: relative file path + files: relative file path(s) Returns: - checksum of file + checksum of file(s) """ - return self._column_loc("checksum", file) + return self._column_loc("checksum", files) - def duration(self, file: str) -> float: + def duration( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[float, typing.List[float]]: r"""Duration of file. Args: - file: relative file path + files: relative file path(s) Returns: - duration in seconds + duration(s) in seconds """ - return self._column_loc("duration", file, float) + return self._column_loc("duration", files, float) - def format(self, file: str) -> str: + def format( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[str, typing.List[str]]: r"""Format of file. Args: - file: relative file path + files: relative file path(s) Returns: - file format (always lower case) + file format(s) (always lower case) """ - return self._column_loc("format", file) + return self._column_loc("format", files) def load(self, path: str): r"""Read dependencies from file. @@ -321,29 +339,35 @@ def load(self, path: str): ) self._df.index = self._df.index.astype("string") - def removed(self, file: str) -> bool: + def removed( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[bool, typing.List[bool]]: r"""Check if file is marked as removed. Args: - file: relative file path + files: relative file path(s) Returns: ``True`` if file was removed """ - return self._column_loc("removed", file, bool) + return self._column_loc("removed", files, bool) - def sampling_rate(self, file: str) -> int: + def sampling_rate( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[int, typing.List[int]]: r"""Sampling rate of media file. 
Args: - file: relative file path + files: relative file path(s) Returns: - sampling rate in Hz + sampling rate(s) in Hz """ - return self._column_loc("sampling_rate", file, int) + return self._column_loc("sampling_rate", files, int) def save(self, path: str): r"""Write dependencies to file. @@ -362,29 +386,35 @@ def save(self, path: str): protocol=4, # supported by Python >= 3.4 ) - def type(self, file: str) -> int: + def type( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[int, typing.List[int]]: r"""Type of file. Args: - file: relative file path + files: relative file path(s) Returns: - type + type(s) """ - return self._column_loc("type", file, int) + return self._column_loc("type", files, int) - def version(self, file: str) -> str: + def version( + self, + files: typing.Union[str, typing.Sequence[str]], + ) -> typing.Union[str, typing.List[str]]: r"""Version of file. Args: - file: relative file path + files: relative file path(s) Returns: - version string + version string(s) """ - return self._column_loc("version", file) + return self._column_loc("version", files) def _add_attachment( self, @@ -487,10 +517,21 @@ def _column_loc( dtype: typing.Callable = None, ) -> typing.Union[typing.Any, typing.List[typing.Any]]: r"""Column content for selected files.""" - value = self._df.at[files, column] - if dtype is not None: - value = dtype(value) - return value + # Single file + if isinstance(files, str): + value = self._df.at[files, column] + if dtype is not None: + value = dtype(value) + return value + + # Multiple files + else: + values = self._df.loc[files, column] + if dtype is not None: + values = [dtype(value) for value in values] + else: + values = values.tolist() + return values def _drop(self, files: typing.Sequence[str]): r"""Drop files from table. 
diff --git a/benchmarks/README.md b/benchmarks/README.md index aa05d728..57160cb2 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -51,34 +51,44 @@ using `pyarrow` dtypes). |-------------------------------------------------|----------|----------|-----------| | Dependencies.\_\_call__() | 0.000 | 0.000 | 0.000 | | Dependencies.\_\_contains__(10000 files) | 0.005 | 0.004 | 0.004 | -| Dependencies.\_\_get_item__(10000 files) | 0.322 | 0.224 | 0.900 | +| Dependencies.\_\_get_item__(10000 files) | 0.316 | 0.219 | 0.903 | | Dependencies.\_\_len__() | 0.000 | 0.000 | 0.000 | -| Dependencies.\_\_str__() | 0.006 | 0.005 | 0.006 | -| Dependencies.archives | 0.144 | 0.116 | 0.152 | -| Dependencies.attachments | 0.030 | 0.018 | 0.018 | -| Dependencies.attachment_ids | 0.029 | 0.018 | 0.018 | -| Dependencies.files | 0.030 | 0.011 | 0.046 | -| Dependencies.media | 0.129 | 0.073 | 0.095 | -| Dependencies.removed_media | 0.117 | 0.070 | 0.087 | -| Dependencies.table_ids | 0.037 | 0.026 | 0.023 | -| Dependencies.tables | 0.029 | 0.017 | 0.017 | -| Dependencies.archive(10000 files) | 0.045 | 0.042 | 0.065 | -| Dependencies.bit_depth(10000 files) | 0.024 | 0.024 | 0.045 | -| Dependencies.channels(10000 files) | 0.023 | 0.023 | 0.045 | -| Dependencies.checksum(10000 files) | 0.026 | 0.023 | 0.047 | -| Dependencies.duration(10000 files) | 0.023 | 0.023 | 0.043 | -| Dependencies.format(10000 files) | 0.026 | 0.023 | 0.047 | -| Dependencies.removed(10000 files) | 0.023 | 0.023 | 0.043 | -| Dependencies.sampling_rate(10000 files) | 0.023 | 0.023 | 0.043 | -| Dependencies.type(10000 files) | 0.023 | 0.023 | 0.043 | -| Dependencies.version(10000 files) | 0.026 | 0.023 | 0.047 | -| Dependencies._add_attachment() | 0.055 | 0.062 | 0.220 | -| Dependencies._add_media(10000 files) | 0.057 | 0.057 | 0.066 | -| Dependencies._add_meta() | 0.117 | 0.129 | 0.145 | -| Dependencies._drop() | 0.075 | 0.078 | 0.121 | -| Dependencies._remove() | 0.061 | 0.069 | 0.064 | -| 
Dependencies._update_media() | 0.087 | 0.086 | 0.145 | -| Dependencies._update_media_version(10000 files) | 0.011 | 0.011 | 0.020 | +| Dependencies.\_\_str__() | 0.005 | 0.005 | 0.006 | +| Dependencies.archives | 0.143 | 0.118 | 0.144 | +| Dependencies.attachments | 0.029 | 0.018 | 0.018 | +| Dependencies.attachment_ids | 0.028 | 0.017 | 0.017 | +| Dependencies.files | 0.030 | 0.011 | 0.043 | +| Dependencies.media | 0.132 | 0.071 | 0.086 | +| Dependencies.removed_media | 0.123 | 0.068 | 0.081 | +| Dependencies.table_ids | 0.035 | 0.025 | 0.023 | +| Dependencies.tables | 0.028 | 0.017 | 0.017 | +| Dependencies.archive(10000 files) | 0.028 | 0.025 | 0.047 | +| Dependencies.archive([10000 files]) | 0.134 | 0.008 | 0.224 | +| Dependencies.bit_depth(10000 files) | 0.026 | 0.024 | 0.045 | +| Dependencies.bit_depth([10000 files]) | 0.140 | 0.002 | 0.224 | +| Dependencies.channels(10000 files) | 0.025 | 0.024 | 0.044 | +| Dependencies.channels([10000 files]) | 0.140 | 0.002 | 0.224 | +| Dependencies.checksum(10000 files) | 0.027 | 0.025 | 0.047 | +| Dependencies.checksum([10000 files]) | 0.142 | 0.002 | 0.220 | +| Dependencies.duration(10000 files) | 0.025 | 0.025 | 0.044 | +| Dependencies.duration([10000 files]) | 0.139 | 0.002 | 0.223 | +| Dependencies.format(10000 files) | 0.027 | 0.023 | 0.047 | +| Dependencies.format([10000 files]) | 0.142 | 0.002 | 0.221 | +| Dependencies.removed(10000 files) | 0.025 | 0.024 | 0.044 | +| Dependencies.removed([10000 files]) | 0.140 | 0.002 | 0.224 | +| Dependencies.sampling_rate(10000 files) | 0.026 | 0.024 | 0.045 | +| Dependencies.sampling_rate([10000 files]) | 0.140 | 0.002 | 0.223 | +| Dependencies.type(10000 files) | 0.026 | 0.024 | 0.045 | +| Dependencies.type([10000 files]) | 0.141 | 0.002 | 0.225 | +| Dependencies.version(10000 files) | 0.027 | 0.023 | 0.047 | +| Dependencies.version([10000 files]) | 0.142 | 0.002 | 0.226 | +| Dependencies._add_attachment() | 0.059 | 0.062 | 0.210 | +| Dependencies._add_media(10000 files) | 
0.058 | 0.057 | 0.066 | +| Dependencies._add_meta() | 0.115 | 0.133 | 0.145 | +| Dependencies._drop() | 0.075 | 0.076 | 0.118 | +| Dependencies._remove() | 0.062 | 0.069 | 0.065 | +| Dependencies._update_media() | 0.087 | 0.090 | 0.144 | +| Dependencies._update_media_version(10000 files) | 0.011 | 0.011 | 0.021 | ## audb.Dependencies loading/writing to file diff --git a/benchmarks/benchmark-dependencies-methods.py b/benchmarks/benchmark-dependencies-methods.py index e5423403..e4efe65d 100644 --- a/benchmarks/benchmark-dependencies-methods.py +++ b/benchmarks/benchmark-dependencies-methods.py @@ -253,60 +253,120 @@ def astype(df, dtype): t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.archive([{n_files} files])" + t0 = time.time() + deps.archive(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.bit_depth({n_files} files)" t0 = time.time() [deps.bit_depth(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.bit_depth([{n_files} files])" + t0 = time.time() + deps.bit_depth(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.channels({n_files} files)" t0 = time.time() [deps.channels(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.channels([{n_files} files])" + t0 = time.time() + deps.channels(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.checksum({n_files} files)" t0 = time.time() [deps.checksum(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.checksum([{n_files} files])" + t0 = time.time() + deps.checksum(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.duration({n_files} files)" t0 = time.time() [deps.duration(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = 
f"Dependencies.duration([{n_files} files])" + t0 = time.time() + deps.duration(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.format({n_files} files)" t0 = time.time() [deps.format(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.format([{n_files} files])" + t0 = time.time() + deps.format(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.removed({n_files} files)" t0 = time.time() [deps.removed(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.removed([{n_files} files])" + t0 = time.time() + deps.removed(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.sampling_rate({n_files} files)" t0 = time.time() [deps.sampling_rate(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.sampling_rate([{n_files} files])" + t0 = time.time() + deps.sampling_rate(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.type({n_files} files)" t0 = time.time() [deps.type(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.type([{n_files} files])" + t0 = time.time() + deps.type(_files) + t = time.time() - t0 + results.at[method, dtype] = t + method = f"Dependencies.version({n_files} files)" t0 = time.time() [deps.version(file) for file in _files] t = time.time() - t0 results.at[method, dtype] = t + method = f"Dependencies.version([{n_files} files])" + t0 = time.time() + deps.version(_files) + t = time.time() - t0 + results.at[method, dtype] = t + # ------------------------------------------------------------------------- method = "Dependencies._add_attachment()" t0 = time.time() diff --git a/tests/test_dependencies.py b/tests/test_dependencies.py index c295fd4b..05d36402 100644 --- a/tests/test_dependencies.py +++ 
b/tests/test_dependencies.py @@ -36,14 +36,6 @@ ] -def get_entries(column): - return [row[column] for row in ROWS] - - -def test_get_entries(): - assert get_entries("archive") == ["archive1", "archive2"] - - @pytest.fixture( scope="function", ) @@ -122,11 +114,11 @@ def test_get_item(deps): def test_archives(deps): - assert deps.archives == get_entries("archive") + assert deps.archives == ["archive1", "archive2"] def test_files(deps): - assert deps.files == get_entries("file") + assert deps.files == ["db.files.csv", "file.wav"] def test_media(deps): @@ -145,74 +137,95 @@ def test_tables(deps): assert deps.tables == ["db.files.csv"] -def test_archive(deps): - files = get_entries("file") - archives = get_entries("archive") - for file, archive in zip(files, archives): - assert deps.archive(file) == archive - assert isinstance(deps.archive(file), str) - with pytest.raises(KeyError, match="non.existing"): - deps.archive("non.existing") - - -def test_bit_depth(deps): - files = get_entries("file") - bit_depths = get_entries("bit_depth") - for file, bit_depth in zip(files, bit_depths): - assert deps.bit_depth(file) == bit_depth - assert isinstance(deps.bit_depth(file), int) - with pytest.raises(KeyError, match="non.existing"): - deps.bit_depth("non.existing") - - -def test_channels(deps): - files = get_entries("file") - channels = get_entries("channels") - for file, channel in zip(files, channels): - assert deps.channels(file) == channel - assert isinstance(deps.channels(file), int) - with pytest.raises(KeyError, match="non.existing"): - deps.channels("non.existing") - - -def test_checksum(deps): - files = get_entries("file") - checksums = get_entries("checksum") - for file, checksum in zip(files, checksums): - assert deps.checksum(file) == checksum - assert isinstance(deps.checksum(file), str) - with pytest.raises(KeyError, match="non.existing"): - deps.checksum("non.existing") - - -def test_duration(deps): - files = get_entries("file") - durations = 
get_entries("duration") - for file, duration in zip(files, durations): - assert deps.duration(file) == duration - assert isinstance(deps.duration(file), float) - with pytest.raises(KeyError, match="non.existing"): - deps.duration("non.existing") +@pytest.mark.parametrize( + "files", + [ + "", + "non-existing", + [""], + ["non-existing"], + ], +) +@pytest.mark.parametrize( + "method, expected_error", + [ + ("archive", KeyError), + ("bit_depth", KeyError), + ("channels", KeyError), + ("checksum", KeyError), + ("duration", KeyError), + ("format", KeyError), + ("removed", KeyError), + ("sampling_rate", KeyError), + ("type", KeyError), + ("version", KeyError), + ], +) +def test_error_file_based_methods(deps, files, method, expected_error): + """Test errors for all file based methods of audb.Dependencies. + Test all methods that have ``files`` as argument, + and return an entry from a column + of the dependency table + for the selected files. -def test_format(deps): - files = get_entries("file") - formats = get_entries("format") - for file, format in zip(files, formats): - assert deps.format(file) == format - assert isinstance(deps.format(file), str) - with pytest.raises(KeyError, match="non.existing"): - deps.format("non.existing") + """ + deps_method = getattr(deps, method) + with pytest.raises(expected_error): + deps_method(files) -def test_removed(deps): - files = get_entries("file") - removeds = get_entries("removed") - for file, removed in zip(files, removeds): - assert deps.removed(file) == removed - assert isinstance(deps.removed(file), bool) - with pytest.raises(KeyError, match="non.existing"): - deps.removed("non.existing") +@pytest.mark.parametrize( + "files", + [ + "db.files.csv", + "file.wav", + ["db.files.csv"], + ["db.files.csv", "file.wav"], + ], +) +@pytest.mark.parametrize( + "method, expected_dtype", + [ + ("archive", str), + ("bit_depth", int), + ("channels", int), + ("checksum", str), + ("duration", float), + ("format", str), + ("removed", bool), + 
("sampling_rate", int), + ("type", int), + ("version", str), + ], +) +def test_file_bases_methods(deps, files, method, expected_dtype): + """Test all file based methods of audb.Dependencies. + + Test all methods that have ``files`` as argument, + and return an entry from a column + of the dependency table + for the selected files. + + """ + deps_method = getattr(deps, method) + result = deps_method(files) + if not isinstance(files, list): + for row in ROWS: + if row["file"] == files: + assert result == row[method] + assert isinstance(result, expected_dtype) + break + else: + expected = [] + for file in files: + for row in ROWS: + if row["file"] == file: + expected.append(row[method]) + break + assert result == expected + for result in result: + assert isinstance(result, expected_dtype) def test_load_save(deps): @@ -231,36 +244,6 @@ def test_load_save(deps): deps.load(deps_file) -def test_sampling_rate(deps): - files = get_entries("file") - sampling_rates = get_entries("sampling_rate") - for file, sampling_rate in zip(files, sampling_rates): - assert deps.sampling_rate(file) == sampling_rate - assert isinstance(deps.sampling_rate(file), int) - with pytest.raises(KeyError, match="non.existing"): - deps.sampling_rate("non.existing") - - -def test_type(deps): - files = get_entries("file") - types = get_entries("type") - for file, type in zip(files, types): - assert deps.type(file) == type - assert isinstance(deps.type(file), int) - with pytest.raises(KeyError, match="non.existing"): - deps.type("non.existing") - - -def test_version(deps): - files = get_entries("file") - versions = get_entries("version") - for file, version in zip(files, versions): - assert deps.version(file) == version - assert isinstance(deps.version(file), str) - with pytest.raises(KeyError, match="non.existing"): - deps.version("non.existing") - - def test_len(deps): assert len(deps) == len(ROWS)