From cd4c96c94c6424628de2ccf2eb503d6933eaad9d Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:17:14 +0200 Subject: [PATCH 01/12] add config.OCRD_DOWNLOAD_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 6 ++++-- src/ocrd_utils/config.py | 18 +++++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 322cbde19f..bf262b0b96 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -29,6 +29,8 @@ \b {config.describe('OCRD_DOWNLOAD_TIMEOUT')} \b +{config.describe('OCRD_DOWNLOAD_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..78f8b12374 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -29,6 +29,7 @@ VERSION as OCRD_VERSION, MIMETYPE_PAGE, MIME_TO_EXT, + config, getLogger, initLogging, list_resource_candidates, @@ -111,7 +112,7 @@ def __init__( input_file_grp=None, output_file_grp=None, page_id=None, - download_files=True, + download_files=config.OCRD_DOWNLOAD_INPUT, version=None ): """ @@ -137,7 +138,8 @@ def __init__( (or empty for all pages). \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - download_files (boolean): Whether input files will be downloaded prior to processing. + download_files (boolean): Whether input files will be downloaded prior to processing, \ + defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT` which is ``True`` by default """ if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d0955a8dcf..22a566e7bc 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -12,6 +12,8 @@ from tempfile import gettempdir from textwrap import fill, indent +_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') +_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') class OcrdEnvVariable(): @@ -102,8 +104,8 @@ def raw_value(self, name): config.add('OCRD_METS_CACHING', description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', - validator=lambda val: val in ('true', 'false', '0', '1'), - parser=lambda val: val in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) config.add('OCRD_MAX_PROCESSOR_CACHE', description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.", @@ -125,7 +127,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of times to retry failed attempts for downloads of resource or workspace files.", + description="Number of times to retry failed attempts for downloads of resources or workspace files.", validator=int, parser=int) @@ -141,6 +143,12 @@ def _ocrd_download_timeout_parser(val): description="Timeout in seconds for connecting or reading (comma-separated) when downloading.", parser=_ocrd_download_timeout_parser) +config.add("OCRD_DOWNLOAD_INPUT", + description="Whether to download files not present locally during processing", + 
default=(True, True), + validator=_validator_boolean, + parser=_parser_boolean) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) @@ -190,5 +198,5 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_LOGGING_DEBUG", description="Print information about the logging setup to STDERR", default=(True, False), - validator=lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1'), - parser=lambda val: val if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) From 785d60736919590aaa6e2c84a6a487dc46d12468 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 02/12] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..958661f79f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -575,16 +575,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -629,13 +622,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? 
- LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From b12849da6dd4a46dd0d9a121c50f9438cb61d6e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:07:16 +0200 Subject: [PATCH 03/12] Processor.zip_input_files: introduce NonUniqueInputFile exception --- src/ocrd/processor/__init__.py | 4 ++- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++--------- tests/processor/test_processor.py | 6 ++-- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 0b3ce5a56e..b6c1188def 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -1,6 +1,8 @@ from .base import ( Processor, - ResourceNotFoundError + ResourceNotFoundError, + NonUniqueInputFile, + MissingInputFile, ) from .ocrd_page_result import ( OcrdPageResult, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 958661f79f..516989ae28 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -55,9 +55,36 @@ class ResourceNotFoundError(FileNotFoundError): def __init__(self, name, executable): self.name = name self.executable = executable - self.message = "Could not find resource '%s' for executable '%s'. " \ - "Try 'ocrd resmgr download %s %s' to download this resource." \ - % (name, executable, executable, name) + self.message = (f"Could not find resource '{name}' for executable '{executable}'. " + f"Try 'ocrd resmgr download {executable} {name}' to download this resource.") + super().__init__(self.message) + +class NonUniqueInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields multiple PAGE files, or no PAGE files but multiple images, + or multiple files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not determine unique input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") + super().__init__(self.message) + +class MissingInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields no PAGE files, or no PAGE and no image files, + or no files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not find input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) class Processor(): @@ -352,7 +379,6 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc file_path=image_file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) - # FIXME: what about non-PAGE output like JSON ??? self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, @@ -592,9 +618,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "Multiple '%s' matches for page '%s' in fileGrp '%s'." 
% ( - mimetype, file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, mimetype) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) elif (ift[i].mimetype == MIMETYPE_PAGE and @@ -602,9 +626,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pass # keep PAGE match elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype == MIMETYPE_PAGE): - raise ValueError( - "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen if on_error == 'skip': @@ -614,9 +636,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 2cf8a189b4..5d565ea707 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -251,7 +251,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -260,7 +260,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): @@ -281,7 +281,7 @@ def ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 95d36585bf7d97193e56b9143fde0416cd7b799b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:08:15 +0200 Subject: [PATCH 04/12] Processor.process_workspace: zip_input_files w/o require_first --- src/ocrd/processor/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 516989ae28..5becbf8d81 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -314,17 +314,19 @@ def process_workspace(self, workspace: Workspace) -> None: self.verify() try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) - for input_file_tuple in self.zip_input_files(on_error='abort'): + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): # FIXME: add error handling by catching exceptions in various ways (#579) # for example: # - ResourceNotFoundError → use ResourceManager 
to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) + log.info("processing page %s", + next(input_file.pageId + for input_file in input_file_tuple + if input_file)) for i, input_file in enumerate(input_file_tuple): - if i == 0: - log.info("processing page %s", input_file.pageId) - elif input_file is None: + if input_file is None: # file/page not found in this file grp continue input_files[i] = input_file From c7298411bafe74287e15646eaa3b9d20b90c2e65 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 09:33:37 +0200 Subject: [PATCH 05/12] Processor.zip_input_files: introduce MissingInputFile exception and config.OCRD_MISSING_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 20 +++++++++++++++----- src/ocrd_utils/config.py | 6 ++++++ tests/processor/test_processor.py | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index bf262b0b96..418d7927a3 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -31,6 +31,8 @@ \b {config.describe('OCRD_DOWNLOAD_INPUT')} \b +{config.describe('OCRD_MISSING_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec0747428..fddf6383a7 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -338,7 +338,7 @@ def process_workspace(self, workspace: Workspace) -> None: input_files[i] = self.workspace.download_file(input_file) except ValueError as e: log.error(repr(e)) - log.warning("skipping file %s for page %s", input_file, input_file.pageId) + log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") self.process_page_file(*input_files) except NotImplementedError: # fall back to deprecated method @@ -611,10 +611,12 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -633,6 +635,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -644,7 +648,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}") ift[i] 
= file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): @@ -653,8 +657,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: - # other fallback options? - LOG.error(f'Found no page {page} in file group {ifg}') + # could be from non-unique with on_error=skip or from true gap + LOG.error(f'Found no file for page {page} in file group {ifg}') + if config.OCRD_MISSING_INPUT == 'abort': + raise MissingInputFile(ifg, page, mimetype) + if not any(ifiles): + # must be from non-unique with on_error=skip + LOG.warning(f'Found no files for {page} - skipping') + continue if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 22a566e7bc..11af20249f 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -149,6 +149,12 @@ def _ocrd_download_timeout_parser(val): validator=_validator_boolean, parser=_parser_boolean) +config.add("OCRD_MISSING_INPUT", + description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5d565ea707..aa2124001a 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -281,7 +281,7 @@ def ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 577baa529103de170f6b1259ae6b161b281f475c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:06:14 +0200 Subject: [PATCH 06/12] processor parameter decorator: no '{}' default (unnecessary) --- src/ocrd/decorators/parameter_option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 55abbc2a53..2f8be3d868 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -10,7 +10,7 @@ def _handle_param_option(ctx, param, value): parameter_option = option('-p', '--parameter', help="Parameters, either JSON string or path to JSON file", multiple=True, - default=['{}'], + default=[], # now handled in ocrd_cli_wrap_processor to resolve processor preset files # callback=_handle_param_option callback=lambda ctx, param, kv: list(kv)) From f00ecda84717d274126a736b67e3ab29e5bae83d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:07:01 +0200 Subject: [PATCH 07/12] =?UTF-8?q?Processor:=20add=20error=20handling?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduce `config.OCRD_MISSING_OUTPUT` and catch exceptions during `process_page_file`: - `ABORT`: re-raise - `SKIP`: ignore and continue with next page - `COPY`: ignore 
and provide input PAGE-XML as output (with just `@PcGtsId` and `Metadata` added to simulate the processing step) - introduce `config.OCRD_EXISTING_OUTPUT`: - `ABORT`: re-raise FileExistsError - `SKIP`: ignore and continue with next page - `OVERWRITE`: force overwriting the exact output files (instead of removing output files indiscriminately) - :fire: remove `Workspace.overwrite_mode`, have `--overwrite` merely delegate to `config.OCRD_EXISTING_OUTPUT=OVERWRITE` - introduce `--debug`, just delegate to `config.OCRD_MISSING_OUTPUT=ABORT` - `cli.bashlib.input-files`: delegate everything to `ocrd_cli_wrap_processor` (for CLI handling) and `process_workspace` (for error handling), but override `process_page_file` to (never fail and) print bash-friendly strings for actual processing - update tests, add `test_processor.test_run_output_missing` covering all `OCRD_MISSING_OUTPUT` options and the newly `OCRD_EXISTING_OUTPUT=SKIP` --- src/ocrd/cli/__init__.py | 4 + src/ocrd/cli/bashlib.py | 66 +++++++------- src/ocrd/decorators/__init__.py | 22 +---- src/ocrd/decorators/ocrd_cli_options.py | 1 + src/ocrd/processor/base.py | 89 +++++++++++++++---- src/ocrd/processor/builtin/dummy_processor.py | 13 +-- src/ocrd/processor/helpers.py | 8 +- src/ocrd/workspace.py | 14 +-- src/ocrd_utils/config.py | 12 +++ tests/cli/test_bashlib.py | 2 +- tests/data/__init__.py | 39 +++++++- tests/processor/test_processor.py | 37 +++++++- tests/test_workspace.py | 23 +++-- 13 files changed, 224 insertions(+), 106 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 418d7927a3..3722e3c21e 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -33,6 +33,10 @@ \b {config.describe('OCRD_MISSING_INPUT')} \b +{config.describe('OCRD_MISSING_OUTPUT')} +\b +{config.describe('OCRD_EXISTING_OUTPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 2c57bb412a..26139cb48f 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -20,13 +20,16 @@ from ocrd.decorators import ( parameter_option, parameter_override_option, - ocrd_loglevel + ocrd_loglevel, + ocrd_cli_wrap_processor ) from ocrd_utils import ( is_local_filename, get_local_filename, initLogging, - make_file_id + getLogger, + make_file_id, + config ) from ocrd.resolver import Resolver from ocrd.processor import Processor @@ -81,11 +84,15 @@ def bashlib_constants(name): @bashlib_cli.command('input-files') @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) @click.option('-w', '--working-dir', help="Working Directory") -@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT') -@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT') +@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) +@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None) # repeat some other processor options for convenience (will be ignored here) @click.option('-g', '--page-id', help="ID(s) of the pages to process") -@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") +@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n" + "(with '--page-id', remove only those).\n" + "Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE") 
+@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n" + "Short-hand for OCRD_MISSING_OUTPUT=ABORT") @parameter_option @parameter_override_option @ocrd_loglevel @@ -100,37 +107,26 @@ def bashlib_input_files(**kwargs): (The printing format is one associative array initializer per line.) """ - initLogging() - mets = kwargs.pop('mets') - working_dir = kwargs.pop('working_dir') - if is_local_filename(mets) and not isfile(get_local_filename(mets)): - msg = "File does not exist: %s" % mets - raise FileNotFoundError(msg) - resolver = Resolver() - workspace = resolver.workspace_from_url(mets, working_dir) class BashlibProcessor(Processor): @property def ocrd_tool(self): - return {} + return {'executable': '', 'steps': ['']} @property - def executable(self): - return '' - processor = BashlibProcessor(None) - # go half way of the normal run_processor / process_workspace call tree - processor.workspace = workspace - processor.page_id = kwargs['page_id'] - processor.input_file_grp = kwargs['input_file_grp'] - processor.output_file_grp = kwargs['output_file_grp'] - for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): - # ensure all input files exist locally (without persisting them in the METS) - # - this mimics the default behaviour of all Pythonic processors - input_files = [workspace.download_file(input_file) if input_file else None - for input_file in input_files] - for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: - # make this bash-friendly (show initialization for associative array) - if len(input_files) > 1: - # single quotes allow us to preserve the list value inside the alist - print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ') - else: - print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ') - print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp'])) + def version(self): + return '1.0' + # go half way of the normal run_processor / process_workspace call tree + # by just delegating to process_workspace, overriding process_page_file + # to ensure all input files exist locally (without persisting them in the METS) + # and print what needs to be acted on in bash-friendly way + def process_page_file(self, *input_files): + for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: + # make this bash-friendly (show initialization for associative array) + if len(input_files) > 1: + # single quotes allow us to preserve the list value inside the alist + value = ' '.join(str(getattr(res, field)) for res in input_files) + else: + value = str(getattr(input_files[0], field)) + print(f"[{field}]='{value}'", end=' ') + output_file_id = make_file_id(input_files[0], kwargs['output_file_grp']) + print(f"[outputFileId]='{output_file_id}'") + ocrd_cli_wrap_processor(BashlibProcessor, **kwargs) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index d9d1fb69dd..364ef4c847 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + debug=False, resolve_resource=None, show_resource=None, list_resources=False, @@ -117,25 +118,10 @@ def resolve(name): resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - # XXX not possible while 
processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace if overwrite: - workspace.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + if debug: + config.OCRD_MISSING_OUTPUT = 'ABORT' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..e069b3ea81 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -33,6 +33,7 @@ def cli(mets_url): option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), + option('--debug', is_flag=True, default=False), option('--profile', is_flag=True, default=False), option('--profile-file', type=Path(dir_okay=False, writable=True)), parameter_option, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index fddf6383a7..0ec2711f61 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,6 +21,7 @@ import tarfile import io from deprecated import deprecated +from requests import HTTPError from ocrd.workspace import Workspace from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile @@ -317,16 +318,11 @@ def process_workspace(self, workspace: Workspace) -> None: try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - # FIXME: add error handling by catching exceptions in various ways (#579) - # for example: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) - log.info("processing page %s", - next(input_file.pageId - for input_file in input_file_tuple - if input_file)) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + log.info(f"processing page {page_id}") for i, input_file in enumerate(input_file_tuple): if input_file is None: # file/page not found in this file grp @@ -336,14 +332,71 @@ def process_workspace(self, workspace: Workspace) -> None: continue try: input_files[i] = 
self.workspace.download_file(input_file) - except ValueError as e: + except (ValueError, FileNotFoundError, HTTPError) as e: log.error(repr(e)) - log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") - self.process_page_file(*input_files) + log.warning(f"failed downloading file {input_file} for page {page_id}") + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self.process_page_file(*input_files) + except Exception as err: + # we have to be broad here, but want to exclude NotImplementedError + if isinstance(err, NotImplementedError): + raise err + if isinstance(err, FileExistsError): + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # FIXME: re-usable/actionable logging + log.exception(f"Failure on page {page_id}: {err}") + if config.OCRD_MISSING_OUTPUT == 'ABORT': + raise err + if config.OCRD_MISSING_OUTPUT == 'SKIP': + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") except NotImplementedError: # fall back to deprecated method self.process() + def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: + """ + Copy the given ``input_file`` of the :py:attr:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + and add it as if it was a processing result. 
+ """ + log = getLogger('ocrd.processor.base') + input_pcgts : OcrdPage + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + log.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") + try: + input_pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + log.error(f"non-PAGE input for page {input_file.pageId}: {err}") + return + output_file_id = make_file_id(input_file, self.output_file_grp) + input_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(input_pcgts) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, @@ -366,9 +419,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_ = page_from_file(input_file) assert isinstance(page_, OcrdPage) input_pcgts[i] = page_ - except ValueError as e: + except ValueError as err: # not PAGE and not an image to generate PAGE for - log.info("non-PAGE input for page %s: %s", page_id, e) + log.error("non-PAGE input for page %s: %s", page_id, err) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: @@ -380,7 +433,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc image_file_id, self.output_file_grp, page_id=page_id, - file_path=image_file_path) + file_path=image_file_path, + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) self.workspace.add_file(file_id=output_file_id, @@ -388,7 +443,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(result.pcgts)) + content=to_xml(result.pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1b3f7a5aa0..f8890274ae 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -15,7 +15,8 @@ MIME_TO_EXT, MIMETYPE_PAGE, parse_json_string_with_comments, - resource_string + resource_string, + config ) from ocrd_modelfactory import page_from_file @@ -43,14 +44,15 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr local_filename = join(self.output_file_grp, file_id + ext) LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) with open(input_file.local_filename, 'rb') as f: - content = f.read() output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=content) + content=f.read(), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) assert isinstance(pcgts, 
OcrdPage) @@ -63,8 +65,9 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr page_id=input_file.pageId, local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - + content=to_xml(pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) else: if self.parameter['copy_files']: LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index dff14cfca6..08ca0a4683 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -160,6 +160,7 @@ def run_cli( workspace=None, page_id=None, overwrite=None, + debug=None, log_level=None, log_filename=None, input_file_grp=None, @@ -202,6 +203,8 @@ def run_cli( args += ['--parameter', parameter] if overwrite: args += ['--overwrite'] + if debug: + args += ['--debug'] if mets_server_url: args += ['--mets-server-url', mets_server_url] log = getLogger('ocrd.processor.helpers.run_cli') @@ -270,7 +273,10 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) -O, --output-file-grp USE File group(s) used as output -g, --page-id ID Physical page ID(s) to process instead of full document [] --overwrite Remove existing output pages/images - (with "--page-id", remove only those) + (with "--page-id", remove only those). + Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE + --debug Abort on any errors with full stack trace. + Short-hand for OCRD_MISSING_OUTPUT=ABORT --profile Enable profiling --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" -p, --parameter JSON-PATH Parameters, either verbatim JSON string diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 509b8123b9..2f94913ed7 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -42,7 +42,8 @@ MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, - REGEX_PREFIX + REGEX_PREFIX, + config ) from .workspace_backup import WorkspaceBackupManager @@ -75,7 +76,6 @@ class Workspace(): `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to the filesystem directly. baseurl (string, None) : Base URL to prefix to relative URL. - overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( @@ -91,7 +91,6 @@ def __init__( self.resolver = resolver self.directory = directory self.mets_target = str(Path(directory, mets_basename)) - self.overwrite_mode = False self.is_remote = bool(mets_server_url) if mets is None: if self.is_remote: @@ -243,8 +242,6 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if self.overwrite_mode: - force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID try: @@ -296,9 +293,6 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is `True`. 
""" - if not force and self.overwrite_mode: - force = True - if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force): raise Exception("No such fileGrp: %s" % USE) @@ -419,8 +413,6 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") if content is not None and not kwargs.get('local_filename'): raise Exception("'content' was set but no 'local_filename'") - if self.overwrite_mode: - kwargs['force'] = True with pushd_popd(self.directory): if kwargs.get('local_filename'): @@ -1101,8 +1093,6 @@ def save_image_file(self, image : Image.Image, The (absolute) path of the created file. """ log = getLogger('ocrd.workspace.save_image_file') - if self.overwrite_mode: - force = True saveargs = dict() if 'dpi' in image.info: saveargs['dpi'] = image.info['dpi'] diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 11af20249f..fa4c34d63b 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -155,6 +155,18 @@ def _ocrd_download_timeout_parser(val): validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) +config.add("OCRD_MISSING_OUTPUT", + description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], + parser=str) + +config.add("OCRD_EXISTING_OUTPUT", + description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 15af493502..b1ab68c7fc 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -98,7 +98,7 @@ def test_constants_fail(self): def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): - _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) + _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG', '-O', 'OUTPUT']) assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " "[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out diff --git a/tests/data/__init__.py b/tests/data/__init__.py index e7ef30fc2b..53fa227d01 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,8 +1,9 @@ import json import os +import re from pytest import warns from ocrd import Processor -from ocrd_utils import make_file_id +from ocrd_utils import make_file_id, config DUMMY_TOOL = { 'executable': 'ocrd-test', @@ -94,7 +95,41 @@ def process(self): page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), - content='CONTENT') + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + +class DummyProcessorWithOutputFailures(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, 
**kwargs) + + # no error handling with old process(), so override new API + def process_page_file(self, input_file): + n = int(re.findall(r'\d+', input_file.pageId)[-1]) + if n % 2: + raise Exception(f"intermittent failure on page {input_file.pageId}") + output_file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id), + mimetype=input_file.mimetype, + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) class IncompleteProcessor(Processor): @property diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index aa2124001a..064142574e 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -5,7 +5,13 @@ from pathlib import Path from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module -from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.data import ( + DummyProcessor, + DummyProcessorWithRequiredParameters, + DummyProcessorWithOutput, + DummyProcessorWithOutputFailures, + IncompleteProcessor +) from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver @@ -145,20 +151,43 @@ def test_run_output0(self): output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_missing(self): + ws = self.workspace + from ocrd_utils import config + config.OCRD_MISSING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + # only half succeed + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 + config.OCRD_MISSING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "intermittent" in str(exc.value) + config.OCRD_MISSING_OUTPUT = 'COPY' + config.OCRD_EXISTING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') - ws.overwrite_mode = True + from ocrd_utils import config + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') - ws.overwrite_mode = False + config.OCRD_EXISTING_OUTPUT = 'ABORT' with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" - ws.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") diff --git 
a/tests/test_workspace.py b/tests/test_workspace.py index 2fe5f450a0..1ae007ae52 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -270,9 +270,9 @@ def test_remove_file_force(sbb_data_workspace): # TODO check semantics - can a non-existent thing be removed? assert not sbb_data_workspace.remove_file('non-existing-id', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file('non-existing-id', force=False) + with pytest.raises(FileNotFoundError) as not_found_exc: + sbb_data_workspace.remove_file('non-existing-id', force=False) + assert "not found in METS" in str(not_found_exc.value) def test_remove_file_remote_not_available_raises_exception(plain_workspace): @@ -292,9 +292,9 @@ def test_remove_file_remote(plain_workspace): assert plain_workspace.remove_file('page1_img', force=True) # TODO check returned value - # should also "succeed", because overwrite_mode is set which also sets 'force' to 'True' - plain_workspace.overwrite_mode = True - assert not plain_workspace.remove_file('page1_img') + with pytest.raises(FileNotFoundError) as not_found_exc: + plain_workspace.remove_file('page1_img') + assert "not found in METS" in str(not_found_exc.value) def test_rename_file_group(tmp_path): @@ -341,9 +341,6 @@ def test_remove_file_group_force(sbb_data_workspace): # check function and tests semantics # should succeed assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=False) def test_remove_file_group_rmdir(sbb_data_tmp, sbb_data_workspace): @@ -432,9 +429,11 @@ def test_save_image_file(plain_workspace): assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg')) # should succeed assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True) - # should also succeed - plain_workspace.overwrite_mode = True - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + # should fail + with pytest.raises(FileExistsError) as exists_exc: + plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + assert "neither force nor ignore are set" in str(exists_exc.value) + # check file_path kwarg assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png') assert exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png')) From fdd5d168d0753caad8a19efd49884c52d7934183 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:01:31 +0200 Subject: [PATCH 08/12] ocrd_utils.config: add variables to module docstring --- src/ocrd_utils/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index fa4c34d63b..28f95b2162 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -62,7 +62,11 @@ def __init__(self): self._variables = {} def add(self, name, *args, **kwargs): - self._variables[name] = OcrdEnvVariable(name, *args, **kwargs) + var = OcrdEnvVariable(name, *args, **kwargs) + # make visible in ocrd_utils.config docstring (apidoc) + txt = var.describe(wrap_text=False, indent_text=True) + globals()['__doc__'] += "\n\n - " + txt + "\n\n" + self._variables[name] = var return self._variables[name] def has_default(self, name): From 
6d87f9e6494a0768541e5b18ae557fd594d8319b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:02:36 +0200 Subject: [PATCH 09/12] improve docstrings, re-generate docs --- .../ocrd/ocrd.processor.ocrd_page_result.rst | 7 ++ docs/api/ocrd/ocrd.processor.rst | 1 + src/ocrd/cli/validate.py | 4 +- src/ocrd/cli/workspace.py | 1 + src/ocrd/processor/base.py | 75 ++++++++++--------- src/ocrd/workspace.py | 2 - src/ocrd_models/ocrd_exif.py | 1 + src/ocrd_models/ocrd_mets.py | 6 +- src/ocrd_utils/config.py | 28 ++++++- 9 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 docs/api/ocrd/ocrd.processor.ocrd_page_result.rst diff --git a/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst new file mode 100644 index 0000000000..e13d50e155 --- /dev/null +++ b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst @@ -0,0 +1,7 @@ +ocrd.processor.ocrd\_page\_result module +======================================== + +.. automodule:: ocrd.processor.ocrd_page_result + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd/ocrd.processor.rst b/docs/api/ocrd/ocrd.processor.rst index 801114d2a3..7507d8439b 100644 --- a/docs/api/ocrd/ocrd.processor.rst +++ b/docs/api/ocrd/ocrd.processor.rst @@ -22,3 +22,4 @@ Submodules ocrd.processor.base ocrd.processor.helpers + ocrd.processor.ocrd_page_result diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d053..61d26988a4 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -40,7 +40,7 @@ def validate_cli(): @click.argument('ocrd_tool', required=False, nargs=1) def validate_ocrd_tool(ocrd_tool): ''' - Validate OCRD_TOOL as an ocrd-tool.json file. + Validate OCRD_TOOL as an `ocrd-tool.json` file. ''' if not ocrd_tool: ocrd_tool = 'ocrd-tool.json' @@ -107,7 +107,7 @@ def validate_page(page, **kwargs): @click.argument('tasks', nargs=-1, required=True) def validate_process(tasks, workspace, mets_basename, overwrite, page_id): ''' - Validate a sequence of tasks passable to 'ocrd process' + Validate a sequence of tasks passable to `ocrd process` ''' if workspace: _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a36..e2186a727c 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -308,6 +308,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\ } | ocrd workspace bulk-add -r '(?P.*) (?P.*) (?P.*) (?P.*)' \\ -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - + """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name workspace = Workspace( diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec2711f61..d53c3da0bf 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -144,22 +144,21 @@ def __init__( version=None ): """ - Instantiate, but do not process. Unless ``list_resources`` or - ``show_resource`` or ``show_help`` or ``show_version`` or - ``dump_json`` or ``dump_module_dir`` is true, setup for processing - (parsing and validating parameters, entering the workspace directory). + Instantiate, but do not setup (neither for processing nor other usage). + If given, do parse and validate :py:data:`.parameter`. Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. 
\ + If not ``None``, then `chdir` to that directory. Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \ + input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \ + output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ @@ -287,29 +286,32 @@ def setup(self) -> None: """ pass - @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ - Process all files of the :py:attr:`workspace` - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + Process all files of the :py:data:`workspace` + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. - (This contains the main functionality and needs to be overridden by subclasses.) + (This contains the main functionality and needs to be + overridden by subclasses.) """ raise NotImplementedError() def process_workspace(self, workspace: Workspace) -> None: """ Process all files of the given ``workspace``, - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. (This will iterate over pages and files, calling - :py:meth:`process_page`, handling exceptions.) + :py:meth:`.process_page_file` and handling exceptions. + It should be overridden by subclasses to handle cases + like post-processing or computation across pages.) """ log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): @@ -370,7 +372,7 @@ def process_workspace(self, workspace: Workspace) -> None: def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: """ - Copy the given ``input_file`` of the :py:attr:`workspace`, + Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened :py:class:`~ocrd_models.OcrdFile` per input fileGrp) and add it as if it was a processing result. 
@@ -399,14 +401,14 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc
     def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
         """
-        Process the given ``input_files`` of the :py:attr:`workspace`,
+        Process the given ``input_files`` of the :py:data:`workspace`,
         representing one physical page (passed as one opened
-        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
-        under the given :py:attr:`parameter`, and make sure the
+        :py:class:`.OcrdFile` per input fileGrp)
+        under the given :py:data:`.parameter`, and make sure the
         results get added accordingly.
 
-        (This uses process_page_pcgts, but can be overridden by subclasses
-        to handle cases like multiple fileGrps, non-PAGE input etc.)
+        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
+        to handle cases like multiple output fileGrps, non-PAGE input etc.)
         """
         log = getLogger('ocrd.processor.base')
         input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
@@ -449,28 +451,28 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc
 
     def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
         """
-        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
+        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
         representing one physical page (passed as one parsed
-        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
-        under the given :py:attr:`parameter`, and return the
-        resulting :py:class:`~ocrd.processor.OcrdPageResult`.
+        :py:class:`.OcrdPage` per input fileGrp)
+        under the given :py:data:`.parameter`, and return the
+        resulting :py:class:`.OcrdPageResult`.
 
         Optionally, add to the ``images`` attribute of the resulting
-        :py:class:`~ocrd.processor.OcrdPageResult` instances
-        of :py:class:`~ocrd.processor.OcrdPageResultImage`,
+        :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`,
         which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
         ``file_id_suffix`` (used for generating IDs of the saved image) and
         ``alternative_image`` (reference of the
         :py:class:`ocrd_models.ocrd_page.AlternativeImageType` for setting the filename of the saved image).
 
-        (This contains the main functionality and must be overridden by subclasses.)
+        (This contains the main functionality and must be overridden by subclasses,
+        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
         """
         raise NotImplementedError()
 
     def add_metadata(self, pcgts: OcrdPage) -> None:
         """
         Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
-        the processing step and runtime parameters to :py:class:`~ocrd_models.OcrdPage` ``pcgts``.
+        the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``.
         """
         metadata_obj = pcgts.get_Metadata()
         assert metadata_obj is not None
@@ -496,7 +498,7 @@ def add_metadata(self, pcgts: OcrdPage) -> None:
     def resolve_resource(self, val):
         """
         Resolve a resource name to an absolute file path with the algorithm in
-        https://ocr-d.de/en/spec/ocrd_tool#file-parameters
+        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_
 
         Args:
             val (string): resource value to resolve
@@ -522,7 +524,7 @@ def resolve_resource(self, val):
     def show_resource(self, val):
         """
         Resolve a resource name to a file path with the algorithm in
-        https://ocr-d.de/en/spec/ocrd_tool#file-parameters,
+        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
         then print its contents to stdout.
Args: @@ -593,7 +595,8 @@ def input_files(self): files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) - Algorithm _ + + See `algorithm `_ Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. @@ -635,11 +638,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): - if ``last``, then the last matching file for the page will be silently selected (as if the last was the only match) - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. Keyword Args: require_first (boolean): If true, then skip a page entirely whenever it is not available in the first input `fileGrp`. + on_error (string): How to handle multiple file matches per page. mimetype (string): If not `None`, filter by the specified MIME type (literal or regex prefixed by `//`). Otherwise prefer PAGE or image. diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 2f94913ed7..3523d9f15f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -606,7 +606,6 @@ def image_from_page(self, page, page_id, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to ``fill``: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible @@ -850,7 +849,6 @@ def image_from_segment(self, segment, parent_image, parent_coords, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to `fill`: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85a..82b8b7e1c3 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -21,6 +21,7 @@ class OcrdExif(): * ``RGB`` for 24-bit truecolor, * ``I`` for 32-bit signed integer grayscale, * ``F`` for floating-point grayscale + (see PIL concept **mode**) resolution (int): pixel density xResolution (int): pixel density diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..4d1e6cba58 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -194,7 +194,7 @@ def unique_identifier(self, purl : str) -> None: @property def agents(self) -> List[OcrdAgent]: """ - List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s + List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent` entries. """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] @@ -218,7 +218,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: @property def file_groups(self) -> List[str]: """ - List the `@USE` of all `mets:fileGrp` entries. + List the ``@USE`` of all ``mets:fileGrp`` entries. """ # WARNING: Actually we cannot return strings in place of elements! @@ -894,7 +894,7 @@ def merge(self, other_mets, force : bool = False, Add all files from other_mets. 
Accepts the same kwargs as :py:func:`find_files` Keyword Args: - force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) + force (boolean): Whether to do :py:meth:`add_file` with ``force`` (overwriting existing ``mets:file`` entries) fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 28f95b2162..851fb42a8c 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -120,9 +120,11 @@ def raw_value(self, name): description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): + - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) + """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -154,19 +156,39 @@ def _ocrd_download_timeout_parser(val): parser=_parser_boolean) config.add("OCRD_MISSING_INPUT", - description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + description="""\ +How to deal with missing input files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed with next page's input + - `ABORT`: throw :py:class:`.MissingInputFile` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) config.add("OCRD_MISSING_OUTPUT", - description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + description="""\ +How to deal with missing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `COPY`: fall back to copying input PAGE to output fileGrp for page + - `ABORT`: re-throw whatever caused processing to fail + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], parser=str) config.add("OCRD_EXISTING_OUTPUT", - description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + description="""\ +How to deal with already existing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `OVERWRITE`: force writing result to output fileGrp for page + - `ABORT`: re-throw :py:class:`FileExistsError` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], parser=str) From 9942bbe6dc42246c0a7e6eda85444aa0f745face Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 10/12] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d53c3da0bf..55b4619422 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -714,7 +714,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any 
files for selected pageId {self.page_id}") + LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\nCompare '{self.page_id}' with the output of 'ocrd workspace list-page'.") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 8a584e9dcc5794baa9e08556a943bc7e9eb9991f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:07:05 +0200 Subject: [PATCH 11/12] test_processor: test for specific exception Co-authored-by: Konstantin Baierer --- tests/processor/test_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 064142574e..c263d99fce 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -280,7 +280,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "Could not determine unique input file"): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -289,7 +289,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Could not determine unique input file"): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): From 8077d45056c9d2682bee5bb5017f79eb0a7b336a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 19:54:26 +0200 Subject: [PATCH 12/12] test_processor: fix missing import --- tests/processor/test_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index c263d99fce..0cbae7d548 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -15,7 +15,7 @@ from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver -from ocrd.processor.base import Processor, run_processor, run_cli +from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile from unittest import mock import pytest
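With the last two patches combined, callers can catch a dedicated exception for ambiguous per-page input instead of a bare ``Exception``. A hedged sketch of the resulting usage pattern (the helper function is hypothetical, not part of the patches):

    # Hypothetical helper, not from these patches: fall back to lenient matching
    # only when strict matching detects an ambiguity.
    from ocrd.processor import Processor, NonUniqueInputFile


    def zip_input_files_tolerant(proc: Processor):
        """Return per-page input file tuples, degrading ambiguous matches to None."""
        try:
            # strict: abort as soon as two files with the same MIME type
            # match one page in one input fileGrp
            return proc.zip_input_files(on_error='abort')
        except NonUniqueInputFile:
            # lenient: non-PAGE ambiguities become None entries; duplicate
            # PAGE-XML for a page still raises, as documented above
            return proc.zip_input_files(on_error='skip')

This mirrors the assertions updated in ``tests/processor/test_processor.py`` above.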