From cd4c96c94c6424628de2ccf2eb503d6933eaad9d Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 19 Aug 2024 14:17:14 +0200 Subject: [PATCH 01/12] add config.OCRD_DOWNLOAD_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 6 ++++-- src/ocrd_utils/config.py | 18 +++++++++++++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 322cbde19f..bf262b0b96 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -29,6 +29,8 @@ \b {config.describe('OCRD_DOWNLOAD_TIMEOUT')} \b +{config.describe('OCRD_DOWNLOAD_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..78f8b12374 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -29,6 +29,7 @@ VERSION as OCRD_VERSION, MIMETYPE_PAGE, MIME_TO_EXT, + config, getLogger, initLogging, list_resource_candidates, @@ -111,7 +112,7 @@ def __init__( input_file_grp=None, output_file_grp=None, page_id=None, - download_files=True, + download_files=config.OCRD_DOWNLOAD_INPUT, version=None ): """ @@ -137,7 +138,8 @@ def __init__( (or empty for all pages). \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - download_files (boolean): Whether input files will be downloaded prior to processing. + download_files (boolean): Whether input files will be downloaded prior to processing, \ + defaults to :py:attr:`ocrd_utils.config.OCRD_DOWNLOAD_INPUT` which is ``True`` by default """ if ocrd_tool is not None: deprecation_warning("Passing 'ocrd_tool' as keyword argument to Processor is deprecated - " diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d0955a8dcf..22a566e7bc 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -12,6 +12,8 @@ from tempfile import gettempdir from textwrap import fill, indent +_validator_boolean = lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') +_parser_boolean = lambda val: bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') class OcrdEnvVariable(): @@ -102,8 +104,8 @@ def raw_value(self, name): config.add('OCRD_METS_CACHING', description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', - validator=lambda val: val in ('true', 'false', '0', '1'), - parser=lambda val: val in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) config.add('OCRD_MAX_PROCESSOR_CACHE', description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.", @@ -125,7 +127,7 @@ def raw_value(self, name): description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz") config.add("OCRD_DOWNLOAD_RETRIES", - description="Number of times to retry failed attempts for downloads of resource or workspace files.", + description="Number of times to retry failed attempts for downloads of resources or workspace files.", validator=int, parser=int) @@ -141,6 +143,12 @@ def _ocrd_download_timeout_parser(val): description="Timeout in seconds for connecting or reading (comma-separated) when downloading.", parser=_ocrd_download_timeout_parser) +config.add("OCRD_DOWNLOAD_INPUT", + description="Whether to download files not present locally during processing", + 
default=(True, True), + validator=_validator_boolean, + parser=_parser_boolean) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) @@ -190,5 +198,5 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_LOGGING_DEBUG", description="Print information about the logging setup to STDERR", default=(True, False), - validator=lambda val: isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1'), - parser=lambda val: val if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1')) + validator=_validator_boolean, + parser=_parser_boolean) From 785d60736919590aaa6e2c84a6a487dc46d12468 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 02/12] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 344569677e..958661f79f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -575,16 +575,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -629,13 +622,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? 
- LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From b12849da6dd4a46dd0d9a121c50f9438cb61d6e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:07:16 +0200 Subject: [PATCH 03/12] Processor.zip_input_files: introduce NonUniqueInputFile exception --- src/ocrd/processor/__init__.py | 4 ++- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++--------- tests/processor/test_processor.py | 6 ++-- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 0b3ce5a56e..b6c1188def 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -1,6 +1,8 @@ from .base import ( Processor, - ResourceNotFoundError + ResourceNotFoundError, + NonUniqueInputFile, + MissingInputFile, ) from .ocrd_page_result import ( OcrdPageResult, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 958661f79f..516989ae28 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -55,9 +55,36 @@ class ResourceNotFoundError(FileNotFoundError): def __init__(self, name, executable): self.name = name self.executable = executable - self.message = "Could not find resource '%s' for executable '%s'. " \ - "Try 'ocrd resmgr download %s %s' to download this resource." \ - % (name, executable, executable, name) + self.message = (f"Could not find resource '{name}' for executable '{executable}'. " + f"Try 'ocrd resmgr download {executable} {name}' to download this resource.") + super().__init__(self.message) + +class NonUniqueInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields multiple PAGE files, or no PAGE files but multiple images, + or multiple files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not determine unique input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") + super().__init__(self.message) + +class MissingInputFile(ValueError): + """ + An exception signifying the specified fileGrp / pageId / mimetype + selector yields no PAGE files, or no PAGE and no image files, + or no files of that mimetype. + """ + def __init__(self, fileGrp, pageId, mimetype): + self.fileGrp = fileGrp + self.pageId = pageId + self.mimetype = mimetype + self.message = (f"Could not find input file for fileGrp {fileGrp} " + f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) class Processor(): @@ -352,7 +379,6 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc file_path=image_file_path) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) - # FIXME: what about non-PAGE output like JSON ??? self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, @@ -592,9 +618,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "Multiple '%s' matches for page '%s' in fileGrp '%s'." 
% ( - mimetype, file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, mimetype) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) elif (ift[i].mimetype == MIMETYPE_PAGE and @@ -602,9 +626,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pass # keep PAGE match elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype == MIMETYPE_PAGE): - raise ValueError( - "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen if on_error == 'skip': @@ -614,9 +636,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': - raise ValueError( - "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % ( - file_.pageId, ifg)) + raise NonUniqueInputFile(ifg, file_.pageId, None) else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 2cf8a189b4..5d565ea707 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -251,7 +251,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -260,7 +260,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"): + with self.assertRaisesRegex(Exception, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): @@ -281,7 +281,7 @@ def ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 95d36585bf7d97193e56b9143fde0416cd7b799b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:08:15 +0200 Subject: [PATCH 04/12] Processor.process_workspace: zip_input_files w/o require_first --- src/ocrd/processor/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 516989ae28..5becbf8d81 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -314,17 +314,19 @@ def process_workspace(self, workspace: Workspace) -> None: self.verify() try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) - for input_file_tuple in self.zip_input_files(on_error='abort'): + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): # FIXME: add error handling by catching exceptions in various ways (#579) # for example: # - ResourceNotFoundError → use ResourceManager 
to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) + log.info("processing page %s", + next(input_file.pageId + for input_file in input_file_tuple + if input_file)) for i, input_file in enumerate(input_file_tuple): - if i == 0: - log.info("processing page %s", input_file.pageId) - elif input_file is None: + if input_file is None: # file/page not found in this file grp continue input_files[i] = input_file From c7298411bafe74287e15646eaa3b9d20b90c2e65 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 09:33:37 +0200 Subject: [PATCH 05/12] Processor.zip_input_files: introduce MissingInputFile exception and config.OCRD_MISSING_INPUT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd/processor/base.py | 20 +++++++++++++++----- src/ocrd_utils/config.py | 6 ++++++ tests/processor/test_processor.py | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index bf262b0b96..418d7927a3 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -31,6 +31,8 @@ \b {config.describe('OCRD_DOWNLOAD_INPUT')} \b +{config.describe('OCRD_MISSING_INPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec0747428..fddf6383a7 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -338,7 +338,7 @@ def process_workspace(self, workspace: Workspace) -> None: input_files[i] = self.workspace.download_file(input_file) except ValueError as e: log.error(repr(e)) - log.warning("skipping file %s for page %s", input_file, input_file.pageId) + log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") self.process_page_file(*input_files) except NotImplementedError: # fall back to deprecated method @@ -611,10 +611,12 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -633,6 +635,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen + LOG.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': @@ -644,7 +648,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: raise Exception("Unknown 'on_error' strategy '%s'" % on_error) else: - LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) + LOG.debug(f"adding file {file_.ID} for page {file_.pageId} to input file group {ifg}") ift[i] 
= file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): @@ -653,8 +657,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: - # other fallback options? - LOG.error(f'Found no page {page} in file group {ifg}') + # could be from non-unique with on_error=skip or from true gap + LOG.error(f'Found no file for page {page} in file group {ifg}') + if config.OCRD_MISSING_INPUT == 'abort': + raise MissingInputFile(ifg, page, mimetype) + if not any(ifiles): + # must be from non-unique with on_error=skip + LOG.warning(f'Found no files for {page} - skipping') + continue if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 22a566e7bc..11af20249f 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -149,6 +149,12 @@ def _ocrd_download_timeout_parser(val): validator=_validator_boolean, parser=_parser_boolean) +config.add("OCRD_MISSING_INPUT", + description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5d565ea707..aa2124001a 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -281,7 +281,7 @@ def ocrd_tool(self): proc.page_id = page_id assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no file for page phys_0001 in file group GRP1' in r.err if __name__ == "__main__": main(__file__) From 577baa529103de170f6b1259ae6b161b281f475c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:06:14 +0200 Subject: [PATCH 06/12] processor parameter decorator: no '{}' default (unnecessary) --- src/ocrd/decorators/parameter_option.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 55abbc2a53..2f8be3d868 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -10,7 +10,7 @@ def _handle_param_option(ctx, param, value): parameter_option = option('-p', '--parameter', help="Parameters, either JSON string or path to JSON file", multiple=True, - default=['{}'], + default=[], # now handled in ocrd_cli_wrap_processor to resolve processor preset files # callback=_handle_param_option callback=lambda ctx, param, kv: list(kv)) From f00ecda84717d274126a736b67e3ab29e5bae83d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 00:07:01 +0200 Subject: [PATCH 07/12] =?UTF-8?q?Processor:=20add=20error=20handling?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduce `config.OCRD_MISSING_OUTPUT` and catch exceptions during `process_page_file`: - `ABORT`: re-raise - `SKIP`: ignore and continue with next page - `COPY`: ignore 
and provide input PAGE-XML as output (with just `@PcGtsId` and `Metadata` added to simulate the processing step) - introduce `config.OCRD_EXISTING_OUTPUT`: - `ABORT`: re-raise FileExistsError - `SKIP`: ignore and continue with next page - `OVERWRITE`: force overwriting the exact output files (instead of removing output files indiscriminately) - :fire: remove `Workspace.overwrite_mode`, have `--overwrite` merely delegate to `config.OCRD_EXISTING_OUTPUT=OVERWRITE` - introduce `--debug`, just delegate to `config.OCRD_MISSING_OUTPUT=ABORT` - `cli.bashlib.input-files`: delegate everything to `ocrd_cli_wrap_processor` (for CLI handling) and `process_workspace` (for error handling), but override `process_page_file` to (never fail and) print bash-friendly strings for actual processing - update tests, add `test_processor.test_run_output_missing` covering all `OCRD_MISSING_OUTPUT` options and the newly `OCRD_EXISTING_OUTPUT=SKIP` --- src/ocrd/cli/__init__.py | 4 + src/ocrd/cli/bashlib.py | 66 +++++++------- src/ocrd/decorators/__init__.py | 22 +---- src/ocrd/decorators/ocrd_cli_options.py | 1 + src/ocrd/processor/base.py | 89 +++++++++++++++---- src/ocrd/processor/builtin/dummy_processor.py | 13 +-- src/ocrd/processor/helpers.py | 8 +- src/ocrd/workspace.py | 14 +-- src/ocrd_utils/config.py | 12 +++ tests/cli/test_bashlib.py | 2 +- tests/data/__init__.py | 39 +++++++- tests/processor/test_processor.py | 37 +++++++- tests/test_workspace.py | 23 +++-- 13 files changed, 224 insertions(+), 106 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 418d7927a3..3722e3c21e 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -33,6 +33,10 @@ \b {config.describe('OCRD_MISSING_INPUT')} \b +{config.describe('OCRD_MISSING_OUTPUT')} +\b +{config.describe('OCRD_EXISTING_OUTPUT')} +\b {config.describe('OCRD_METS_CACHING')} \b {config.describe('OCRD_MAX_PROCESSOR_CACHE')} diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py index 2c57bb412a..26139cb48f 100644 --- a/src/ocrd/cli/bashlib.py +++ b/src/ocrd/cli/bashlib.py @@ -20,13 +20,16 @@ from ocrd.decorators import ( parameter_option, parameter_override_option, - ocrd_loglevel + ocrd_loglevel, + ocrd_cli_wrap_processor ) from ocrd_utils import ( is_local_filename, get_local_filename, initLogging, - make_file_id + getLogger, + make_file_id, + config ) from ocrd.resolver import Resolver from ocrd.processor import Processor @@ -81,11 +84,15 @@ def bashlib_constants(name): @bashlib_cli.command('input-files') @click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME) @click.option('-w', '--working-dir', help="Working Directory") -@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT') -@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT') +@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None) +@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None) # repeat some other processor options for convenience (will be ignored here) @click.option('-g', '--page-id', help="ID(s) of the pages to process") -@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist") +@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n" + "(with '--page-id', remove only those).\n" + "Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE") 
+@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n" + "Short-hand for OCRD_MISSING_OUTPUT=ABORT") @parameter_option @parameter_override_option @ocrd_loglevel @@ -100,37 +107,26 @@ def bashlib_input_files(**kwargs): (The printing format is one associative array initializer per line.) """ - initLogging() - mets = kwargs.pop('mets') - working_dir = kwargs.pop('working_dir') - if is_local_filename(mets) and not isfile(get_local_filename(mets)): - msg = "File does not exist: %s" % mets - raise FileNotFoundError(msg) - resolver = Resolver() - workspace = resolver.workspace_from_url(mets, working_dir) class BashlibProcessor(Processor): @property def ocrd_tool(self): - return {} + return {'executable': '', 'steps': ['']} @property - def executable(self): - return '' - processor = BashlibProcessor(None) - # go half way of the normal run_processor / process_workspace call tree - processor.workspace = workspace - processor.page_id = kwargs['page_id'] - processor.input_file_grp = kwargs['input_file_grp'] - processor.output_file_grp = kwargs['output_file_grp'] - for input_files in processor.zip_input_files(mimetype=None, on_error='abort'): - # ensure all input files exist locally (without persisting them in the METS) - # - this mimics the default behaviour of all Pythonic processors - input_files = [workspace.download_file(input_file) if input_file else None - for input_file in input_files] - for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: - # make this bash-friendly (show initialization for associative array) - if len(input_files) > 1: - # single quotes allow us to preserve the list value inside the alist - print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ') - else: - print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ') - print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp'])) + def version(self): + return '1.0' + # go half way of the normal run_processor / process_workspace call tree + # by just delegating to process_workspace, overriding process_page_file + # to ensure all input files exist locally (without persisting them in the METS) + # and print what needs to be acted on in bash-friendly way + def process_page_file(self, *input_files): + for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']: + # make this bash-friendly (show initialization for associative array) + if len(input_files) > 1: + # single quotes allow us to preserve the list value inside the alist + value = ' '.join(str(getattr(res, field)) for res in input_files) + else: + value = str(getattr(input_files[0], field)) + print(f"[{field}]='{value}'", end=' ') + output_file_id = make_file_id(input_files[0], kwargs['output_file_grp']) + print(f"[outputFileId]='{output_file_id}'") + ocrd_cli_wrap_processor(BashlibProcessor, **kwargs) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index d9d1fb69dd..364ef4c847 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + debug=False, resolve_resource=None, show_resource=None, list_resources=False, @@ -117,25 +118,10 @@ def resolve(name): resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url) workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url) page_id = kwargs.get('page_id') - # XXX not possible while 
processors do not adhere to # https://github.com/OCR-D/core/issues/505 - # if overwrite - # if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']: - # raise Exception("--overwrite requires --output-file-grp") - # LOG.info("Removing files because of --overwrite") - # for grp in kwargs['output_file_grp'].split(','): - # if page_id: - # for one_page_id in kwargs['page_id'].split(','): - # LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id) - # for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp): - # workspace.remove_file(file, force=True, keep_file=False, page_recursive=True) - # else: - # LOG.debug("Removing all files in output file group %s ", grp) - # # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors) - # workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False) - # workspace.save_mets() - # XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace if overwrite: - workspace.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' + if debug: + config.OCRD_MISSING_OUTPUT = 'ABORT' report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..e069b3ea81 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -33,6 +33,7 @@ def cli(mets_url): option('-O', '--output-file-grp', default=None), option('-g', '--page-id'), option('--overwrite', is_flag=True, default=False), + option('--debug', is_flag=True, default=False), option('--profile', is_flag=True, default=False), option('--profile-file', type=Path(dir_okay=False, writable=True)), parameter_option, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index fddf6383a7..0ec2711f61 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,6 +21,7 @@ import tarfile import io from deprecated import deprecated +from requests import HTTPError from ocrd.workspace import Workspace from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile @@ -317,16 +318,11 @@ def process_workspace(self, workspace: Workspace) -> None: try: # FIXME: add page parallelization by running multiprocessing.Pool (#322) for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - # FIXME: add error handling by catching exceptions in various ways (#579) - # for example: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) - log.info("processing page %s", - next(input_file.pageId - for input_file in input_file_tuple - if input_file)) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + log.info(f"processing page {page_id}") for i, input_file in enumerate(input_file_tuple): if input_file is None: # file/page not found in this file grp @@ -336,14 +332,71 @@ def process_workspace(self, workspace: Workspace) -> None: continue try: input_files[i] = 
self.workspace.download_file(input_file) - except ValueError as e: + except (ValueError, FileNotFoundError, HTTPError) as e: log.error(repr(e)) - log.warning(f"failed downloading file {input_file} for page {input_file.pageId}") - self.process_page_file(*input_files) + log.warning(f"failed downloading file {input_file} for page {page_id}") + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self.process_page_file(*input_files) + except Exception as err: + # we have to be broad here, but want to exclude NotImplementedError + if isinstance(err, NotImplementedError): + raise err + if isinstance(err, FileExistsError): + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # FIXME: re-usable/actionable logging + log.exception(f"Failure on page {page_id}: {err}") + if config.OCRD_MISSING_OUTPUT == 'ABORT': + raise err + if config.OCRD_MISSING_OUTPUT == 'SKIP': + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") except NotImplementedError: # fall back to deprecated method self.process() + def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: + """ + Copy the given ``input_file`` of the :py:attr:`workspace`, + representing one physical page (passed as one opened + :py:class:`~ocrd_models.OcrdFile` per input fileGrp) + and add it as if it was a processing result. 
+ """ + log = getLogger('ocrd.processor.base') + input_pcgts : OcrdPage + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) + log.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") + try: + input_pcgts = page_from_file(input_file) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + log.error(f"non-PAGE input for page {input_file.pageId}: {err}") + return + output_file_id = make_file_id(input_file, self.output_file_grp) + input_pcgts.set_pcGtsId(output_file_id) + self.add_metadata(input_pcgts) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), + mimetype=MIMETYPE_PAGE, + content=to_xml(input_pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, @@ -366,9 +419,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_ = page_from_file(input_file) assert isinstance(page_, OcrdPage) input_pcgts[i] = page_ - except ValueError as e: + except ValueError as err: # not PAGE and not an image to generate PAGE for - log.info("non-PAGE input for page %s: %s", page_id, e) + log.error("non-PAGE input for page %s: %s", page_id, err) output_file_id = make_file_id(input_files[0], self.output_file_grp) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: @@ -380,7 +433,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc image_file_id, self.output_file_grp, page_id=page_id, - file_path=image_file_path) + file_path=image_file_path, + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) result.pcgts.set_pcGtsId(output_file_id) self.add_metadata(result.pcgts) self.workspace.add_file(file_id=output_file_id, @@ -388,7 +443,9 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(result.pcgts)) + content=to_xml(result.pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 1b3f7a5aa0..f8890274ae 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -15,7 +15,8 @@ MIME_TO_EXT, MIMETYPE_PAGE, parse_json_string_with_comments, - resource_string + resource_string, + config ) from ocrd_modelfactory import page_from_file @@ -43,14 +44,15 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr local_filename = join(self.output_file_grp, file_id + ext) LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) with open(input_file.local_filename, 'rb') as f: - content = f.read() output_file = self.workspace.add_file( file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, - content=content) + content=f.read(), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) assert isinstance(pcgts, 
OcrdPage) @@ -63,8 +65,9 @@ def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcr page_id=input_file.pageId, local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - + content=to_xml(pcgts), + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) else: if self.parameter['copy_files']: LOG.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index dff14cfca6..08ca0a4683 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -160,6 +160,7 @@ def run_cli( workspace=None, page_id=None, overwrite=None, + debug=None, log_level=None, log_filename=None, input_file_grp=None, @@ -202,6 +203,8 @@ def run_cli( args += ['--parameter', parameter] if overwrite: args += ['--overwrite'] + if debug: + args += ['--debug'] if mets_server_url: args += ['--mets-server-url', mets_server_url] log = getLogger('ocrd.processor.helpers.run_cli') @@ -270,7 +273,10 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) -O, --output-file-grp USE File group(s) used as output -g, --page-id ID Physical page ID(s) to process instead of full document [] --overwrite Remove existing output pages/images - (with "--page-id", remove only those) + (with "--page-id", remove only those). + Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE + --debug Abort on any errors with full stack trace. + Short-hand for OCRD_MISSING_OUTPUT=ABORT --profile Enable profiling --profile-file PROF-PATH Write cProfile stats to PROF-PATH. Implies "--profile" -p, --parameter JSON-PATH Parameters, either verbatim JSON string diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 509b8123b9..2f94913ed7 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -42,7 +42,8 @@ MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, - REGEX_PREFIX + REGEX_PREFIX, + config ) from .workspace_backup import WorkspaceBackupManager @@ -75,7 +76,6 @@ class Workspace(): `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to the filesystem directly. baseurl (string, None) : Base URL to prefix to relative URL. - overwrite_mode (boolean, False) : Whether to force add operations on this workspace globally """ def __init__( @@ -91,7 +91,6 @@ def __init__( self.resolver = resolver self.directory = directory self.mets_target = str(Path(directory, mets_basename)) - self.overwrite_mode = False self.is_remote = bool(mets_server_url) if mets is None: if self.is_remote: @@ -243,8 +242,6 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals """ log = getLogger('ocrd.workspace.remove_file') log.debug('Deleting mets:file %s', file_id) - if self.overwrite_mode: - force = True if isinstance(file_id, OcrdFile): file_id = file_id.ID try: @@ -296,9 +293,6 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is `True`. 
""" - if not force and self.overwrite_mode: - force = True - if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force): raise Exception("No such fileGrp: %s" % USE) @@ -419,8 +413,6 @@ def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSi raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") if content is not None and not kwargs.get('local_filename'): raise Exception("'content' was set but no 'local_filename'") - if self.overwrite_mode: - kwargs['force'] = True with pushd_popd(self.directory): if kwargs.get('local_filename'): @@ -1101,8 +1093,6 @@ def save_image_file(self, image : Image.Image, The (absolute) path of the created file. """ log = getLogger('ocrd.workspace.save_image_file') - if self.overwrite_mode: - force = True saveargs = dict() if 'dpi' in image.info: saveargs['dpi'] = image.info['dpi'] diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 11af20249f..fa4c34d63b 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -155,6 +155,18 @@ def _ocrd_download_timeout_parser(val): validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) +config.add("OCRD_MISSING_OUTPUT", + description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], + parser=str) + +config.add("OCRD_EXISTING_OUTPUT", + description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], + parser=str) + config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", description="Default address of Processing Server to connect to (for `ocrd network client processing`).", default=(True, '')) diff --git a/tests/cli/test_bashlib.py b/tests/cli/test_bashlib.py index 15af493502..b1ab68c7fc 100644 --- a/tests/cli/test_bashlib.py +++ b/tests/cli/test_bashlib.py @@ -98,7 +98,7 @@ def test_constants_fail(self): def test_input_files(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): - _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG']) + _, out, err = self.invoke_cli(bashlib_cli, ['input-files', '-I', 'OCR-D-IMG', '-O', 'OUTPUT']) assert ("[url]='' [local_filename]='OCR-D-IMG/INPUT_0017.tif' [ID]='INPUT_0017' [mimetype]='image/tiff' " "[pageId]='PHYS_0017' [outputFileId]='OUTPUT_PHYS_0017'") in out diff --git a/tests/data/__init__.py b/tests/data/__init__.py index e7ef30fc2b..53fa227d01 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -1,8 +1,9 @@ import json import os +import re from pytest import warns from ocrd import Processor -from ocrd_utils import make_file_id +from ocrd_utils import make_file_id, config DUMMY_TOOL = { 'executable': 'ocrd-test', @@ -94,7 +95,41 @@ def process(self): page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), - content='CONTENT') + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) + +class DummyProcessorWithOutputFailures(Processor): + @property + def ocrd_tool(self): + return DUMMY_TOOL + + @property + def version(self): + return '0.0.1' + + @property + def executable(self): + return 'ocrd-test' + + def __init__(self, *args, **kwargs): + kwargs['download_files'] = False + super().__init__(*args, 
**kwargs) + + # no error handling with old process(), so override new API + def process_page_file(self, input_file): + n = int(re.findall(r'\d+', input_file.pageId)[-1]) + if n % 2: + raise Exception(f"intermittent failure on page {input_file.pageId}") + output_file_id = make_file_id(input_file, self.output_file_grp) + self.workspace.add_file(file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=input_file.pageId, + local_filename=os.path.join(self.output_file_grp, output_file_id), + mimetype=input_file.mimetype, + content='CONTENT', + force=config.OCRD_EXISTING_OUTPUT == 'OVERWRITE', + ) class IncompleteProcessor(Processor): @property diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index aa2124001a..064142574e 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -5,7 +5,13 @@ from pathlib import Path from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module -from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.data import ( + DummyProcessor, + DummyProcessorWithRequiredParameters, + DummyProcessorWithOutput, + DummyProcessorWithOutputFailures, + IncompleteProcessor +) from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver @@ -145,20 +151,43 @@ def test_run_output0(self): output_file_grp="OCR-D-OUT") assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2 + def test_run_output_missing(self): + ws = self.workspace + from ocrd_utils import config + config.OCRD_MISSING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + # only half succeed + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) // 2 + config.OCRD_MISSING_OUTPUT = 'ABORT' + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert "intermittent" in str(exc.value) + config.OCRD_MISSING_OUTPUT = 'COPY' + config.OCRD_EXISTING_OUTPUT = 'SKIP' + run_processor(DummyProcessorWithOutputFailures, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT") + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') - ws.overwrite_mode = True + from ocrd_utils import config + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') - ws.overwrite_mode = False + config.OCRD_EXISTING_OUTPUT = 'ABORT' with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists" - ws.overwrite_mode = True + config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") diff --git 
a/tests/test_workspace.py b/tests/test_workspace.py index 2fe5f450a0..1ae007ae52 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -270,9 +270,9 @@ def test_remove_file_force(sbb_data_workspace): # TODO check semantics - can a non-existent thing be removed? assert not sbb_data_workspace.remove_file('non-existing-id', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file('non-existing-id', force=False) + with pytest.raises(FileNotFoundError) as not_found_exc: + sbb_data_workspace.remove_file('non-existing-id', force=False) + assert "not found in METS" in str(not_found_exc.value) def test_remove_file_remote_not_available_raises_exception(plain_workspace): @@ -292,9 +292,9 @@ def test_remove_file_remote(plain_workspace): assert plain_workspace.remove_file('page1_img', force=True) # TODO check returned value - # should also "succeed", because overwrite_mode is set which also sets 'force' to 'True' - plain_workspace.overwrite_mode = True - assert not plain_workspace.remove_file('page1_img') + with pytest.raises(FileNotFoundError) as not_found_exc: + plain_workspace.remove_file('page1_img') + assert "not found in METS" in str(not_found_exc.value) def test_rename_file_group(tmp_path): @@ -341,9 +341,6 @@ def test_remove_file_group_force(sbb_data_workspace): # check function and tests semantics # should succeed assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=True) - # should also succeed - sbb_data_workspace.overwrite_mode = True - assert not sbb_data_workspace.remove_file_group('I DO NOT EXIST', force=False) def test_remove_file_group_rmdir(sbb_data_tmp, sbb_data_workspace): @@ -432,9 +429,11 @@ def test_save_image_file(plain_workspace): assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg')) # should succeed assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg', force=True) - # should also succeed - plain_workspace.overwrite_mode = True - assert plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + # should fail + with pytest.raises(FileExistsError) as exists_exc: + plain_workspace.save_image_file(img, 'page1_img', 'IMG', page_id='page1', mimetype='image/jpeg') + assert "neither force nor ignore are set" in str(exists_exc.value) + # check file_path kwarg assert plain_workspace.save_image_file(img, 'page1_img2', 'IMG', page_id='page1', file_path='IMG/page1_img2.png') assert exists(join(plain_workspace.directory, 'IMG', 'page1_img2.png')) From fdd5d168d0753caad8a19efd49884c52d7934183 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:01:31 +0200 Subject: [PATCH 08/12] ocrd_utils.config: add variables to module docstring --- src/ocrd_utils/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index fa4c34d63b..28f95b2162 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -62,7 +62,11 @@ def __init__(self): self._variables = {} def add(self, name, *args, **kwargs): - self._variables[name] = OcrdEnvVariable(name, *args, **kwargs) + var = OcrdEnvVariable(name, *args, **kwargs) + # make visible in ocrd_utils.config docstring (apidoc) + txt = var.describe(wrap_text=False, indent_text=True) + globals()['__doc__'] += "\n\n - " + txt + "\n\n" + self._variables[name] = var return self._variables[name] def has_default(self, name): From 
6d87f9e6494a0768541e5b18ae557fd594d8319b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 18:02:36 +0200 Subject: [PATCH 09/12] improve docstrings, re-generate docs --- .../ocrd/ocrd.processor.ocrd_page_result.rst | 7 ++ docs/api/ocrd/ocrd.processor.rst | 1 + src/ocrd/cli/validate.py | 4 +- src/ocrd/cli/workspace.py | 1 + src/ocrd/processor/base.py | 75 ++++++++++--------- src/ocrd/workspace.py | 2 - src/ocrd_models/ocrd_exif.py | 1 + src/ocrd_models/ocrd_mets.py | 6 +- src/ocrd_utils/config.py | 28 ++++++- 9 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 docs/api/ocrd/ocrd.processor.ocrd_page_result.rst diff --git a/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst new file mode 100644 index 0000000000..e13d50e155 --- /dev/null +++ b/docs/api/ocrd/ocrd.processor.ocrd_page_result.rst @@ -0,0 +1,7 @@ +ocrd.processor.ocrd\_page\_result module +======================================== + +.. automodule:: ocrd.processor.ocrd_page_result + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd/ocrd.processor.rst b/docs/api/ocrd/ocrd.processor.rst index 801114d2a3..7507d8439b 100644 --- a/docs/api/ocrd/ocrd.processor.rst +++ b/docs/api/ocrd/ocrd.processor.rst @@ -22,3 +22,4 @@ Submodules ocrd.processor.base ocrd.processor.helpers + ocrd.processor.ocrd_page_result diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d053..61d26988a4 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -40,7 +40,7 @@ def validate_cli(): @click.argument('ocrd_tool', required=False, nargs=1) def validate_ocrd_tool(ocrd_tool): ''' - Validate OCRD_TOOL as an ocrd-tool.json file. + Validate OCRD_TOOL as an `ocrd-tool.json` file. ''' if not ocrd_tool: ocrd_tool = 'ocrd-tool.json' @@ -107,7 +107,7 @@ def validate_page(page, **kwargs): @click.argument('tasks', nargs=-1, required=True) def validate_process(tasks, workspace, mets_basename, overwrite, page_id): ''' - Validate a sequence of tasks passable to 'ocrd process' + Validate a sequence of tasks passable to `ocrd process` ''' if workspace: _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a36..e2186a727c 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -308,6 +308,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\ } | ocrd workspace bulk-add -r '(?P.*) (?P.*) (?P.*) (?P.*)' \\ -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - + """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name workspace = Workspace( diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 0ec2711f61..d53c3da0bf 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -144,22 +144,21 @@ def __init__( version=None ): """ - Instantiate, but do not process. Unless ``list_resources`` or - ``show_resource`` or ``show_help`` or ``show_version`` or - ``dump_json`` or ``dump_module_dir`` is true, setup for processing - (parsing and validating parameters, entering the workspace directory). + Instantiate, but do not setup (neither for processing nor other usage). + If given, do parse and validate :py:data:`.parameter`. Args: workspace (:py:class:`~ocrd.Workspace`): The workspace to process. 
\ + If not ``None``, then `chdir` to that directory. Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. Keyword Args: parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ Can be ``None`` even for processing, but then needs to be set before running. - input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. \ + input_file_grp (string): comma-separated list of METS ``fileGrp`` used for input. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. - output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. \ + output_file_grp (string): comma-separated list of METS ``fileGrp`` used for output. \ Deprecated since version 3.0: Should be ``None`` here, but then needs to be set \ before processing. page_id (string): comma-separated list of METS physical ``page`` IDs to process \ @@ -287,29 +286,32 @@ def setup(self) -> None: """ pass - @deprecated(version='3.0', reason='process() should be replaced with process_page() and process_workspace()') + @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ - Process all files of the :py:attr:`workspace` - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + Process all files of the :py:data:`workspace` + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. - (This contains the main functionality and needs to be overridden by subclasses.) + (This contains the main functionality and needs to be + overridden by subclasses.) """ raise NotImplementedError() def process_workspace(self, workspace: Workspace) -> None: """ Process all files of the given ``workspace``, - from the given :py:attr:`input_file_grp` - to the given :py:attr:`output_file_grp` - for the given :py:attr:`page_id` (or all pages) - under the given :py:attr:`parameter`. + from the given :py:data:`input_file_grp` + to the given :py:data:`output_file_grp` + for the given :py:data:`page_id` (or all pages) + under the given :py:data:`parameter`. (This will iterate over pages and files, calling - :py:meth:`process_page`, handling exceptions.) + :py:meth:`.process_page_file` and handling exceptions. + It should be overridden by subclasses to handle cases + like post-processing or computation across pages.) """ log = getLogger('ocrd.processor.base') with pushd_popd(workspace.directory): @@ -370,7 +372,7 @@ def process_workspace(self, workspace: Workspace) -> None: def _copy_page_file(self, input_file : Union[OcrdFile, ClientSideOcrdFile]) -> None: """ - Copy the given ``input_file`` of the :py:attr:`workspace`, + Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened :py:class:`~ocrd_models.OcrdFile` per input fileGrp) and add it as if it was a processing result. 
@@ -399,14 +401,14 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc
     def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
         """
-        Process the given ``input_files`` of the :py:attr:`workspace`,
+        Process the given ``input_files`` of the :py:data:`workspace`,
         representing one physical page (passed as one opened
-        :py:class:`~ocrd_models.OcrdFile` per input fileGrp)
-        under the given :py:attr:`parameter`, and make sure the
+        :py:class:`.OcrdFile` per input fileGrp)
+        under the given :py:data:`.parameter`, and make sure the
         results get added accordingly.
 
-        (This uses process_page_pcgts, but can be overridden by subclasses
-        to handle cases like multiple fileGrps, non-PAGE input etc.)
+        (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses
+        to handle cases like multiple output fileGrps, non-PAGE input etc.)
         """
         log = getLogger('ocrd.processor.base')
         input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
@@ -449,28 +451,28 @@ def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOc
 
     def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
         """
-        Process the given ``input_pcgts`` of the :py:attr:`workspace`,
+        Process the given ``input_pcgts`` of the :py:data:`.workspace`,
         representing one physical page (passed as one parsed
-        :py:class:`~ocrd_models.OcrdPage` per input fileGrp)
-        under the given :py:attr:`parameter`, and return the
-        resulting :py:class:`~ocrd.processor.OcrdPageResult`.
+        :py:class:`.OcrdPage` per input fileGrp)
+        under the given :py:data:`.parameter`, and return the
+        resulting :py:class:`.OcrdPageResult`.
 
         Optionally, add to the ``images`` attribute of the resulting
-        :py:class:`~ocrd.processor.OcrdPageResult` instances
-        of :py:class:`~ocrd.processor.OcrdPageResultImage`,
+        :py:class:`.OcrdPageResult` instances of :py:class:`.OcrdPageResultImage`,
         which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
         ``file_id_suffix`` (used for generating IDs of the saved image) and
         ``alternative_image`` (reference of the
         :py:class:`ocrd_models.ocrd_page.AlternativeImageType` for setting the filename of the saved image).
 
-        (This contains the main functionality and must be overridden by subclasses.)
+        (This contains the main functionality and must be overridden by subclasses,
+        unless it does not get called by some overridden :py:meth:`.process_page_file`.)
         """
         raise NotImplementedError()
 
     def add_metadata(self, pcgts: OcrdPage) -> None:
         """
         Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
-        the processing step and runtime parameters to :py:class:`~ocrd_models.OcrdPage` ``pcgts``.
+        the processing step and runtime parameters to :py:class:`.OcrdPage` ``pcgts``.
         """
         metadata_obj = pcgts.get_Metadata()
         assert metadata_obj is not None
@@ -496,7 +498,7 @@ def add_metadata(self, pcgts: OcrdPage) -> None:
     def resolve_resource(self, val):
         """
         Resolve a resource name to an absolute file path with the algorithm in
-        https://ocr-d.de/en/spec/ocrd_tool#file-parameters
+        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_
 
         Args:
             val (string): resource value to resolve
@@ -522,7 +524,7 @@ def resolve_resource(self, val):
     def show_resource(self, val):
         """
         Resolve a resource name to a file path with the algorithm in
-        https://ocr-d.de/en/spec/ocrd_tool#file-parameters,
+        `spec <https://ocr-d.de/en/spec/ocrd_tool#file-parameters>`_,
         then print its contents to stdout.
Args: @@ -593,7 +595,8 @@ def input_files(self): files for that page) - Otherwise raise an error (complaining that only PAGE-XML warrants having multiple images for a single page) - Algorithm _ + + See `algorithm `_ Returns: A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects. @@ -635,11 +638,13 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): - if ``last``, then the last matching file for the page will be silently selected (as if the last was the only match) - if ``abort``, then an exception will be raised. + Multiple matches for PAGE-XML will always raise an exception. Keyword Args: require_first (boolean): If true, then skip a page entirely whenever it is not available in the first input `fileGrp`. + on_error (string): How to handle multiple file matches per page. mimetype (string): If not `None`, filter by the specified MIME type (literal or regex prefixed by `//`). Otherwise prefer PAGE or image. diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 2f94913ed7..3523d9f15f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -606,7 +606,6 @@ def image_from_page(self, page, page_id, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to ``fill``: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible @@ -850,7 +849,6 @@ def image_from_segment(self, segment, parent_image, parent_coords, Cropping uses a polygon mask (not just the bounding box rectangle). Areas outside the polygon will be filled according to `fill`: - \b - if `"background"` (the default), then fill with the median color of the image; - else if `"none"`, then avoid masking polygons where possible diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85a..82b8b7e1c3 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -21,6 +21,7 @@ class OcrdExif(): * ``RGB`` for 24-bit truecolor, * ``I`` for 32-bit signed integer grayscale, * ``F`` for floating-point grayscale + (see PIL concept **mode**) resolution (int): pixel density xResolution (int): pixel density diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..4d1e6cba58 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -194,7 +194,7 @@ def unique_identifier(self, purl : str) -> None: @property def agents(self) -> List[OcrdAgent]: """ - List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent`s + List all :py:class:`ocrd_models.ocrd_agent.OcrdAgent` entries. """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] @@ -218,7 +218,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: @property def file_groups(self) -> List[str]: """ - List the `@USE` of all `mets:fileGrp` entries. + List the ``@USE`` of all ``mets:fileGrp`` entries. """ # WARNING: Actually we cannot return strings in place of elements! @@ -894,7 +894,7 @@ def merge(self, other_mets, force : bool = False, Add all files from other_mets. 
Accepts the same kwargs as :py:func:`find_files` Keyword Args: - force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s) + force (boolean): Whether to do :py:meth:`add_file` with ``force`` (overwriting existing ``mets:file`` entries) fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 28f95b2162..851fb42a8c 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -120,9 +120,11 @@ def raw_value(self, name): description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): + - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) + """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -154,19 +156,39 @@ def _ocrd_download_timeout_parser(val): parser=_parser_boolean) config.add("OCRD_MISSING_INPUT", - description="How to deal with missing input files (for some fileGrp/pageId) during processing [SKIP|ABORT]", + description="""\ +How to deal with missing input files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed with next page's input + - `ABORT`: throw :py:class:`.MissingInputFile` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'ABORT'], parser=str) config.add("OCRD_MISSING_OUTPUT", - description="How to deal with missing output files (for some fileGrp/pageId) during processing [SKIP|COPY|ABORT]", + description="""\ +How to deal with missing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `COPY`: fall back to copying input PAGE to output fileGrp for page + - `ABORT`: re-throw whatever caused processing to fail + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], parser=str) config.add("OCRD_EXISTING_OUTPUT", - description="How to deal with already existing output files (for some fileGrp/pageId) during processing [SKIP|OVERWRITE|ABORT]", + description="""\ +How to deal with already existing output files (for some fileGrp/pageId) during processing: + + - `SKIP`: ignore and proceed processing next page + - `OVERWRITE`: force writing result to output fileGrp for page + - `ABORT`: re-throw :py:class:`FileExistsError` + +""", default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], parser=str) From 9942bbe6dc42246c0a7e6eda85444aa0f745face Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 10/12] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index d53c3da0bf..55b4619422 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -714,7 +714,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any 
files for selected pageId {self.page_id}") + LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\nCompare '{self.page_id}' with the output of 'ocrd workspace list-page'.") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 8a584e9dcc5794baa9e08556a943bc7e9eb9991f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:07:05 +0200 Subject: [PATCH 11/12] test_processor: test for specific exception Co-authored-by: Konstantin Baierer --- tests/processor/test_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 064142574e..c263d99fce 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -280,7 +280,7 @@ def ocrd_tool(self): assert ('foobar3', 'foobar4') in tuples tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')] assert ('foobar3', None) in tuples - with self.assertRaisesRegex(Exception, "Could not determine unique input file"): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files(on_error='abort') ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: @@ -289,7 +289,7 @@ def ocrd_tool(self): proc.workspace = ws proc.input_file_grp = 'GRP1,GRP2' proc.page_id = page_id - with self.assertRaisesRegex(Exception, "Could not determine unique input file"): + with self.assertRaisesRegex(NonUniqueInputFile, "Could not determine unique input file"): tuples = proc.zip_input_files() def test_zip_input_files_require_first(self): From 8077d45056c9d2682bee5bb5017f79eb0a7b336a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 19:54:26 +0200 Subject: [PATCH 12/12] test_processor: fix missing import --- tests/processor/test_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index c263d99fce..0cbae7d548 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -15,7 +15,7 @@ from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging from ocrd.resolver import Resolver -from ocrd.processor.base import Processor, run_processor, run_cli +from ocrd.processor import Processor, run_processor, run_cli, NonUniqueInputFile from unittest import mock import pytest
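With the last two patches combined, callers can catch a dedicated exception for ambiguous per-page input instead of a bare ``Exception``. A hedged sketch of the resulting usage pattern (the helper function is hypothetical, not part of the patches):

    # Hypothetical helper, not from these patches: fall back to lenient matching
    # only when strict matching detects an ambiguity.
    from ocrd.processor import Processor, NonUniqueInputFile


    def zip_input_files_tolerant(proc: Processor):
        """Return per-page input file tuples, degrading ambiguous matches to None."""
        try:
            # strict: abort as soon as two files with the same MIME type
            # match one page in one input fileGrp
            return proc.zip_input_files(on_error='abort')
        except NonUniqueInputFile:
            # lenient: non-PAGE ambiguities become None entries; duplicate
            # PAGE-XML for a page still raises, as documented above
            return proc.zip_input_files(on_error='skip')

This mirrors the assertions updated in ``tests/processor/test_processor.py`` above.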