Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle errors in zip_input_files #12

Merged
merged 13 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/ocrd/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@
\b
{config.describe('OCRD_DOWNLOAD_TIMEOUT')}
\b
{config.describe('OCRD_DOWNLOAD_INPUT')}
\b
{config.describe('OCRD_MISSING_INPUT')}
\b
{config.describe('OCRD_MISSING_OUTPUT')}
\b
{config.describe('OCRD_EXISTING_OUTPUT')}
\b
{config.describe('OCRD_METS_CACHING')}
\b
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
Expand Down
66 changes: 31 additions & 35 deletions src/ocrd/cli/bashlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@
from ocrd.decorators import (
parameter_option,
parameter_override_option,
ocrd_loglevel
ocrd_loglevel,
ocrd_cli_wrap_processor
)
from ocrd_utils import (
is_local_filename,
get_local_filename,
initLogging,
make_file_id
getLogger,
make_file_id,
config
)
from ocrd.resolver import Resolver
from ocrd.processor import Processor
Expand Down Expand Up @@ -81,11 +84,15 @@ def bashlib_constants(name):
@bashlib_cli.command('input-files')
@click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
@click.option('-w', '--working-dir', help="Working Directory")
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT')
@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT')
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None)
# repeat some other processor options for convenience (will be ignored here)
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n"
"(with '--page-id', remove only those).\n"
"Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE")
@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n"
"Short-hand for OCRD_MISSING_OUTPUT=ABORT")
@parameter_option
@parameter_override_option
@ocrd_loglevel
Expand All @@ -100,37 +107,26 @@ def bashlib_input_files(**kwargs):

(The printing format is one associative array initializer per line.)
"""
initLogging()
mets = kwargs.pop('mets')
working_dir = kwargs.pop('working_dir')
if is_local_filename(mets) and not isfile(get_local_filename(mets)):
msg = "File does not exist: %s" % mets
raise FileNotFoundError(msg)
resolver = Resolver()
workspace = resolver.workspace_from_url(mets, working_dir)
class BashlibProcessor(Processor):
@property
def ocrd_tool(self):
return {}
return {'executable': '', 'steps': ['']}
@property
def executable(self):
return ''
processor = BashlibProcessor(None)
# go half way of the normal run_processor / process_workspace call tree
processor.workspace = workspace
processor.page_id = kwargs['page_id']
processor.input_file_grp = kwargs['input_file_grp']
processor.output_file_grp = kwargs['output_file_grp']
for input_files in processor.zip_input_files(mimetype=None, on_error='abort'):
# ensure all input files exist locally (without persisting them in the METS)
# - this mimics the default behaviour of all Pythonic processors
input_files = [workspace.download_file(input_file) if input_file else None
for input_file in input_files]
for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
# make this bash-friendly (show initialization for associative array)
if len(input_files) > 1:
# single quotes allow us to preserve the list value inside the alist
print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ')
else:
print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ')
print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp']))
def version(self):
return '1.0'
# go half way of the normal run_processor / process_workspace call tree
# by just delegating to process_workspace, overriding process_page_file
# to ensure all input files exist locally (without persisting them in the METS)
# and print what needs to be acted on in a bash-friendly way
def process_page_file(self, *input_files):
for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
# make this bash-friendly (show initialization for associative array)
if len(input_files) > 1:
# single quotes allow us to preserve the list value inside the alist
value = ' '.join(str(getattr(res, field)) for res in input_files)
else:
value = str(getattr(input_files[0], field))
print(f"[{field}]='{value}'", end=' ')
output_file_id = make_file_id(input_files[0], kwargs['output_file_grp'])
print(f"[outputFileId]='{output_file_id}'")
ocrd_cli_wrap_processor(BashlibProcessor, **kwargs)
22 changes: 4 additions & 18 deletions src/ocrd/decorators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor(
profile_file=None,
version=False,
overwrite=False,
debug=False,
resolve_resource=None,
show_resource=None,
list_resources=False,
Expand Down Expand Up @@ -117,25 +118,10 @@ def resolve(name):
resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url)
page_id = kwargs.get('page_id')
# XXX not possible while processors do not adhere to https://github.com/OCR-D/core/issues/505
# if overwrite
# if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
# raise Exception("--overwrite requires --output-file-grp")
# LOG.info("Removing files because of --overwrite")
# for grp in kwargs['output_file_grp'].split(','):
# if page_id:
# for one_page_id in kwargs['page_id'].split(','):
# LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
# for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
# workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
# else:
# LOG.debug("Removing all files in output file group %s ", grp)
# # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
# workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
# workspace.save_mets()
# XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
if overwrite:
workspace.overwrite_mode = True
config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
if debug:
config.OCRD_MISSING_OUTPUT = 'ABORT'
report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id)
if not report.is_valid:
raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
Expand Down
1 change: 1 addition & 0 deletions src/ocrd/decorators/ocrd_cli_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def cli(mets_url):
option('-O', '--output-file-grp', default=None),
option('-g', '--page-id'),
option('--overwrite', is_flag=True, default=False),
option('--debug', is_flag=True, default=False),
option('--profile', is_flag=True, default=False),
option('--profile-file', type=Path(dir_okay=False, writable=True)),
parameter_option,
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd/decorators/parameter_option.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def _handle_param_option(ctx, param, value):
parameter_option = option('-p', '--parameter',
help="Parameters, either JSON string or path to JSON file",
multiple=True,
default=['{}'],
default=[],
# now handled in ocrd_cli_wrap_processor to resolve processor preset files
# callback=_handle_param_option
callback=lambda ctx, param, kv: list(kv))
Expand Down
4 changes: 3 additions & 1 deletion src/ocrd/processor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from .base import (
Processor,
ResourceNotFoundError
ResourceNotFoundError,
NonUniqueInputFile,
MissingInputFile,
)
from .ocrd_page_result import (
OcrdPageResult,
Expand Down
Loading