From 7c978a2737f03289bebbff36373c4187ffa989c9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 12 Oct 2023 11:38:52 +0200 Subject: [PATCH] bulk-add: Distinguish url and local_filename, fix #1086 --- ocrd/ocrd/cli/workspace.py | 30 ++++++++------- tests/cli/test_workspace.py | 75 +++++++++++++++++++------------------ 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index a1490d9900..67ea91f751 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -251,7 +251,8 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ @click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False) @click.option('-g', '--page-id', help="physical page ID of the file", required=False) @click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False) -@click.option('-u', '--url', help="local filesystem path in the workspace directory (copied from source file if different)", required=False) +@click.option('-u', '--url', help="Remote URL of the file", required=False) +@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False) @click.option('-G', '--file-grp', help="File group USE of the file", required=True) @click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True) @click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False) @@ -260,7 +261,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ @click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True) @click.argument('file_glob', nargs=-1, required=True) @pass_workspace -def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip): +def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip): """ Add files in bulk to an OCR-D workspace. @@ -289,7 +290,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\ --page-id 'PHYS_{{ pageid }}' \\ --file-grp "{{ fileGrp }}" \\ - --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\ + --local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\ - \b @@ -297,8 +298,8 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\ echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\ echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\ - } | ocrd workspace bulk-add -r '(?P.*) (?P.*) (?P.*) (?P.*)' \\ - -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ url }}' - + } | ocrd workspace bulk-add -r '(?P.*) (?P.*) (?P.*) (?P.*)' \\ + -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) @@ -334,21 +335,24 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp group_dict = m.groupdict() # set up file info - file_dict = {'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp} + file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp} - # Flag to track whether 'url' should be 'src' - url_is_src = False + # Flag to track whether 'local_filename' should be 'src' + local_filename_is_src = False # expand templates for param_name in file_dict: if not file_dict[param_name]: - if param_name == 'url': - url_is_src = True + if param_name == 'local_filename': + local_filename_is_src = True continue elif param_name in ['mimetype', 'file_id']: # auto-filled below once the other # replacements have happened continue + elif param_name == 'url': + # Remote URL is not required + continue raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})") for group_name in group_dict: file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name]) @@ -373,10 +377,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath)) # copy files if src != url - if url_is_src: - file_dict['url'] = str(srcpath) + if local_filename_is_src: + file_dict['local_filename'] = srcpath else: - destpath = Path(workspace.directory, file_dict['url']) + destpath = Path(workspace.directory, file_dict['local_filename']) if srcpath != destpath and not destpath.exists(): log.info("cp '%s' '%s'", srcpath, destpath) if not dry_run: diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index 1dad3280c0..db2fa68528 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -439,30 +439,29 @@ def test_bulk_add0(self): Path(srcdir, "OCR-D-IMG", "page_%04d.tif" % i).write_text('') for i in range(NO_FILES): Path(srcdir, "OCR-D-PAGE", "page_%04d.xml" % i).write_text('') - with TemporaryDirectory() as wsdir: - with pushd_popd(wsdir): - ws = self.resolver.workspace_from_nothing(directory=wsdir) - exit_code, out, err = self.invoke_cli(workspace_cli, [ - 'bulk-add', - '--ignore', - '--regex', r'^.*/(?P[^/]+)/page_(?P.*)\.(?P[^\.]*)$', - '--url', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}', - '--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}', - '--page-id', 'PHYS_{{ pageid }}', - '--file-grp', '{{ fileGrp }}', - '%s/*/*' % srcdir - ]) - # print('exit_code', exit_code) - # print('out', out) - # print('err', err) - ws.reload_mets() - self.assertEqual(len(ws.mets.file_groups), 2) - self.assertEqual(len(ws.mets.find_all_files()), 2 * NO_FILES) - self.assertEqual(len(ws.mets.find_all_files(mimetype='image/tiff')), NO_FILES) - self.assertEqual(len(ws.mets.find_all_files(ID='//FILE_OCR-D-IMG_000.*')), 10) - self.assertEqual(len(ws.mets.find_all_files(ID='//FILE_.*_000.*')), 20) - self.assertEqual(len(ws.mets.find_all_files(pageId='PHYS_0001')), 2) - self.assertEqual(ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0].url, 'OCR-D-PAGE/FILE_0001.xml') + with pushd_popd(tempdir=True) as wsdir: + ws = self.resolver.workspace_from_nothing(directory=wsdir) + exit_code, out, err = self.invoke_cli(workspace_cli, [ + 'bulk-add', + '--ignore', + '--regex', r'^.*/(?P[^/]+)/page_(?P.*)\.(?P[^\.]*)$', + '--local-filename', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}', + '--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}', + '--page-id', 'PHYS_{{ pageid }}', + '--file-grp', '{{ fileGrp }}', + '%s/*/*' % srcdir + ]) + # print('exit_code', exit_code) + # print('out', out) + # print('err', err) + ws.reload_mets() + assert len(ws.mets.file_groups) == 2 + assert len(ws.mets.find_all_files()) == 2 * NO_FILES + assert len(ws.mets.find_all_files(mimetype='image/tiff')) == NO_FILES + assert len(ws.mets.find_all_files(ID='//FILE_OCR-D-IMG_000.*')) == 10 + assert len(ws.mets.find_all_files(ID='//FILE_.*_000.*')) == 20 + assert len(ws.mets.find_all_files(pageId='PHYS_0001')) == 2 + assert ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0].local_filename == Path('OCR-D-PAGE/FILE_0001.xml') def test_bulk_add_missing_param(self): with pushd_popd(tempdir=True) as wsdir: @@ -487,20 +486,22 @@ def test_bulk_add_gen_id(self): Path(wsdir, 'c.ext').write_text('') _, out, err = self.invoke_cli(workspace_cli, [ 'bulk-add', - '-r', r'(?P.*) (?P.*) (?P.*) (?P.*) (?P.*)', + '-r', r'(?P.*) (?P.*) (?P.*) (?P.*) (?P.*)', '-G', '{{ filegrp }}', '-g', '{{ pageid }}', '-S', '{{ src }}', # '-i', '{{ fileid }}', # XXX skip --file-id '-m', '{{ mimetype }}', - '-u', "{{ url }}", + '-l', "{{ local_filename }}", + '-u', "https://host/{{ filegrp }}/{{ local_filename }}", 'a b c.ext d e']) ws.reload_mets() print(out) assert next(ws.mets.find_files()).ID == 'b_c' - assert next(ws.mets.find_files()).url == 'd' + assert next(ws.mets.find_files()).local_filename == Path('d') + assert next(ws.mets.find_files()).url == 'https://host/b/d' - def test_bulk_add_derive_url(self): + def test_bulk_add_derive_local_filename(self): with pushd_popd(tempdir=True) as wsdir: ws = self.resolver.workspace_from_nothing(directory=wsdir) Path(wsdir, 'srcdir').mkdir() @@ -511,22 +512,22 @@ def test_bulk_add_derive_url(self): '-G', '{{ filegrp }}', '-g', '{{ pageid }}', '-S', '{{ src }}', - # '-u', "{{ url }}", # XXX skip --url + # '-l', "{{ local_filename }}", # XXX skip --local-filename 'p0001 SEG srcdir/src.xml']) # print('out', out) # print('err', err) ws.reload_mets() - assert next(ws.mets.find_files()).url == 'srcdir/src.xml' + assert next(ws.mets.find_files()).local_filename == Path('srcdir/src.xml') def test_bulk_add_stdin(self): resolver = Resolver() with pushd_popd(tempdir=True) as wsdir: ws = resolver.workspace_from_nothing(directory=wsdir) Path(wsdir, 'BIN').mkdir() - Path(wsdir, 'BIN/FILE_0001_BIN.IMG-wolf.png').write_text('') - Path(wsdir, 'BIN/FILE_0002_BIN.IMG-wolf.png').write_text('') - Path(wsdir, 'BIN/FILE_0001_BIN.xml').write_text('') - Path(wsdir, 'BIN/FILE_0002_BIN.xml').write_text('') + Path(wsdir, 'BIN/FILE_0001_BIN.IMG-wolf.png').write_text('', encoding='UTF-8') + Path(wsdir, 'BIN/FILE_0002_BIN.IMG-wolf.png').write_text('', encoding='UTF-8') + Path(wsdir, 'BIN/FILE_0001_BIN.xml').write_text('', encoding='UTF-8') + Path(wsdir, 'BIN/FILE_0002_BIN.xml').write_text('', encoding='UTF-8') with mock_stdin( 'PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png BIN/FILE_0001_BIN.IMG-wolf.png image/png\n' 'PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png BIN/FILE_0002_BIN.IMG-wolf.png image/png\n' @@ -540,7 +541,8 @@ def test_bulk_add_stdin(self): '-g', '{{ pageid }}', '-i', '{{ fileid }}', '-m', '{{ mimetype }}', - '-u', "{{ dest }}", + '-l', "{{ dest }}", + '-u', "https://host/{{ fileid }}/{{ dest }}", '-']) ws.reload_mets() assert len(ws.mets.file_groups) == 1 @@ -548,7 +550,8 @@ def test_bulk_add_stdin(self): f = next(ws.mets.find_files()) assert f.mimetype == 'image/png' assert f.ID == 'FILE_0001_BIN.IMG-wolf' - assert f.url == 'BIN/FILE_0001_BIN.IMG-wolf.png' + assert f.local_filename == Path('BIN/FILE_0001_BIN.IMG-wolf.png') + assert f.url == 'https://host/FILE_0001_BIN.IMG-wolf/BIN/FILE_0001_BIN.IMG-wolf.png' if __name__ == '__main__': main(__file__)