Skip to content

Commit

Permalink
bulk-add: Distinguish url and local_filename, fix #1086
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Oct 12, 2023
1 parent c5f466a commit 7c978a2
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 49 deletions.
30 changes: 17 additions & 13 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
Expand All @@ -260,7 +261,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
"""
Add files in bulk to an OCR-D workspace.
Expand Down Expand Up @@ -289,16 +290,16 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
--file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
--page-id 'PHYS_{{ pageid }}' \\
--file-grp "{{ fileGrp }}" \\
--url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
--local-filename '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
-
\b
{ echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
} | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<url>.*)' \\
-G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ url }}' -
} | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
-G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
"""
log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
Expand Down Expand Up @@ -334,21 +335,24 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
group_dict = m.groupdict()

# set up file info
file_dict = {'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}
file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}

# Flag to track whether 'url' should be 'src'
url_is_src = False
# Flag to track whether 'local_filename' should be 'src'
local_filename_is_src = False

# expand templates
for param_name in file_dict:
if not file_dict[param_name]:
if param_name == 'url':
url_is_src = True
if param_name == 'local_filename':
local_filename_is_src = True
continue
elif param_name in ['mimetype', 'file_id']:
# auto-filled below once the other
# replacements have happened
continue
elif param_name == 'url':
# Remote URL is not required
continue
raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
for group_name in group_dict:
file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])
Expand All @@ -373,10 +377,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))

# copy files if src != url
if url_is_src:
file_dict['url'] = str(srcpath)
if local_filename_is_src:
file_dict['local_filename'] = srcpath
else:
destpath = Path(workspace.directory, file_dict['url'])
destpath = Path(workspace.directory, file_dict['local_filename'])
if srcpath != destpath and not destpath.exists():
log.info("cp '%s' '%s'", srcpath, destpath)
if not dry_run:
Expand Down
75 changes: 39 additions & 36 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,30 +439,29 @@ def test_bulk_add0(self):
Path(srcdir, "OCR-D-IMG", "page_%04d.tif" % i).write_text('')
for i in range(NO_FILES):
Path(srcdir, "OCR-D-PAGE", "page_%04d.xml" % i).write_text('')
with TemporaryDirectory() as wsdir:
with pushd_popd(wsdir):
ws = self.resolver.workspace_from_nothing(directory=wsdir)
exit_code, out, err = self.invoke_cli(workspace_cli, [
'bulk-add',
'--ignore',
'--regex', r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$',
'--url', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}',
'--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}',
'--page-id', 'PHYS_{{ pageid }}',
'--file-grp', '{{ fileGrp }}',
'%s/*/*' % srcdir
])
# print('exit_code', exit_code)
# print('out', out)
# print('err', err)
ws.reload_mets()
self.assertEqual(len(ws.mets.file_groups), 2)
self.assertEqual(len(ws.mets.find_all_files()), 2 * NO_FILES)
self.assertEqual(len(ws.mets.find_all_files(mimetype='image/tiff')), NO_FILES)
self.assertEqual(len(ws.mets.find_all_files(ID='//FILE_OCR-D-IMG_000.*')), 10)
self.assertEqual(len(ws.mets.find_all_files(ID='//FILE_.*_000.*')), 20)
self.assertEqual(len(ws.mets.find_all_files(pageId='PHYS_0001')), 2)
self.assertEqual(ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0].url, 'OCR-D-PAGE/FILE_0001.xml')
with pushd_popd(tempdir=True) as wsdir:
ws = self.resolver.workspace_from_nothing(directory=wsdir)
exit_code, out, err = self.invoke_cli(workspace_cli, [
'bulk-add',
'--ignore',
'--regex', r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$',
'--local-filename', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}',
'--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}',
'--page-id', 'PHYS_{{ pageid }}',
'--file-grp', '{{ fileGrp }}',
'%s/*/*' % srcdir
])
# print('exit_code', exit_code)
# print('out', out)
# print('err', err)
ws.reload_mets()
assert len(ws.mets.file_groups) == 2
assert len(ws.mets.find_all_files()) == 2 * NO_FILES
assert len(ws.mets.find_all_files(mimetype='image/tiff')) == NO_FILES
assert len(ws.mets.find_all_files(ID='//FILE_OCR-D-IMG_000.*')) == 10
assert len(ws.mets.find_all_files(ID='//FILE_.*_000.*')) == 20
assert len(ws.mets.find_all_files(pageId='PHYS_0001')) == 2
assert ws.mets.find_all_files(ID='FILE_OCR-D-PAGE_0001')[0].local_filename == Path('OCR-D-PAGE/FILE_0001.xml')

def test_bulk_add_missing_param(self):
with pushd_popd(tempdir=True) as wsdir:
Expand All @@ -487,20 +486,22 @@ def test_bulk_add_gen_id(self):
Path(wsdir, 'c.ext').write_text('')
_, out, err = self.invoke_cli(workspace_cli, [
'bulk-add',
'-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<src>.*) (?P<url>.*) (?P<mimetype>.*)',
'-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<src>.*) (?P<local_filename>.*) (?P<mimetype>.*)',
'-G', '{{ filegrp }}',
'-g', '{{ pageid }}',
'-S', '{{ src }}',
# '-i', '{{ fileid }}', # XXX skip --file-id
'-m', '{{ mimetype }}',
'-u', "{{ url }}",
'-l', "{{ local_filename }}",
'-u', "https://host/{{ filegrp }}/{{ local_filename }}",
'a b c.ext d e'])
ws.reload_mets()
print(out)
assert next(ws.mets.find_files()).ID == 'b_c'
assert next(ws.mets.find_files()).url == 'd'
assert next(ws.mets.find_files()).local_filename == Path('d')
assert next(ws.mets.find_files()).url == 'https://host/b/d'

def test_bulk_add_derive_url(self):
def test_bulk_add_derive_local_filename(self):
with pushd_popd(tempdir=True) as wsdir:
ws = self.resolver.workspace_from_nothing(directory=wsdir)
Path(wsdir, 'srcdir').mkdir()
Expand All @@ -511,22 +512,22 @@ def test_bulk_add_derive_url(self):
'-G', '{{ filegrp }}',
'-g', '{{ pageid }}',
'-S', '{{ src }}',
# '-u', "{{ url }}", # XXX skip --url
# '-l', "{{ local_filename }}", # XXX skip --local-filename
'p0001 SEG srcdir/src.xml'])
# print('out', out)
# print('err', err)
ws.reload_mets()
assert next(ws.mets.find_files()).url == 'srcdir/src.xml'
assert next(ws.mets.find_files()).local_filename == Path('srcdir/src.xml')

def test_bulk_add_stdin(self):
resolver = Resolver()
with pushd_popd(tempdir=True) as wsdir:
ws = resolver.workspace_from_nothing(directory=wsdir)
Path(wsdir, 'BIN').mkdir()
Path(wsdir, 'BIN/FILE_0001_BIN.IMG-wolf.png').write_text('')
Path(wsdir, 'BIN/FILE_0002_BIN.IMG-wolf.png').write_text('')
Path(wsdir, 'BIN/FILE_0001_BIN.xml').write_text('')
Path(wsdir, 'BIN/FILE_0002_BIN.xml').write_text('')
Path(wsdir, 'BIN/FILE_0001_BIN.IMG-wolf.png').write_text('', encoding='UTF-8')
Path(wsdir, 'BIN/FILE_0002_BIN.IMG-wolf.png').write_text('', encoding='UTF-8')
Path(wsdir, 'BIN/FILE_0001_BIN.xml').write_text('', encoding='UTF-8')
Path(wsdir, 'BIN/FILE_0002_BIN.xml').write_text('', encoding='UTF-8')
with mock_stdin(
'PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png BIN/FILE_0001_BIN.IMG-wolf.png image/png\n'
'PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png BIN/FILE_0002_BIN.IMG-wolf.png image/png\n'
Expand All @@ -540,15 +541,17 @@ def test_bulk_add_stdin(self):
'-g', '{{ pageid }}',
'-i', '{{ fileid }}',
'-m', '{{ mimetype }}',
'-u', "{{ dest }}",
'-l', "{{ dest }}",
'-u', "https://host/{{ fileid }}/{{ dest }}",
'-'])
ws.reload_mets()
assert len(ws.mets.file_groups) == 1
assert len(list(ws.mets.find_files())) == 4
f = next(ws.mets.find_files())
assert f.mimetype == 'image/png'
assert f.ID == 'FILE_0001_BIN.IMG-wolf'
assert f.url == 'BIN/FILE_0001_BIN.IMG-wolf.png'
assert f.local_filename == Path('BIN/FILE_0001_BIN.IMG-wolf.png')
assert f.url == 'https://host/FILE_0001_BIN.IMG-wolf/BIN/FILE_0001_BIN.IMG-wolf.png'

if __name__ == '__main__':
main(__file__)

0 comments on commit 7c978a2

Please sign in to comment.