Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Processor helpers #530

Merged
merged 6 commits into from
Jul 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ocrd_models/ocrd_models/ocrd_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __str__(self):
# return 'OcrdFile[' + '\n\t' + props + '\n\t]'
props = ', '.join([
'='.join([k, getattr(self, k) if getattr(self, k) else '---'])
for k in ['mimetype', 'ID', 'url', 'local_filename']
for k in ['fileGrp', 'ID', 'mimetype', 'url', 'local_filename']
])
return '<OcrdFile ' + props + ']/> '

Expand Down
40 changes: 39 additions & 1 deletion ocrd_utils/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@

FS-related utilities

* ``is_string``, ``membername``, ``concat_padded``, ``nth_url_segment``, ``remove_non_path_from_url``, ``parse_json_string_or_file``
* ``is_string``, ``membername``, ``concat_padded``, ``nth_url_segment``, ``remove_non_path_from_url``, ``parse_json_string_or_file``, ``assert_file_grp_cardinality``, ``make_file_id``

String and OOP utilities

Expand All @@ -72,6 +72,7 @@
'abspath',
'adjust_canvas_to_rotation',
'adjust_canvas_to_transposition',
'assert_file_grp_cardinality',
'bbox_from_points',
'bbox_from_xywh',
'bbox_from_polygon',
Expand All @@ -88,6 +89,7 @@
'nth_url_segment',
'remove_non_path_from_url',
'logging',
'make_file_id',
'membername',
'image_from_polygon',
'parse_json_string_or_file',
Expand Down Expand Up @@ -809,3 +811,39 @@ def parse_json_string_or_file(value='{}'): # pylint: disable=unused-argument
if err:
raise err # pylint: disable=raising-bad-type
return ret

def assert_file_grp_cardinality(grps, n):
    """
    Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.

    ``grps`` may be a comma-separated string, which is split on ``,``, or any
    other sized collection, which is counted as is. Note that an empty string
    splits into one (empty) entry and therefore counts as 1.

    Raises ``AssertionError`` if the number of entries differs from ``n``.
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    if len(grps) != n:
        # Raised explicitly instead of via an ``assert`` statement, so that the
        # check is not stripped when Python runs with optimizations (``-O``).
        raise AssertionError(
            "Expected exactly %d output file group%s, but '%s' has %d" % (
                n, '' if n == 1 else 's', grps, len(grps)))
bertsky marked this conversation as resolved.
Show resolved Hide resolved


def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.

    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by
    ``output_file_grp`` and return the result.

    Otherwise combine ``output_file_grp`` with a counter: the first run of three or
    more digits in ``ocrd_file.pageId`` if there is one, else the position of
    ``ocrd_file`` among the files with the same fileGrp and mimetype (or the number
    of such files if it is not among them). Increment the counter until there is no
    more ID conflict in ``ocrd_file.mets``.

    Returns the new file ID as a string.
    """
    ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if ret != ocrd_file.ID:
        # The fileGrp name was part of the ID, so the substitution is the result.
        return ret
    m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
    if m:
        # The digit run is kept as a string here and handed to concat_padded unchanged.
        n = m.group(1)
    else:
        ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
        try:
            n = ids.index(ocrd_file.ID)
        except ValueError:
            n = len(ids)
    ret = concat_padded(output_file_grp, n)
    while ocrd_file.mets.find_files(ID=ret):
        # ``n`` may still be the digit string taken from pageId; convert it first,
        # because incrementing a str would raise TypeError.
        n = int(n) + 1
        ret = concat_padded(output_file_grp, n)
    return ret
49 changes: 48 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from ocrd_utils import (
abspath,

assert_file_grp_cardinality,
make_file_id,

bbox_from_points,
bbox_from_xywh,

Expand Down Expand Up @@ -38,6 +41,22 @@
MIME_TO_PIL, PIL_TO_MIME,
)
from ocrd_models.utils import xmllint_format
from ocrd_models import OcrdFile, OcrdMets

class MockOcrdFile(OcrdFile):
    """
    ``OcrdFile`` variant for tests whose ``fileGrp`` is a plain stored value.

    The ``fileGrp`` property defined here reads and writes a private attribute
    of the instance, so a test can fix it directly via the constructor.
    """

    def __init__(self, *args, fileGrp=None, ocrd_mets=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Falsy arguments (e.g. an empty string) are stored as None.
        self.fileGrp = fileGrp or None
        self.ocrd_mets = ocrd_mets or None

    @property
    def fileGrp(self):
        return self.__filegrp

    @fileGrp.setter
    def fileGrp(self, fileGrp):
        self.__filegrp = fileGrp

class TestUtils(TestCase):

Expand Down Expand Up @@ -212,6 +231,34 @@ def test_mime_ext(self):
self.assertEqual(MIME_TO_PIL['image/jp2'], 'JP2')
self.assertEqual(PIL_TO_MIME['JP2'], 'image/jp2')

def test_assert_file_grp_cardinality(self):
    """
    A mismatching group count raises ``AssertionError`` with a descriptive
    message; a matching count passes silently.
    """
    mismatches = (
        (5, "Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' has 2"),
        (1, "Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"),
    )
    for wanted, message_pattern in mismatches:
        with self.assertRaisesRegex(AssertionError, message_pattern):
            assert_file_grp_cardinality('FOO,BAR', wanted)
    # Two groups requested, two given: must not raise.
    assert_file_grp_cardinality('FOO,BAR', 2)

def test_mock_file(self):
    """The mocked file reports the ``fileGrp`` it was constructed with."""
    mocked = MockOcrdFile(None, ID="MAX_0012", fileGrp='MAX')
    self.assertEqual(mocked.fileGrp, 'MAX')

def test_make_file_id_simple(self):
    """An ID containing its fileGrp name has that name swapped for the output group."""
    source_file = MockOcrdFile(None, ID="MAX_0012", fileGrp='MAX')
    derived_id = make_file_id(source_file, 'FOO')
    self.assertEqual(derived_id, 'FOO_0012')

def test_make_file_id_mets(self):
    """
    Exercise ``make_file_id`` against files registered in an ``OcrdMets``,
    covering both the fileGrp substitution and the counter-based fallback.
    The assertions depend on the order in which the METS is modified.
    """
    mets = OcrdMets.empty_mets()
    # Register IDs FOO_0001..FOO_0009 in fileGrp FOO and BAR_0001..BAR_0009 in fileGrp BAR.
    for i in range(1, 10):
        mets.add_file('FOO', ID="FOO_%04d" % (i), mimetype="image/tiff")
        mets.add_file('BAR', ID="BAR_%04d" % (i), mimetype="image/tiff")
    # The ID contains its fileGrp name 'BAR', which is replaced by 'FOO'.
    self.assertEqual(make_file_id(mets.find_files(ID='BAR_0007')[0], 'FOO'), 'FOO_0007')
    # 'BAR_7' lives in fileGrp 'ABC', so its ID does not contain the fileGrp name
    # and the counter fallback applies; FOO_0001..FOO_0009 are taken, so the
    # first free ID is expected to be FOO_0010.
    f = mets.add_file('ABC', ID="BAR_7", mimetype="image/tiff")
    self.assertEqual(make_file_id(f, 'FOO'), 'FOO_0010')
    # With the FOO files removed there is no conflict any more.
    mets.remove_file(fileGrp='FOO')
    self.assertEqual(make_file_id(f, 'FOO'), 'FOO_0001')
    # Re-adding FOO_0001 creates a conflict again, so the counter moves on by one.
    mets.add_file('FOO', ID="FOO_0001", mimetype="image/tiff")
    # print('\n'.join(['%s' % of for of in mets.find_files()]))
    self.assertEqual(make_file_id(f, 'FOO'), 'FOO_0002')


if __name__ == '__main__':
main()
main(__file__)