Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Processor helpers #530

Merged
merged 6 commits into from
Jul 13, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ocrd_models/ocrd_models/ocrd_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def __str__(self):
# return 'OcrdFile[' + '\n\t' + props + '\n\t]'
props = ', '.join([
'='.join([k, getattr(self, k) if getattr(self, k) else '---'])
for k in ['mimetype', 'ID', 'url', 'local_filename']
for k in ['fileGrp', 'ID', 'mimetype', 'url', 'local_filename']
])
return '<OcrdFile ' + props + ']/> '

Expand Down
12 changes: 9 additions & 3 deletions ocrd_utils/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,15 +829,21 @@ def make_file_id(ocrd_file, output_file_grp):
and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
(as a fallback counter).
(as a fallback counter). Increment counter until there is no more ID conflict.
"""
ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
if ret == ocrd_file.ID:
m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
if m:
n = m.group(1)
else:
files = ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)
n = files.index(ocrd_file)
ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
try:
n = ids.index(ocrd_file.ID)
except ValueError:
n = len(ids)
kba marked this conversation as resolved.
Show resolved Hide resolved
ret = concat_padded(output_file_grp, n)
while ocrd_file.mets.find_files(ID=ret):
n += 1
ret = concat_padded(output_file_grp, n)
kba marked this conversation as resolved.
Show resolved Hide resolved
return ret
28 changes: 20 additions & 8 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
MIME_TO_PIL, PIL_TO_MIME,
)
from ocrd_models.utils import xmllint_format
from ocrd_models import OcrdFile
from ocrd_models import OcrdFile, OcrdMets

class MockOcrdFile(OcrdFile):
"""
Expand All @@ -53,10 +53,10 @@ def fileGrp(self):
@fileGrp.setter
def fileGrp(self, fileGrp):
self.__filegrp = fileGrp
def __init__(self, *args, fileGrp=None, **kwargs):
def __init__(self, *args, fileGrp=None, ocrd_mets=None, **kwargs):
super(MockOcrdFile, self).__init__(*args, **kwargs)
if fileGrp:
self.fileGrp = fileGrp
self.fileGrp = fileGrp if fileGrp else None
self.ocrd_mets = ocrd_mets if ocrd_mets else None

class TestUtils(TestCase):

Expand Down Expand Up @@ -242,10 +242,22 @@ def test_mock_file(self):
f = MockOcrdFile(None, ID="MAX_0012", fileGrp='MAX')
self.assertEqual(f.fileGrp, 'MAX')

def test_make_file_id(self):
self.assertEqual(make_file_id(MockOcrdFile(None, ID="MAX_0012", fileGrp='MAX'), 'FOO', 0), 'FOO_0012')
self.assertEqual(make_file_id(MockOcrdFile(None, ID="MAX_0012", fileGrp='BAR'), 'FOO', 0), 'FOO_0012')
self.assertEqual(make_file_id(MockOcrdFile(None, ID="MAXMAXMAX", fileGrp='BAR'), 'FOO', 11), 'FOO_0012')
def test_make_file_id_simple(self):
    """
    ``make_file_id`` on a file whose ID contains its own fileGrp name.

    The mock file has ID ``MAX_0012`` in fileGrp ``MAX``; asking for an ID
    in output group ``FOO`` is expected to yield ``FOO_0012``.
    """
    source_file = MockOcrdFile(None, ID="MAX_0012", fileGrp='MAX')
    derived_id = make_file_id(source_file, 'FOO')
    self.assertEqual(derived_id, 'FOO_0012')

def test_make_file_id_mets(self):
    """
    ``make_file_id`` on files registered in an ``OcrdMets`` document.

    Scenario, in order:

    1. Fill an empty METS with ``FOO_0001``..``FOO_0009`` and
       ``BAR_0001``..``BAR_0009`` (all ``image/tiff``).
    2. ``BAR_0007`` mapped to output group ``FOO`` is expected to give
       ``FOO_0007``.
    3. A file with ID ``BAR_7`` in fileGrp ``ABC`` (its ID does not contain
       its fileGrp name) is expected to give ``FOO_0010`` while the nine
       ``FOO`` files exist.
    4. After removing fileGrp ``FOO``, the same file is expected to give
       ``FOO_0001``.
    5. After re-adding ``FOO_0001``, it is expected to give ``FOO_0002``.
    """
    document = OcrdMets.empty_mets()
    for page_no in range(1, 10):
        # Same insertion order as before: FOO first, then BAR, per index.
        document.add_file('FOO', ID="FOO_%04d" % (page_no), mimetype="image/tiff")
        document.add_file('BAR', ID="BAR_%04d" % (page_no), mimetype="image/tiff")

    seventh_bar = document.find_files(ID='BAR_0007')[0]
    self.assertEqual(make_file_id(seventh_bar, 'FOO'), 'FOO_0007')

    # File whose ID does not contain the name of its own fileGrp ('ABC').
    stray_file = document.add_file('ABC', ID="BAR_7", mimetype="image/tiff")
    self.assertEqual(make_file_id(stray_file, 'FOO'), 'FOO_0010')

    # With the FOO group removed, none of the FOO_* IDs are registered.
    document.remove_file(fileGrp='FOO')
    self.assertEqual(make_file_id(stray_file, 'FOO'), 'FOO_0001')

    # Re-register FOO_0001; the next expected result moves on to FOO_0002.
    document.add_file('FOO', ID="FOO_0001", mimetype="image/tiff")
    self.assertEqual(make_file_id(stray_file, 'FOO'), 'FOO_0002')


if __name__ == '__main__':
Expand Down