put derived images under output fileGrp…

Instead of writing AlternativeImages under a separate fileGrp (named `OCR-D-IMG-*`), place them under the output fileGrp, alongside the PAGE-XML files. To keep their file IDs distinct from the PAGE-XML file IDs, append a suffix (named `.IMG-*`) to them.
bertsky · Aug 14, 2020 · 6f7f785 · 6f7f785
1 parent a7b2d16
commit 6f7f785
Show file tree

Hide file tree

Showing 8 changed files with 99 additions and 149 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.1.0] - 2020-08-14
+
+Changed:
+
+ * put derived images under output fileGrp, using file ID suffixes
+
 ## [0.0.5] - 2020-07-08
 
 Fixed:

diff --git a/ocrd_wrap/ocrd-tool.json b/ocrd_wrap/ocrd-tool.json
@@ -1,6 +1,6 @@
 {
     "git_url": "https://github.com/bertsky/ocrd_wrap",
-    "version": "0.0.5",
+    "version": "0.1.0",
     "tools": {
         "ocrd-preprocess-image": {
             "executable": "ocrd-preprocess-image",

diff --git a/ocrd_wrap/shell.py b/ocrd_wrap/shell.py
@@ -8,7 +8,9 @@
 
 from ocrd import Processor
 from ocrd_utils import (
-    getLogger, concat_padded,
+    getLogger,
+    make_file_id,
+    assert_file_grp_cardinality,
     MIMETYPE_PAGE,
     MIME_TO_PIL,
     MIME_TO_EXT
@@ -24,22 +26,13 @@
 
 TOOL = 'ocrd-preprocess-image'
 LOG = getLogger('processor.ShellPreprocessor')
-FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-PROC'
 
 class ShellPreprocessor(Processor):
 
     def __init__(self, *args, **kwargs):
         kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
         kwargs['version'] = OCRD_TOOL['version']
         super(ShellPreprocessor, self).__init__(*args, **kwargs)
-        if hasattr(self, 'output_file_grp'):
-            try:
-                self.page_grp, self.image_grp = self.output_file_grp.split(',')
-            except ValueError:
-                self.page_grp = self.output_file_grp
-                self.image_grp = FALLBACK_FILEGRP_IMG
-                LOG.info("No output file group for images specified, falling back to '%s'",
-                         FALLBACK_FILEGRP_IMG)
 
     def process(self):
         """Performs coords-preserving image operations via runtime shell calls anywhere.
@@ -68,9 +61,11 @@ def process(self):
         
         If the shell returns with a failure, skip that segment with an
         appropriate error message.
-        Otherwise, add the new image to the workspace with its fileGrp USE
-        attribute given in the second position of the output fileGrp, or
-        ``OCR-D-IMG-PROC``. Reference it as AlternativeImage in the element,
+        Otherwise, add the new image to the workspace under the
+        output fileGrp, using a file ID with suffix ``.IMG-``
+        and further identification of the input element.
+        
+        Reference it as AlternativeImage in the element,
         adding ``output_feature_added`` to its @comments.
         
         Produce a new PAGE output file by serialising the resulting hierarchy.
@@ -83,9 +78,11 @@ def process(self):
             raise Exception("command parameter requires @INFILE pattern")
         if '@OUTFILE' not in command:
             raise Exception("command parameter requires @OUTFILE pattern")
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
-            file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
+            file_id = make_file_id(input_file, self.output_file_grp)
             page_id = input_file.pageId or input_file.ID
             LOG.info("INPUT FILE %i / %s", n, page_id)
             pcgts = page_from_file(self.workspace.download_file(input_file))
@@ -158,17 +155,12 @@ def process(self):
                                                       "glyph '%s'" % glyph.id, None,
                                                       file_id + '_' + glyph.id)
 
-            # Use input_file's basename for the new file -
-            # this way the files retain the same basenames:
-            file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.page_grp, n)
             self.workspace.add_file(
                 ID=file_id,
-                file_grp=self.page_grp,
+                file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(self.page_grp,
+                local_filename=os.path.join(self.output_file_grp,
                                             file_id + '.xml'),
                 content=to_xml(pcgts))
 
@@ -184,11 +176,11 @@ def _process_segment(self, segment, image, coords, where, page_id, file_id):
         in_fd, in_fname = mkstemp(suffix=file_id + MIME_TO_EXT[input_mime])
         image.save(in_fname, format=MIME_TO_PIL[input_mime])
         # prepare output file name
-        out_id = file_id + '_' + feature_added
-        out_fname = os.path.join(self.image_grp,
+        out_id = file_id + '.IMG-' + feature_added.upper()
+        out_fname = os.path.join(self.output_file_grp,
                                  out_id + MIME_TO_EXT[output_mime])
-        if not os.path.exists(self.image_grp):
-            makedirs(self.image_grp)
+        if not os.path.exists(self.output_file_grp):
+            makedirs(self.output_file_grp)
         # remove quotation around filename patterns, if any
         command = command.replace('"@INFILE"', '@INFILE')
         command = command.replace('"@OUTFILE"', '@OUTFILE')
@@ -226,10 +218,10 @@ def _process_segment(self, segment, image, coords, where, page_id, file_id):
         self.workspace.add_file(
             ID=out_id,
             local_filename=out_fname,
-            file_grp=self.image_grp,
+            file_grp=self.output_file_grp,
             pageId=page_id,
             mimetype=output_mime)
         LOG.info("created file ID: %s, file_grp: %s, path: %s",
-                 out_id, self.image_grp, out_fname)
+                 out_id, self.output_file_grp, out_fname)
         segment.add_AlternativeImage(AlternativeImageType(
             filename=out_fname, comments=features))
diff --git a/ocrd_wrap/skimage_binarize.py b/ocrd_wrap/skimage_binarize.py
@@ -14,7 +14,9 @@
 
 from ocrd import Processor
 from ocrd_utils import (
-    getLogger, concat_padded,
+    getLogger,
+    make_file_id,
+    assert_file_grp_cardinality,
     MIMETYPE_PAGE
 )
 from ocrd_modelfactory import page_from_file
@@ -28,25 +30,16 @@
 
 TOOL = 'ocrd-skimage-binarize'
 LOG = getLogger('processor.SkimageBinarize')
-FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-BIN'
 
 class SkimageBinarize(Processor):
 
     def __init__(self, *args, **kwargs):
         kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
         kwargs['version'] = OCRD_TOOL['version']
         super(SkimageBinarize, self).__init__(*args, **kwargs)
-        if hasattr(self, 'output_file_grp'):
-            try:
-                self.page_grp, self.image_grp = self.output_file_grp.split(',')
-            except ValueError:
-                self.page_grp = self.output_file_grp
-                self.image_grp = FALLBACK_FILEGRP_IMG
-                LOG.info("No output file group for images specified, falling back to '%s'",
-                         FALLBACK_FILEGRP_IMG)
 
     def process(self):
-        """Performs binarization of segment or page images with Skimage on the workspace.
+        """Performs binarization of segment or page images with scikit-image on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         then iterate over the element hierarchy down to the requested
@@ -59,19 +52,19 @@ def process(self):
         
         Next, binarize the image according to ``method`` with skimage.
         
-        Then write the new image to the workspace with the fileGrp USE given
-        in the second position of the output fileGrp, or ``OCR-D-IMG-BIN``,
-        and an ID based on input file and input element.
+        Then write the new image to the workspace under the output fileGrp,
+        using a file ID with suffix ``.IMG-BIN`` plus further identification
+        of the input element.
         
         Produce a new PAGE output file by serialising the resulting hierarchy.
         """
         oplevel = self.parameter['level-of-operation']
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
+            file_id = make_file_id(input_file, self.output_file_grp)
             page_id = input_file.pageId or input_file.ID
-            file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.image_grp, n)
             LOG.info("INPUT FILE %i / %s", n, page_id)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
@@ -114,7 +107,7 @@ def odd(n):
                 if oplevel == 'page':
                     self._process_segment(page, page_image, page_coords,
                                           "page '%s'" % page_id, input_file.pageId,
-                                          file_id)
+                                          file_id + '.IMG-BIN')
                     continue
                 regions = page.get_AllRegions(classes=['Text'])
                 if not regions:
@@ -125,7 +118,7 @@ def odd(n):
                     if oplevel == 'region':
                         self._process_segment(region, region_image, region_coords,
                                               "region '%s'" % region.id, None,
-                                              file_id + '_' + region.id)
+                                              file_id + '.IMG-BIN_' + region.id)
                         continue
                     lines = region.get_TextLine()
                     if not lines:
@@ -136,7 +129,7 @@ def odd(n):
                         if oplevel == 'line':
                             self._process_segment(line, line_image, line_coords,
                                                   "line '%s'" % line.id, None,
-                                                  file_id + '_' + line.id)
+                                                  file_id + '.IMG-BIN_' + line.id)
                             continue
                         words = line.get_Word()
                         if not words:
@@ -147,7 +140,7 @@ def odd(n):
                             if oplevel == 'word':
                                 self._process_segment(word, word_image, word_coords,
                                                       "word '%s'" % word.id, None,
-                                                      file_id + '_' + word.id)
+                                                      file_id + '.IMG-BIN_' + word.id)
                                 continue
                             glyphs = word.get_Glyph()
                             if not glyphs:
@@ -157,19 +150,14 @@ def odd(n):
                                     glyph, word_image, word_coords, feature_filter='binarized')
                                 self._process_segment(glyph, glyph_image, glyph_coords,
                                                       "glyph '%s'" % glyph.id, None,
-                                                      file_id + '_' + glyph.id)
+                                                      file_id + '.IMG-BIN_' + glyph.id)
 
-            # Use input_file's basename for the new file -
-            # this way the files retain the same basenames:
-            file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.page_grp, n)
             self.workspace.add_file(
                 ID=file_id,
-                file_grp=self.page_grp,
+                file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(self.page_grp,
+                local_filename=os.path.join(self.output_file_grp,
                                             file_id + '.xml'),
                 content=to_xml(pcgts))
 
@@ -196,7 +184,7 @@ def _process_segment(self, segment, image, coords, where, page_id, file_id):
         file_path = self.workspace.save_image_file(
             image,
             file_id,
-            file_grp=self.image_grp,
+            file_grp=self.output_file_grp,
             page_id=page_id)
         segment.add_AlternativeImage(AlternativeImageType(
             filename=file_path, comments=features))

diff --git a/ocrd_wrap/skimage_denoise.py b/ocrd_wrap/skimage_denoise.py
@@ -10,7 +10,9 @@
 
 from ocrd import Processor
 from ocrd_utils import (
-    getLogger, concat_padded,
+    getLogger,
+    make_file_id,
+    assert_file_grp_cardinality,
     MIMETYPE_PAGE
 )
 from ocrd_modelfactory import page_from_file
@@ -24,25 +26,16 @@
 
 TOOL = 'ocrd-skimage-denoise'
 LOG = getLogger('processor.SkimageDenoise')
-FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DEN'
 
 class SkimageDenoise(Processor):
 
     def __init__(self, *args, **kwargs):
         kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
         kwargs['version'] = OCRD_TOOL['version']
         super(SkimageDenoise, self).__init__(*args, **kwargs)
-        if hasattr(self, 'output_file_grp'):
-            try:
-                self.page_grp, self.image_grp = self.output_file_grp.split(',')
-            except ValueError:
-                self.page_grp = self.output_file_grp
-                self.image_grp = FALLBACK_FILEGRP_IMG
-                LOG.info("No output file group for images specified, falling back to '%s'",
-                         FALLBACK_FILEGRP_IMG)
 
     def process(self):
-        """Performs binary denoising of segment or page images with Skimage on the workspace.
+        """Performs binary denoising of segment or page images with scikit-image on the workspace.
         
         Open and deserialize PAGE input files and their respective images,
         then iterate over the element hierarchy down to the requested
@@ -56,19 +49,19 @@ def process(self):
         Next, denoise the image by removing too small connected components
         with skimage.
         
-        Then write the new image to the workspace with the fileGrp USE given
-        in the second position of the output fileGrp, or ``OCR-D-IMG-DEN``,
-        and an ID based on input file and input element.
+        Then write the new image to the workspace under the output fileGrp,
+        using a file ID with suffix ``.IMG-DEN`` plus further identification
+        of the input element.
         
         Produce a new PAGE output file by serialising the resulting hierarchy.
         """
         oplevel = self.parameter['level-of-operation']
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
+            file_id = make_file_id(input_file, self.output_file_grp)
             page_id = input_file.pageId or input_file.ID
-            file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.image_grp, n)
             LOG.info("INPUT FILE %i / %s", n, page_id)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
@@ -106,7 +99,7 @@ def process(self):
                 if oplevel == 'page':
                     self._process_segment(page, page_image, page_coords, maxsize,
                                           "page '%s'" % page_id, input_file.pageId,
-                                          file_id)
+                                          file_id + '.IMG-DEN')
                     continue
                 regions = page.get_AllRegions(classes=['Text'])
                 if not regions:
@@ -117,7 +110,7 @@ def process(self):
                     if oplevel == 'region':
                         self._process_segment(region, region_image, region_coords, maxsize,
                                               "region '%s'" % region.id, None,
-                                              file_id + '_' + region.id)
+                                              file_id + '.IMG-DEN_' + region.id)
                         continue
                     lines = region.get_TextLine()
                     if not lines:
@@ -128,7 +121,7 @@ def process(self):
                         if oplevel == 'line':
                             self._process_segment(line, line_image, line_coords, maxsize,
                                                   "line '%s'" % line.id, None,
-                                                  file_id + '_' + line.id)
+                                                  file_id + '.IMG-DEN_' + line.id)
                             continue
                         words = line.get_Word()
                         if not words:
@@ -139,7 +132,7 @@ def process(self):
                             if oplevel == 'word':
                                 self._process_segment(word, word_image, word_coords, maxsize,
                                                       "word '%s'" % word.id, None,
-                                                      file_id + '_' + word.id)
+                                                      file_id + '.IMG-DEN_' + word.id)
                                 continue
                             glyphs = word.get_Glyph()
                             if not glyphs:
@@ -149,19 +142,14 @@ def process(self):
                                     glyph, word_image, word_coords, feature_selector='binarized')
                                 self._process_segment(glyph, glyph_image, glyph_coords, maxsize,
                                                       "glyph '%s'" % glyph.id, None,
-                                                      file_id + '_' + glyph.id)
+                                                      file_id + '.IMG-DEN_' + glyph.id)
 
-            # Use input_file's basename for the new file -
-            # this way the files retain the same basenames:
-            file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.page_grp, n)
             self.workspace.add_file(
                 ID=file_id,
-                file_grp=self.page_grp,
+                file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(self.page_grp,
+                local_filename=os.path.join(self.output_file_grp,
                                             file_id + '.xml'),
                 content=to_xml(pcgts))
 
@@ -178,7 +166,7 @@ def _process_segment(self, segment, image, coords, maxsize, where, page_id, file
         file_path = self.workspace.save_image_file(
             image,
             file_id,
-            file_grp=self.image_grp,
+            file_grp=self.output_file_grp,
             page_id=page_id)
         segment.add_AlternativeImage(AlternativeImageType(
             filename=file_path, comments=features))