qurator-spk · vahidrezanezhad · Oct 22, 2020 · Oct 15, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*.egg-info
+__pycache__
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "repo/assets"]
+	path = repo/assets
+	url = https://github.com/OCR-D/assets
diff --git a/Makefile b/Makefile
@@ -0,0 +1,4 @@
+all: install
+
+install:
+	pip install .
diff --git a/README.md b/README.md
@@ -1,18 +1,30 @@
 # Binarization
+
 > Binarization for document images
 
 ## Introduction
-This tool performs document image binarization (i.e. transform colour/grayscale to black-and-white pixels) for OCR using multiple trained models.
+
+This tool performs document image binarization (i.e. transform colour/grayscale
+to black-and-white pixels) for OCR using multiple trained models.
 
 ## Installation
+
 Clone the repository, enter it and run  
-`./make`
+
+`pip install .`
 
 ### Models
+
 Pre-trained models can be downloaded from here:   
+
 https://qurator-data.de/sbb_binarization/
 
 ## Usage 
-`sbb_binarize -m <directory with models> -i <image file> 
--p <set to true to let the model see the image divided into patches> 
--s <directory where the results will be saved>`
+
+```sh
+sbb_binarize \
+  -m <directory with models> \
+  -i <image file> \
+  -p <set to true to let the model see the image divided into patches> \
+  -s <directory where the results will be saved>
+```
diff --git a/ocrd-tool.json b/ocrd-tool.json
@@ -0,0 +1 @@
+sbb_binarize/ocrd-tool.json
diff --git a/repo/assets b/repo/assets
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,6 @@
+numpy >= 1.17.0, < 1.19.0
+setuptools >= 41
+opencv-python-headless
+ocrd >= 2.18.0
+keras >= 2.3.1, < 2.4
+tensorflow >= 1.15, < 1.16
diff --git a/sbb_binarize/cli.py b/sbb_binarize/cli.py
@@ -0,0 +1,28 @@
+"""
+sbb_binarize CLI
+"""
+
+from argparse import ArgumentParser
+
+from .sbb_binarize import SbbBinarizer
+
+def main():
+    parser = ArgumentParser()
+
+    parser.add_argument('-i', '--image', default=None, help='image.')
+    parser.add_argument('-p', '--patches', default=False, help='by setting this parameter to true you let the model to see the image in patches.')
+    parser.add_argument('-s', '--save', default=False, help='save prediction with a given name here. The name and format should be given (outputname.tif).')
+    parser.add_argument('-m', '--model', default=None, help='models directory.')
+
+    options = parser.parse_args()
+
+    binarizer = SbbBinarizer(
+            image_path=options.image,
+            model=options.model,
+            patches=options.patches,
+            save=options.save
+    )
+    binarizer.run()
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not when imported
+    main()
diff --git a/sbb_binarize/ocrd-tool.json b/sbb_binarize/ocrd-tool.json
@@ -0,0 +1,32 @@
+{
+  "version": "0.0.1",
+  "git_url": "https://github.com/qurator-spk/sbb_binarization",
+  "tools": {
+    "ocrd-sbb-binarize": {
+      "executable": "ocrd-sbb-binarize",
+      "description": "Smart binarization with sbb_binarization",
+      "categories": ["Image preprocessing"],
+      "steps": ["preprocessing/optimization/binarization"],
+      "input_file_grp": [],
+      "output_file_grp": [],
+      "parameters": {
+        "operation_level": {
+          "type": "string",
+          "enum": ["page", "region", "line"],
+          "default": "page",
+          "description": "PAGE XML hierarchy level to operate on"
+        },
+        "patches": {
+          "description": "Set this parameter to true to let the model see the image in patches.",
+          "type": "boolean",
+          "default": false
+        },
+        "model": {
+          "description": "models directory.",
+          "type": "string",
+          "required": true
+        }
+      }
+    }
+  }
+}
diff --git a/sbb_binarize/ocrd_cli.py b/sbb_binarize/ocrd_cli.py
@@ -0,0 +1,129 @@
+# TODO: AlternativeImage 'binarized' comment should be additive
+
+import os.path
+from pkg_resources import resource_string
+from json import loads
+
+from PIL import Image
+import numpy as np
+import cv2
+from click import command
+
+from ocrd_utils import (
+    getLogger,
+    assert_file_grp_cardinality,
+    make_file_id,
+    MIMETYPE_PAGE
+)
+from ocrd import Processor
+from ocrd_modelfactory import page_from_file
+from ocrd_models.ocrd_page import AlternativeImageType, to_xml
+from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
+
+from .sbb_binarize import SbbBinarizer
+
+OCRD_TOOL = loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))  # parsed content of the packaged ocrd-tool.json
+TOOL = 'ocrd-sbb-binarize'  # key of this processor's entry in OCRD_TOOL['tools']
+
+def cv2pil(img):
+    color_coverted = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
+    return Image.fromarray(color_coverted)
+
+def pil2cv(img):
+    # from ocrd/workspace.py
+    color_conversion = cv2.COLOR_GRAY2BGR if img.mode in ('1', 'L') else  cv2.COLOR_RGB2BGR
+    pil_as_np_array = np.array(img).astype('uint8') if img.mode == '1' else np.array(img)
+    return cv2.cvtColor(pil_as_np_array, color_conversion)
+
+class SbbBinarizeProcessor(Processor):
+
+    def __init__(self, *args, **kwargs):
+        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
+        kwargs['version'] = OCRD_TOOL['version']
+        super().__init__(*args, **kwargs)
+
+    def _run_binarizer(self, img):
+        return cv2pil(
+                SbbBinarizer(
+                    image=pil2cv(img),
+                    model=self.model_path,
+                    patches=self.use_patches,
+                    save=None).run())
+
+    def process(self):
+        """
+        Binarize with sbb_binarization
+        """
+        LOG = getLogger('processor.SbbBinarize')
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
+        oplevel = self.parameter['operation_level']
+        self.use_patches = self.parameter['patches'] # pylint: disable=attribute-defined-outside-init
+        self.model_path = self.parameter['model'] # pylint: disable=attribute-defined-outside-init
+
+        for n, input_file in enumerate(self.input_files):
+            file_id = make_file_id(input_file, self.output_file_grp)
+            page_id = input_file.pageId or input_file.ID
+            LOG.info("INPUT FILE %i / %s", n, page_id)
+            pcgts = page_from_file(self.workspace.download_file(input_file))
+            self.add_metadata(pcgts)
+            pcgts.set_pcGtsId(file_id)
+            page = pcgts.get_Page()
+
+            if oplevel == 'page':
+                LOG.info("Binarizing on 'page' level in page '%s'", page_id)
+                page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id)
+                bin_image = self._run_binarizer(page_image)
+                # update METS (add the image file):
+                bin_image_path = self.workspace.save_image_file(bin_image,
+                        file_id + '.IMG-BIN',
+                        page_id=page_id,
+                        file_grp=self.output_file_grp)
+                page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comment="binarized"))
+
+            else:
+                regions = page.get_AllRegions(['Text', 'Table'])
+                if not regions:
+                    LOG.warning("Page '%s' contains no text/table regions", page_id)
+
+                for region in regions:
+                    region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)
+
+                    if oplevel == 'region':
+                        region_image_bin = self._run_binarizer(region_image)
+                        region_image_bin_path = self.workspace.save_image_file(
+                                region_image_bin,
+                                "%s_%s.IMG-BIN" % (file_id, region.id),
+                                page_id=page_id,
+                                file_grp=self.output_file_grp)
+                        region.add_AlternativeImage(
+                            AlternativeImageType(filename=region_image_bin_path, comments='binarized'))
+
+                    elif oplevel == 'line':
+                        lines = region.get_TextLine()
+                        if not lines:
+                            LOG.warning("Page '%s' region '%s' contains no text lines", page_id, region.id)
+                        for line in lines:
+                            line_image, line_xywh = self.workspace.image_from_segment(line, page_image, page_xywh)
+                            line_image_bin = self._run_binarizer(line_image)
+                            line_image_bin_path = self.workspace.save_image_file(
+                                    line_image_bin,
+                                    "%s_%s_%s.IMG-BIN" % (file_id, region.id, line.id),
+                                    page_id=page_id,
+                                    file_grp=self.output_file_grp)
+                            line.add_AlternativeImage(
+                                AlternativeImageType(filename=line_image_bin_path, comments='binarized'))
+
+            self.workspace.add_file(
+                ID=file_id,
+                file_grp=self.output_file_grp,
+                pageId=input_file.pageId,
+                mimetype=MIMETYPE_PAGE,
+                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
+                content=to_xml(pcgts))
+
+@command()
+@ocrd_cli_options
+def cli(*args, **kwargs):  # click entry point: passes all CLI arguments on to the OCR-D processor wrapper
+    return ocrd_cli_wrap_processor(SbbBinarizeProcessor, *args, **kwargs)