Add the possibility to use bounding rectangles instead of polygons

OCR4all · Sep 16, 2021 · de67b3b
1 parent ddd8fe0
commit de67b3b
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 17 deletions.
diff --git a/ocr4all_helper_scripts/cli/pagelineseg.py b/ocr4all_helper_scripts/cli/pagelineseg.py
@@ -17,7 +17,7 @@
                    "ImageRegions overlap with TextRegions.")
 @click.option("--minscale", type=float, default=12.0,
               help="Minimum scale permitted.")
-@click.option("--maxlines", type=float, default=300,
+@click.option("--maxlines", type=int, default=300,
               help="Maximum number of lines permitted.")
 @click.option("--threshold", type=float, default=0.2,
               help="Baseline threshold.")
@@ -59,11 +59,14 @@
               help="Widen black separators (to account for warping).")
 @click.option("--max-whiteseps", type=int, default=-1,
               help="Maximum amount of whitespace column separators.")
-@click.option("--minheight-whiteseps", type=float, default=10,
+@click.option("--minheight-whiteseps", type=int, default=10,
               help="Minimum column height (units=scale).")
-def pagelineseg_cli(dataset, remove_images, minscale, maxlines, threshold, usegauss, scale, hscale, vscale,
-                    filter_strength, maxskew, skewsteps, parallel, smear_x, smear_y, growth_x, growth_y, fail_save,
-                    max_blackseps, widen_blackseps, max_whiteseps, minheight_whiteseps):
+@click.option("--bounding-rectangle", is_flag=True, default=False, help="Uses bounding rectangles instead of polygons.")
+def pagelineseg_cli(dataset: str, remove_images: bool, minscale: float, maxlines: int, threshold: float,
+                    usegauss: bool, scale: float, hscale: float, vscale: float, filter_strength: float, maxskew: float,
+                    skewsteps: int, parallel: int, smear_x: float, smear_y: float, growth_x: float, growth_y: float,
+                    fail_save: int, max_blackseps: int, widen_blackseps: int, max_whiteseps: int,
+                    minheight_whiteseps: int, bounding_rectangle: bool):
     with Path(dataset).open('r') as data_file:
         dataset = json.load(data_file)
 
@@ -94,7 +97,8 @@ def parallel_proc(data):
                                                     maxskew=maxskew,
                                                     skewsteps=skewsteps,
                                                     usegauss=usegauss,
-                                                    remove_images=remove_images)
+                                                    remove_images=remove_images,
+                                                    bounding_box=bounding_rectangle)
 
         with Path(path_out).open("w+") as output_file:
             pagelineseg_helper.s_print(f"Save annotations into '{path_out}'")

diff --git a/ocr4all_helper_scripts/helpers/pagelineseg_helper.py b/ocr4all_helper_scripts/helpers/pagelineseg_helper.py
@@ -42,8 +42,13 @@ def s_print_error(*objs):
     s_print("ERROR: ", *objs, file=sys.stderr)
 
 
-def compute_lines(segmentation: np.ndarray, smear_strength: Tuple[float, float], scale: int,
-                  growth: Tuple[float, float], max_iterations: int, filter_strength: float) -> List[Record]:
+def compute_lines(segmentation: np.ndarray,
+                  smear_strength: Tuple[float, float],
+                  scale: int,
+                  growth: Tuple[float, float],
+                  max_iterations: int,
+                  filter_strength: float,
+                  bounding_box: bool) -> List[Record]:
     """Given a line segmentation map, computes a list of tuples consisting of 2D slices and masked images.
     Implementation derived from ocropy with changes to allow extracting the line coords/polygons.
     """
@@ -62,7 +67,7 @@ def compute_lines(segmentation: np.ndarray, smear_strength: Tuple[float, float],
         result.label = i + 1
         result.bounds = o
         polygon = []
-        if ((segmentation[o] != 0) == (segmentation[o] != i + 1)).any():
+        if ((segmentation[o] != 0) == (segmentation[o] != i + 1)).any() and not bounding_box:
             ppoints = approximate_smear_polygon(mask, smear_strength, growth, max_iterations)
             ppoints = ppoints[1:] if ppoints else []
             polygon = [(o[1].start + x, o[0].start + y) for x, y in ppoints]
@@ -201,7 +206,8 @@ def segment(im: Image, scale: float = None, max_blackseps: int = 0, widen_blacks
             minheight_whiteseps: int = 10, filter_strength: float = 1.0,
             smear_strength: Tuple[float, float] = (1.0, 2.0), growth: Tuple[float, float] = (1.1, 1.1),
             orientation: int = 0, fail_save_iterations: int = 1000, vscale: float = 1.0, hscale: float = 1.0,
-            minscale: float = 12.0, maxlines: int = 300, threshold: float = 0.2, usegauss: bool = False):
+            minscale: float = 12.0, maxlines: int = 300, threshold: float = 0.2, usegauss: bool = False,
+            bounding_box: bool = False):
     """
     Segments a page into text lines.
     Segments a page into text lines and returns the absolute coordinates of
@@ -254,7 +260,8 @@ def segment(im: Image, scale: float = None, max_blackseps: int = 0, widen_blacks
                                        scale,
                                        growth,
                                        fail_save_iterations,
-                                       filter_strength)
+                                       filter_strength,
+                                       bounding_box)
 
     # Translate each point back to original
     delta_x = (im_rotated.width - im.width) / 2
@@ -286,7 +293,7 @@ def pagelineseg(xmlfile: str,
                 widen_blackseps: int = 10,
                 max_whiteseps: int = -1,
                 minheight_whiteseps: int = 10,
-                minscale: int = 12,
+                minscale: float = 12.0,
                 maxlines: int = 300,
                 smear_strength: Tuple[float, float] = (1.0, 2.0),
                 growth: Tuple[float, float] = (1.1, 1.1),
@@ -295,7 +302,8 @@ def pagelineseg(xmlfile: str,
                 maxskew: float = 2.0,
                 skewsteps: int = 8,
                 usegauss: bool = False,
-                remove_images: bool = False):
+                remove_images: bool = False,
+                bounding_box: bool = False):
     name = Path(imgpath).name.split(".")[0]
     s_print(f"""Start process for '{name}'
         |- Image: '{imgpath}'
@@ -316,6 +324,8 @@ def pagelineseg(xmlfile: str,
     if remove_images:
         imageutils.remove_images(im, root)
 
+    pageutils.remove_existing_textlines(root)
+
     for coord_idx, coord in enumerate(sorted(coordmap)):
         coords = coordmap[coord]['coords']
 
@@ -345,12 +355,16 @@ def pagelineseg(xmlfile: str,
                                 max_whiteseps=max_whiteseps,
                                 minheight_whiteseps=minheight_whiteseps,
                                 filter_strength=filter_strength,
-                                smear_strength=smear_strength, growth=growth,
+                                smear_strength=smear_strength,
+                                growth=growth,
                                 orientation=orientation,
                                 fail_save_iterations=fail_save_iterations,
-                                vscale=vscale, hscale=hscale,
-                                minscale=minscale, maxlines=maxlines,
-                                usegauss=usegauss)
+                                vscale=vscale,
+                                hscale=hscale,
+                                minscale=minscale,
+                                maxlines=maxlines,
+                                usegauss=usegauss,
+                                bounding_box=bounding_box)
 
         else:
             lines = []

diff --git a/ocr4all_helper_scripts/utils/pageutils.py b/ocr4all_helper_scripts/utils/pageutils.py
@@ -40,3 +40,8 @@ def construct_coordmap(tree: etree.Element) -> dict:
             coordmap[region_id]["orientation"] = float(text_region.attrib["orientation"])
 
     return coordmap
+
+
+def remove_existing_textlines(tree: etree.Element):
+    for textline in tree.findall(".//{*}TextLine"):
+        textline.getparent().remove(textline)