Let TableExtractorML inherit TreeExtractor to use its updated parse() (#106)

HiromuHota · web-flow · commit 862391a90534 · 2020-11-02T22:33:13.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   ([#102](https://github.com/HazyResearch/pdftotree/pull/102), [@HiromuHota][HiromuHota])
 - Use sys.maxsize not to cause "OverflowError: cannot convert float infinity to integer".
   ([#104](https://github.com/HazyResearch/pdftotree/issues/104), [@HiromuHota][HiromuHota])
+- Let TableExtractorML inherit from TreeExtractor so that it uses the updated parse().
+  ([#105](https://github.com/HazyResearch/pdftotree/issues/105), [@HiromuHota][HiromuHota])
 
 ## 0.5.0 - 2020-10-13
 
diff --git a/pdftotree/ml/TableExtractML.py b/pdftotree/ml/TableExtractML.py
@@ -1,128 +1,30 @@
 import logging
-from builtins import filter, object, range, str
 
 import numpy as np
-from pdfminer.utils import Plane
 from wand.color import Color
 from wand.drawing import Drawing
 
 from pdftotree.ml.features import get_alignment_features, get_lines_features
-from pdftotree.utils.bbox_utils import compute_iou, get_rectangles
+from pdftotree.TreeExtract import TreeExtractor
+from pdftotree.utils.bbox_utils import compute_iou
 from pdftotree.utils.display_utils import pdf_to_img
-from pdftotree.utils.lines_utils import (
-    extend_horizontal_lines,
-    extend_vertical_lines,
-    get_vertical_and_horizontal,
-    merge_horizontal_lines,
-    merge_vertical_lines,
-    reorder_lines,
-)
-from pdftotree.utils.pdf.pdf_parsers import parse_layout
-from pdftotree.utils.pdf.pdf_utils import analyze_pages, normalize_pdf
 
 logger = logging.getLogger(__name__)
 
 
-class TableExtractorML(object):
+class TableExtractorML(TreeExtractor):
     """
     Object to extract tables regions from pdf files
     """
 
     def __init__(self, pdf_file):
-        self.pdf_file = pdf_file
-        self.elems = {}
-        self.font_stats = {}
+        super().__init__(pdf_file)
         self.lines_bboxes = []
         self.alignments_bboxes = []
         self.intersection_bboxes = []
         self.bboxes = []
         self.candidates = []
         self.features = []
-        self.iou_thresh = 0.8
-        self.scanned = False
-
-    def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
-        plane = Plane(page_bbox)
-        plane.extend(boxes)
-        cid2obj = [set([i]) for i in range(len(boxes))]  # initialize clusters
-        # default object map to cluster with its own index
-        obj2cid = list(range(len(boxes)))
-        prev_clusters = obj2cid
-        while True:
-            for i1, b1 in enumerate(boxes):
-                for i2, b2 in enumerate(boxes):
-                    box1 = b1.bbox
-                    box2 = b2.bbox
-                    if (
-                        box1[0] == box2[0]
-                        and box1[2] == box2[2]
-                        and round(box1[3]) == round(box2[1])
-                    ):
-                        min_i = min(i1, i2)
-                        max_i = max(i1, i2)
-                        cid1 = obj2cid[min_i]
-                        cid2 = obj2cid[max_i]
-                        for obj_iter in cid2obj[cid2]:
-                            cid2obj[cid1].add(obj_iter)
-                            obj2cid[obj_iter] = cid1
-                        cid2obj[cid2] = set()
-            if prev_clusters == obj2cid:
-                break
-            prev_clusters = obj2cid
-        clusters = [[boxes[i] for i in cluster] for cluster in filter(bool, cid2obj)]
-        if (
-            len(clusters) == 1
-            and clusters[0][0].bbox[0] < -0.0
-            and clusters[0][0].bbox[1] <= 0
-            and abs(clusters[0][0].bbox[2] - page_width) <= 5
-            and abs(clusters[0][0].bbox[3] - page_height) <= 5
-        ):
-            return True
-        return False
-
-    def parse(self):
-        is_scanned = False
-        lin_seg_present = False
-        for page_num, layout in enumerate(analyze_pages(self.pdf_file)):
-            page_num += 1  # indexes start at 1
-            elems, font_stat = normalize_pdf(layout, scaler=1)
-            self.elems[page_num] = elems
-            self.font_stats[page_num] = font_stat
-            # code to detect if the page is scanned
-            if len(elems.segments) > 0:
-                lin_seg_present = True
-            for fig in elems.figures:
-                if (
-                    fig.bbox[0] <= 0.0
-                    and fig.bbox[1] <= 0.0
-                    and round(fig.bbox[2]) == round(elems.layout.width)
-                    and round(fig.bbox[3]) == round(elems.layout.height)
-                ):
-                    logger.debug(
-                        "{} is scanned because of full-page figure.".format(
-                            self.pdf_file
-                        )
-                    )
-                    is_scanned = True
-            page_scanned = self.identify_scanned_page(
-                elems.figures,
-                elems.layout.bbox,
-                elems.layout.width,
-                elems.layout.height,
-            )
-            # doc is scanned if any page is scanned
-            if page_scanned:
-                logger.debug(
-                    "{} is scanned one of its pages is scanned.".format(self.pdf_file)
-                )
-                is_scanned = True
-        if is_scanned or not lin_seg_present:
-            self.scanned = True
-
-    def get_scanned(self):
-        if len(self.elems) == 0:
-            self.parse()
-        return self.scanned
 
     def get_candidates(self):
         if len(self.elems) == 0:
@@ -181,43 +83,6 @@ def get_candidates_and_features_page_num(self, page_num):
         )
         return boxes, features
 
-    def get_candidates_lines(self, page_num, elems):
-        page_width, page_height = int(elems.layout.width), int(elems.layout.height)
-        lines = reorder_lines(elems.segments)
-        vertical_lines, horizontal_lines = get_vertical_and_horizontal(lines)
-        extended_vertical_lines = extend_vertical_lines(horizontal_lines)
-        extended_horizontal_lines = extend_horizontal_lines(vertical_lines)
-        vertical_lines = merge_vertical_lines(
-            sorted(extended_vertical_lines + vertical_lines)
-        )
-        horizontal_lines = merge_horizontal_lines(
-            sorted(extended_horizontal_lines + horizontal_lines)
-        )
-        rectangles = get_rectangles(sorted(vertical_lines), sorted(horizontal_lines))
-        return [(page_num, page_width, page_height) + bbox for bbox in rectangles]
-
-    def get_candidates_alignments(self, page_num, elems):
-        page_width, page_height = int(elems.layout.width), int(elems.layout.height)
-        font_stat = self.font_stats[page_num]
-        try:
-            nodes, features = parse_layout(elems, font_stat)
-        except Exception:
-            nodes, features = [], []
-        return (
-            [
-                (page_num, page_width, page_height)
-                + (node.y0, node.x0, node.y1, node.x1)
-                for node in nodes
-            ],
-            features,
-        )
-
-    def get_elems(self):
-        return self.elems
-
-    def get_font_stats(self):
-        return self.font_stats
-
     def get_labels(self, gt_tables):
         """
         :param gt_tables: dict, keys are page number and values are list of