HazyResearch · HiromuHota · Oct 8, 2020 · Oct 6, 2020 · Oct 6, 2020 · Oct 7, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   ([#62](https://github.com/HazyResearch/pdftotree/pull/62))
 - [@HiromuHota][HiromuHota]: Greedily extract contents from PDF even if it looks scanned.
   ([#71](https://github.com/HazyResearch/pdftotree/pull/71))
+- [@HiromuHota][HiromuHota]: Extract LTChar objects even if they are not children of LTTextLine.
+  ([#79](https://github.com/HazyResearch/pdftotree/pull/79))
 
 ## 0.4.1 - 2020-09-21
 

diff --git a/pdftotree/TreeExtract.py b/pdftotree/TreeExtract.py
@@ -35,12 +35,6 @@ def __init__(self, pdf_file):
         self.pdf_file = pdf_file
         self.elems: Dict[int, PDFElems] = {}  # key represents page_num
         self.font_stats: Dict[int, Any] = {}  # key represents page_num
-        self.lines_bboxes = []
-        self.alignments_bboxes = []
-        self.intersection_bboxes = []
-        self.bboxes = []
-        self.candidates = []
-        self.features = []
         self.iou_thresh = 0.8
         self.scanned = False
         self.tree: Dict[int, Any] = {}  # key represents page_num
@@ -272,8 +266,11 @@ def get_html_tree(self) -> str:
             page = doc.createElement("div")
             page.setAttribute("class", "ocr_page")
             page.setAttribute("id", f"page_{page_num}")
+            width = int(self.elems[page_num].layout.width)
+            height = int(self.elems[page_num].layout.height)
             page.setAttribute(
-                "title", f"bbox 0 0 {int(pwidth)} {int(pheight)}; ppageno {page_num-1}"
+                "title",
+                f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
             )
             body.appendChild(page)
             # TODO: We need to detect columns and sort acccordingly.

diff --git a/pdftotree/utils/pdf/layout_utils.py b/pdftotree/utils/pdf/layout_utils.py
@@ -14,17 +14,6 @@
 from pdftotree.utils.pdf.vector_utils import inside, intersect
 
 
-def traverse_layout(root, callback):
-    """
-    Tree walker and invokes the callback as it
-    traverse pdf object tree
-    """
-    callback(root)
-    if isinstance(root, collections.Iterable):
-        for child in root:
-            traverse_layout(child, callback)
-
-
 def get_near_items(tree, tree_key):
     """
     Check both possible neighbors for key

diff --git a/pdftotree/utils/pdf/pdf_parsers.py b/pdftotree/utils/pdf/pdf_parsers.py
@@ -16,6 +16,7 @@
 from pdfminer.utils import Plane
 
 from pdftotree.utils.pdf.node import Node
+from pdftotree.utils.pdf.pdf_utils import PDFElems
 from pdftotree.utils.pdf.vector_utils import center, intersect, l1, xy_reading_order
 
 
@@ -723,13 +724,13 @@ def cluster_vertically_aligned_boxes(
         return tables, table_features
 
 
-def parse_tree_structure(elems, font_stat, page_num, ref_page_seen, tables):
+def parse_tree_structure(elems: PDFElems, font_stat, page_num, ref_page_seen, tables):
     boxes_segments = elems.segments
     boxes_curves = elems.curves
     boxes_figures = elems.figures
     page_width = elems.layout.width
     page_height = elems.layout.height
-    mentions = elems.mentions
+    mentions: List[LTTextLine] = elems.mentions
 
     avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
     width = get_page_width(mentions + boxes_segments + boxes_figures + boxes_curves)
@@ -762,7 +763,7 @@ def parse_tree_structure(elems, font_stat, page_num, ref_page_seen, tables):
     tables_page = tables
 
     # Eliminate tables from these boxes
-    boxes = []
+    boxes: List[LTTextLine] = []
     for idx1, box in enumerate(mentions):
         intersect = False
         for idx2, table in enumerate(tables_page):
@@ -798,7 +799,7 @@ def parse_tree_structure(elems, font_stat, page_num, ref_page_seen, tables):
 
 
 def extract_text_candidates(
-    boxes,
+    boxes: List[LTTextLine],
     page_bbox,
     avg_font_pts,
     width,
@@ -1282,7 +1283,7 @@ def get_page_width(boxes):
     return xmax - xmin
 
 
-def get_char_width(boxes):
+def get_char_width(boxes: List[LTTextLine]) -> float:
     log = logging.getLogger(__name__)
     box_len_sum = 0
     num_char_sum = 0

diff --git a/pdftotree/utils/pdf/pdf_utils.py b/pdftotree/utils/pdf/pdf_utils.py
@@ -11,17 +11,21 @@
 import re
 import string
 from collections import Counter
-from typing import Any, List, NamedTuple, Tuple, Union
+from typing import List, NamedTuple, Tuple, Union
 
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import (
     LAParams,
     LTAnno,
     LTChar,
     LTComponent,
+    LTContainer,
     LTCurve,
     LTFigure,
+    LTLayoutContainer,
     LTLine,
+    LTPage,
+    LTTextContainer,
     LTTextLine,
 )
 from pdfminer.pdfdocument import PDFDocument
@@ -31,7 +35,6 @@
 from pdfminer.utils import apply_matrix_pt
 
 from pdftotree.utils.img_utils import normalize_bbox, normalize_pts
-from pdftotree.utils.pdf.layout_utils import traverse_layout
 
 #  from pdftotree.utils.pdf.vector_utils import *
 
@@ -42,7 +45,7 @@ class PDFElems(NamedTuple):
     segments: List[LTLine]
     curves: List[LTCurve]
     figures: List[LTFigure]
-    layout: Any  # assigned to by PDFPageAggregator.get_result
+    layout: LTPage
     chars: List[Union[LTChar, LTAnno]]
 
 
@@ -150,25 +153,30 @@ def analyze_pages(file_name, char_margin=1.0):
             yield layout
 
 
-def normalize_pdf(layout, scaler) -> Tuple[PDFElems, Counter]:
+def normalize_pdf(layout: LTPage, scaler) -> Tuple[PDFElems, Counter]:
     """
     Normalizes pdf object coordinates (bot left) to image
     conventions (top left origin).
     Returns the list of chars and average char size
     """
     chars = []
-    mentions = []
+    mentions: List[LTTextContainer] = []
     height = scaler * layout.height
     font_size_counter = collections.Counter()
     # Lines longer than this are included in segments
     pts_thres = 2.0 * scaler
     segments = []
     curves = []
     figures = []
+    mention: LTTextContainer = None
+    _font = None
 
     def processor(m):
         # Normalizes the coordinate system to be consistent with
         # image library conventions (top left as origin)
+        if isinstance(m, LTContainer):
+            for child in m:
+                processor(child)
         if isinstance(m, LTComponent):
             m.set_bbox(normalize_bbox(m.bbox, height, scaler))
 
@@ -188,6 +196,28 @@ def processor(m):
 
             # Collect stats on the chars
             if isinstance(m, LTChar):
+                # Construct LTTextContainer from LTChar(s) outside of LTTextContainer
+                nonlocal _font
+                nonlocal mention
+                font = (m.fontname, m.size)
+                if font != _font:
+                    if _font is not None:
+                        mention.font_name, mention.font_size = _font_of_mention(mention)
+                        layout_container = LTLayoutContainer((0, 0, 0, 0))  # dummy bbox
+                        laparams = LAParams(
+                            char_margin=1.0, word_margin=0.1, detect_vertical=True
+                        )
+                        for textline in layout_container.group_objects(
+                            laparams, mention
+                        ):
+                            textline.font_name, textline.font_size = _font_of_mention(
+                                textline
+                            )
+                            mentions.append(textline)
+                    mention = LTTextContainer()
+                    _font = font
+                mention.add(m)
+
                 chars.append(m)
                 # fonts could be rotated 90/270 degrees
                 font_size = _font_size_of(m)
@@ -209,7 +239,7 @@ def processor(m):
         if isinstance(m, LTAnno):
             chars.append(m)
 
-    traverse_layout(layout, processor)
+    processor(layout)
 
     # Resets mention y0 to the first y0 of alphanum character instead of
     # considering exotic unicode symbols and sub/superscripts to reflect

diff --git a/tests/input/CentralSemiconductorCorp_2N4013.pdf b/tests/input/CentralSemiconductorCorp_2N4013.pdf
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -74,6 +74,23 @@ def get_bbox(node: Tag) -> box:
     assert all([figure.contains(word) for word in words])
 
 
+def test_LTChar_under_LTFigure():
+    """Test on a PDF where LTChar(s) are children of LTFigure."""
+    output = pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf")
+    soup = BeautifulSoup(output)
+    line: Tag = soup.find(class_="ocrx_line")
+    assert [word.text for word in line.find_all(class_="ocrx_word")] == [
+        "Small",
+        "Signal",
+        "Transistors",
+    ]
+
+    # The table on the 1st page should contain 18 columns
+    page = soup.find(class_="ocr_page")
+    table = page.find("table")
+    assert len(table.find("tr").find_all("td")) == 18
+
+
 def test_ml_completion():
     """Simply test that ML-based parse runs without errors."""
     output = pdftotree.parse(