Skip to content
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
([#62](https://github.com/HazyResearch/pdftotree/pull/62))
- [@HiromuHota][HiromuHota]: Greedily extract contents from PDF even if it looks scanned.
([#71](https://github.com/HazyResearch/pdftotree/pull/71))
- [@HiromuHota][HiromuHota]: Extract LTChar even if they are not children of LTTextLine.
([#79](https://github.com/HazyResearch/pdftotree/pull/79))

## 0.4.1 - 2020-09-21

Expand Down
11 changes: 4 additions & 7 deletions pdftotree/TreeExtract.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,6 @@ def __init__(self, pdf_file):
self.pdf_file = pdf_file
self.elems: Dict[int, PDFElems] = {} # key represents page_num
self.font_stats: Dict[int, Any] = {} # key represents page_num
self.lines_bboxes = []
self.alignments_bboxes = []
self.intersection_bboxes = []
self.bboxes = []
self.candidates = []
self.features = []
self.iou_thresh = 0.8
self.scanned = False
self.tree: Dict[int, Any] = {} # key represents page_num
Expand Down Expand Up @@ -272,8 +266,11 @@ def get_html_tree(self) -> str:
page = doc.createElement("div")
page.setAttribute("class", "ocr_page")
page.setAttribute("id", f"page_{page_num}")
width = int(self.elems[page_num].layout.width)
height = int(self.elems[page_num].layout.height)
page.setAttribute(
"title", f"bbox 0 0 {int(pwidth)} {int(pheight)}; ppageno {page_num-1}"
"title",
f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
)
body.appendChild(page)
# TODO: We need to detect columns and sort accordingly.
Expand Down
11 changes: 0 additions & 11 deletions pdftotree/utils/pdf/layout_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,6 @@
from pdftotree.utils.pdf.vector_utils import inside, intersect


def traverse_layout(root, callback):
    """
    Walk the pdf object tree in pre-order, invoking the callback on each node.

    The callback is called on ``root`` first; if ``root`` is iterable, every
    child is then traversed recursively in iteration order.

    :param root: the node to start from (any object; iterables are descended)
    :param callback: a callable taking a single node; its result is ignored
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally so the block is self-contained.
    from collections.abc import Iterable

    def _walk(node):
        callback(node)
        if isinstance(node, Iterable):
            for child in node:
                _walk(child)

    _walk(root)


def get_near_items(tree, tree_key):
"""
Check both possible neighbors for key
Expand Down
11 changes: 6 additions & 5 deletions pdftotree/utils/pdf/pdf_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pdfminer.utils import Plane

from pdftotree.utils.pdf.node import Node
from pdftotree.utils.pdf.pdf_utils import PDFElems
from pdftotree.utils.pdf.vector_utils import center, intersect, l1, xy_reading_order


Expand Down Expand Up @@ -723,13 +724,13 @@ def cluster_vertically_aligned_boxes(
return tables, table_features


def parse_tree_structure(elems, font_stat, page_num, ref_page_seen, tables):
def parse_tree_structure(elems: PDFElems, font_stat, page_num, ref_page_seen, tables):
boxes_segments = elems.segments
boxes_curves = elems.curves
boxes_figures = elems.figures
page_width = elems.layout.width
page_height = elems.layout.height
mentions = elems.mentions
mentions: List[LTTextLine] = elems.mentions

avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
width = get_page_width(mentions + boxes_segments + boxes_figures + boxes_curves)
Expand Down Expand Up @@ -762,7 +763,7 @@ def parse_tree_structure(elems, font_stat, page_num, ref_page_seen, tables):
tables_page = tables

# Eliminate tables from these boxes
boxes = []
boxes: List[LTTextLine] = []
for idx1, box in enumerate(mentions):
intersect = False
for idx2, table in enumerate(tables_page):
Expand Down Expand Up @@ -798,7 +799,7 @@ def parse_tree_structure(elems, font_stat, page_num, ref_page_seen, tables):


def extract_text_candidates(
boxes,
boxes: List[LTTextLine],
page_bbox,
avg_font_pts,
width,
Expand Down Expand Up @@ -1282,7 +1283,7 @@ def get_page_width(boxes):
return xmax - xmin


def get_char_width(boxes):
def get_char_width(boxes: List[LTTextLine]) -> float:
log = logging.getLogger(__name__)
box_len_sum = 0
num_char_sum = 0
Expand Down
42 changes: 36 additions & 6 deletions pdftotree/utils/pdf/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,21 @@
import re
import string
from collections import Counter
from typing import Any, List, NamedTuple, Tuple, Union
from typing import List, NamedTuple, Tuple, Union

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTComponent,
LTContainer,
LTCurve,
LTFigure,
LTLayoutContainer,
LTLine,
LTPage,
LTTextContainer,
LTTextLine,
)
from pdfminer.pdfdocument import PDFDocument
Expand All @@ -31,7 +35,6 @@
from pdfminer.utils import apply_matrix_pt

from pdftotree.utils.img_utils import normalize_bbox, normalize_pts
from pdftotree.utils.pdf.layout_utils import traverse_layout

# from pdftotree.utils.pdf.vector_utils import *

Expand All @@ -42,7 +45,7 @@ class PDFElems(NamedTuple):
segments: List[LTLine]
curves: List[LTCurve]
figures: List[LTFigure]
layout: Any # assigned to by PDFPageAggregator.get_result
layout: LTPage
chars: List[Union[LTChar, LTAnno]]


Expand Down Expand Up @@ -150,25 +153,30 @@ def analyze_pages(file_name, char_margin=1.0):
yield layout


def normalize_pdf(layout, scaler) -> Tuple[PDFElems, Counter]:
def normalize_pdf(layout: LTPage, scaler) -> Tuple[PDFElems, Counter]:
"""
Normalizes pdf object coordinates (bot left) to image
conventions (top left origin).
Returns the list of chars and average char size
"""
chars = []
mentions = []
mentions: List[LTTextContainer] = []
height = scaler * layout.height
font_size_counter = collections.Counter()
# Lines longer than this are included in segments
pts_thres = 2.0 * scaler
segments = []
curves = []
figures = []
mention: LTTextContainer = None
_font = None

def processor(m):
# Normalizes the coordinate system to be consistent with
# image library conventions (top left as origin)
if isinstance(m, LTContainer):
for child in m:
processor(child)
if isinstance(m, LTComponent):
m.set_bbox(normalize_bbox(m.bbox, height, scaler))

Expand All @@ -188,6 +196,28 @@ def processor(m):

# Collect stats on the chars
if isinstance(m, LTChar):
# Construct LTTextContainer from LTChar(s) outside of LTTextContainer
nonlocal _font
nonlocal mention
font = (m.fontname, m.size)
if font != _font:
if _font is not None:
mention.font_name, mention.font_size = _font_of_mention(mention)
layout_container = LTLayoutContainer((0, 0, 0, 0)) # dummy bbox
laparams = LAParams(
char_margin=1.0, word_margin=0.1, detect_vertical=True
)
for textline in layout_container.group_objects(
laparams, mention
):
textline.font_name, textline.font_size = _font_of_mention(
textline
)
mentions.append(textline)
mention = LTTextContainer()
_font = font
mention.add(m)

chars.append(m)
# fonts could be rotated 90/270 degrees
font_size = _font_size_of(m)
Expand All @@ -209,7 +239,7 @@ def processor(m):
if isinstance(m, LTAnno):
chars.append(m)

traverse_layout(layout, processor)
processor(layout)

# Resets mention y0 to the first y0 of alphanum character instead of
# considering exotic unicode symbols and sub/superscripts to reflect
Expand Down
Binary file added tests/input/CentralSemiconductorCorp_2N4013.pdf
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ def get_bbox(node: Tag) -> box:
assert all([figure.contains(word) for word in words])


def test_LTChar_under_LTFigure():
    """Check parsing of a PDF whose LTChar(s) are children of LTFigure."""
    html = pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf")
    soup = BeautifulSoup(html)

    # The first detected line should be split into exactly these words.
    first_line: Tag = soup.find(class_="ocrx_line")
    words = [w.text for w in first_line.find_all(class_="ocrx_word")]
    assert words == ["Small", "Signal", "Transistors"]

    # The table in the 1st page should contain 18 columns
    first_page = soup.find(class_="ocr_page")
    first_row = first_page.find("table").find("tr")
    assert len(first_row.find_all("td")) == 18


def test_ml_completion():
"""Simply test that ML-based parse runs without errors."""
output = pdftotree.parse(
Expand Down