Skip to content

Commit 936b1c9

Browse files
author
Hiromu Hota
committed
Retrieve page dimensions from layout: LTPage (fix #72)
1 parent 22f9996 commit 936b1c9

File tree

4 files changed

+14
-4
lines changed

4 files changed

+14
-4
lines changed

pdftotree/TreeExtract.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,11 @@ def get_html_tree(self) -> str:
272272
page = doc.createElement("div")
273273
page.setAttribute("class", "ocr_page")
274274
page.setAttribute("id", f"page_{page_num}")
275+
width = int(self.elems[page_num].layout.width)
276+
height = int(self.elems[page_num].layout.height)
275277
page.setAttribute(
276-
"title", f"bbox 0 0 {int(pwidth)} {int(pheight)}; ppageno {page_num-1}"
278+
"title",
279+
f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
277280
)
278281
body.appendChild(page)
279282
# TODO: We need to detect columns and sort accordingly.

pdftotree/utils/pdf/pdf_utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import re
1212
import string
1313
from collections import Counter
14-
from typing import Any, List, NamedTuple, Tuple, Union
14+
from typing import List, NamedTuple, Tuple, Union
1515

1616
from pdfminer.converter import PDFPageAggregator
1717
from pdfminer.layout import (
@@ -22,6 +22,7 @@
2222
LTCurve,
2323
LTFigure,
2424
LTLine,
25+
LTPage,
2526
LTTextLine,
2627
)
2728
from pdfminer.pdfdocument import PDFDocument
@@ -42,7 +43,7 @@ class PDFElems(NamedTuple):
4243
segments: List[LTLine]
4344
curves: List[LTCurve]
4445
figures: List[LTFigure]
45-
layout: Any # assigned to by PDFPageAggregator.get_result
46+
layout: LTPage
4647
chars: List[Union[LTChar, LTAnno]]
4748

4849

@@ -150,7 +151,7 @@ def analyze_pages(file_name, char_margin=1.0):
150151
yield layout
151152

152153

153-
def normalize_pdf(layout, scaler) -> Tuple[PDFElems, Counter]:
154+
def normalize_pdf(layout: LTPage, scaler) -> Tuple[PDFElems, Counter]:
154155
"""
155156
Normalizes pdf object coordinates (bot left) to image
156157
conventions (top left origin).
325 KB
Binary file not shown.

tests/test_basic.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ def get_bbox(node: Tag) -> box:
7474
assert all([figure.contains(word) for word in words])
7575

7676

77+
def test_issue_72():
78+
"""Regression test for #72: parsing this PDF must return a result."""
79+
output = pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf")
80+
assert output is not None
81+
82+
7783
def test_ml_completion():
7884
"""Simply test that ML-based parse runs without errors."""
7985
output = pdftotree.parse(

0 commit comments

Comments
 (0)