Skip to content

Commit 862391a

Browse files
authored
Let TableExtractorML inherit TreeExtractor to use its updated parse() (#106)
1 parent cb7b074 commit 862391a

File tree

2 files changed

+6
-139
lines changed

2 files changed

+6
-139
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2323
([#102](https://github.com/HazyResearch/pdftotree/pull/102), [@HiromuHota][HiromuHota])
2424
- Use sys.maxsize to avoid "OverflowError: cannot convert float infinity to integer".
2525
([#104](https://github.com/HazyResearch/pdftotree/issues/104), [@HiromuHota][HiromuHota])
26+
- Let TableExtractorML inherit from TreeExtractor so that it uses TreeExtractor's updated parse().
27+
([#105](https://github.com/HazyResearch/pdftotree/issues/105), [@HiromuHota][HiromuHota])
2628

2729
## 0.5.0 - 2020-10-13
2830

pdftotree/ml/TableExtractML.py

Lines changed: 4 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1,128 +1,30 @@
11
import logging
2-
from builtins import filter, object, range, str
32

43
import numpy as np
5-
from pdfminer.utils import Plane
64
from wand.color import Color
75
from wand.drawing import Drawing
86

97
from pdftotree.ml.features import get_alignment_features, get_lines_features
10-
from pdftotree.utils.bbox_utils import compute_iou, get_rectangles
8+
from pdftotree.TreeExtract import TreeExtractor
9+
from pdftotree.utils.bbox_utils import compute_iou
1110
from pdftotree.utils.display_utils import pdf_to_img
12-
from pdftotree.utils.lines_utils import (
13-
extend_horizontal_lines,
14-
extend_vertical_lines,
15-
get_vertical_and_horizontal,
16-
merge_horizontal_lines,
17-
merge_vertical_lines,
18-
reorder_lines,
19-
)
20-
from pdftotree.utils.pdf.pdf_parsers import parse_layout
21-
from pdftotree.utils.pdf.pdf_utils import analyze_pages, normalize_pdf
2211

2312
logger = logging.getLogger(__name__)
2413

2514

26-
class TableExtractorML(object):
15+
class TableExtractorML(TreeExtractor):
2716
"""
2817
Object to extract table regions from PDF files
2918
"""
3019

3120
def __init__(self, pdf_file):
32-
self.pdf_file = pdf_file
33-
self.elems = {}
34-
self.font_stats = {}
21+
super().__init__(pdf_file)
3522
self.lines_bboxes = []
3623
self.alignments_bboxes = []
3724
self.intersection_bboxes = []
3825
self.bboxes = []
3926
self.candidates = []
4027
self.features = []
41-
self.iou_thresh = 0.8
42-
self.scanned = False
43-
44-
def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
45-
plane = Plane(page_bbox)
46-
plane.extend(boxes)
47-
cid2obj = [set([i]) for i in range(len(boxes))] # initialize clusters
48-
# default object map to cluster with its own index
49-
obj2cid = list(range(len(boxes)))
50-
prev_clusters = obj2cid
51-
while True:
52-
for i1, b1 in enumerate(boxes):
53-
for i2, b2 in enumerate(boxes):
54-
box1 = b1.bbox
55-
box2 = b2.bbox
56-
if (
57-
box1[0] == box2[0]
58-
and box1[2] == box2[2]
59-
and round(box1[3]) == round(box2[1])
60-
):
61-
min_i = min(i1, i2)
62-
max_i = max(i1, i2)
63-
cid1 = obj2cid[min_i]
64-
cid2 = obj2cid[max_i]
65-
for obj_iter in cid2obj[cid2]:
66-
cid2obj[cid1].add(obj_iter)
67-
obj2cid[obj_iter] = cid1
68-
cid2obj[cid2] = set()
69-
if prev_clusters == obj2cid:
70-
break
71-
prev_clusters = obj2cid
72-
clusters = [[boxes[i] for i in cluster] for cluster in filter(bool, cid2obj)]
73-
if (
74-
len(clusters) == 1
75-
and clusters[0][0].bbox[0] < -0.0
76-
and clusters[0][0].bbox[1] <= 0
77-
and abs(clusters[0][0].bbox[2] - page_width) <= 5
78-
and abs(clusters[0][0].bbox[3] - page_height) <= 5
79-
):
80-
return True
81-
return False
82-
83-
def parse(self):
84-
is_scanned = False
85-
lin_seg_present = False
86-
for page_num, layout in enumerate(analyze_pages(self.pdf_file)):
87-
page_num += 1 # indexes start at 1
88-
elems, font_stat = normalize_pdf(layout, scaler=1)
89-
self.elems[page_num] = elems
90-
self.font_stats[page_num] = font_stat
91-
# code to detect if the page is scanned
92-
if len(elems.segments) > 0:
93-
lin_seg_present = True
94-
for fig in elems.figures:
95-
if (
96-
fig.bbox[0] <= 0.0
97-
and fig.bbox[1] <= 0.0
98-
and round(fig.bbox[2]) == round(elems.layout.width)
99-
and round(fig.bbox[3]) == round(elems.layout.height)
100-
):
101-
logger.debug(
102-
"{} is scanned because of full-page figure.".format(
103-
self.pdf_file
104-
)
105-
)
106-
is_scanned = True
107-
page_scanned = self.identify_scanned_page(
108-
elems.figures,
109-
elems.layout.bbox,
110-
elems.layout.width,
111-
elems.layout.height,
112-
)
113-
# doc is scanned if any page is scanned
114-
if page_scanned:
115-
logger.debug(
116-
"{} is scanned one of its pages is scanned.".format(self.pdf_file)
117-
)
118-
is_scanned = True
119-
if is_scanned or not lin_seg_present:
120-
self.scanned = True
121-
122-
def get_scanned(self):
123-
if len(self.elems) == 0:
124-
self.parse()
125-
return self.scanned
12628

12729
def get_candidates(self):
12830
if len(self.elems) == 0:
@@ -181,43 +83,6 @@ def get_candidates_and_features_page_num(self, page_num):
18183
)
18284
return boxes, features
18385

184-
def get_candidates_lines(self, page_num, elems):
185-
page_width, page_height = int(elems.layout.width), int(elems.layout.height)
186-
lines = reorder_lines(elems.segments)
187-
vertical_lines, horizontal_lines = get_vertical_and_horizontal(lines)
188-
extended_vertical_lines = extend_vertical_lines(horizontal_lines)
189-
extended_horizontal_lines = extend_horizontal_lines(vertical_lines)
190-
vertical_lines = merge_vertical_lines(
191-
sorted(extended_vertical_lines + vertical_lines)
192-
)
193-
horizontal_lines = merge_horizontal_lines(
194-
sorted(extended_horizontal_lines + horizontal_lines)
195-
)
196-
rectangles = get_rectangles(sorted(vertical_lines), sorted(horizontal_lines))
197-
return [(page_num, page_width, page_height) + bbox for bbox in rectangles]
198-
199-
def get_candidates_alignments(self, page_num, elems):
200-
page_width, page_height = int(elems.layout.width), int(elems.layout.height)
201-
font_stat = self.font_stats[page_num]
202-
try:
203-
nodes, features = parse_layout(elems, font_stat)
204-
except Exception:
205-
nodes, features = [], []
206-
return (
207-
[
208-
(page_num, page_width, page_height)
209-
+ (node.y0, node.x0, node.y1, node.x1)
210-
for node in nodes
211-
],
212-
features,
213-
)
214-
215-
def get_elems(self):
216-
return self.elems
217-
218-
def get_font_stats(self):
219-
return self.font_stats
220-
22186
def get_labels(self, gt_tables):
22287
"""
22388
:param gt_tables: dict, keys are page number and values are list of

0 commit comments

Comments
 (0)