|
1 | 1 | import logging
|
2 |
| -from builtins import filter, object, range, str |
3 | 2 |
|
4 | 3 | import numpy as np
|
5 |
| -from pdfminer.utils import Plane |
6 | 4 | from wand.color import Color
|
7 | 5 | from wand.drawing import Drawing
|
8 | 6 |
|
9 | 7 | from pdftotree.ml.features import get_alignment_features, get_lines_features
|
10 |
| -from pdftotree.utils.bbox_utils import compute_iou, get_rectangles |
| 8 | +from pdftotree.TreeExtract import TreeExtractor |
| 9 | +from pdftotree.utils.bbox_utils import compute_iou |
11 | 10 | from pdftotree.utils.display_utils import pdf_to_img
|
12 |
| -from pdftotree.utils.lines_utils import ( |
13 |
| - extend_horizontal_lines, |
14 |
| - extend_vertical_lines, |
15 |
| - get_vertical_and_horizontal, |
16 |
| - merge_horizontal_lines, |
17 |
| - merge_vertical_lines, |
18 |
| - reorder_lines, |
19 |
| -) |
20 |
| -from pdftotree.utils.pdf.pdf_parsers import parse_layout |
21 |
| -from pdftotree.utils.pdf.pdf_utils import analyze_pages, normalize_pdf |
22 | 11 |
|
23 | 12 | logger = logging.getLogger(__name__)
|
24 | 13 |
|
25 | 14 |
|
26 |
| -class TableExtractorML(object): |
| 15 | +class TableExtractorML(TreeExtractor): |
27 | 16 | """
|
28 | 17 | Object to extract tables regions from pdf files
|
29 | 18 | """
|
30 | 19 |
|
31 | 20 | def __init__(self, pdf_file):
|
32 |
| - self.pdf_file = pdf_file |
33 |
| - self.elems = {} |
34 |
| - self.font_stats = {} |
| 21 | + super().__init__(pdf_file) |
35 | 22 | self.lines_bboxes = []
|
36 | 23 | self.alignments_bboxes = []
|
37 | 24 | self.intersection_bboxes = []
|
38 | 25 | self.bboxes = []
|
39 | 26 | self.candidates = []
|
40 | 27 | self.features = []
|
41 |
| - self.iou_thresh = 0.8 |
42 |
| - self.scanned = False |
43 |
| - |
44 |
| - def identify_scanned_page(self, boxes, page_bbox, page_width, page_height): |
45 |
| - plane = Plane(page_bbox) |
46 |
| - plane.extend(boxes) |
47 |
| - cid2obj = [set([i]) for i in range(len(boxes))] # initialize clusters |
48 |
| - # default object map to cluster with its own index |
49 |
| - obj2cid = list(range(len(boxes))) |
50 |
| - prev_clusters = obj2cid |
51 |
| - while True: |
52 |
| - for i1, b1 in enumerate(boxes): |
53 |
| - for i2, b2 in enumerate(boxes): |
54 |
| - box1 = b1.bbox |
55 |
| - box2 = b2.bbox |
56 |
| - if ( |
57 |
| - box1[0] == box2[0] |
58 |
| - and box1[2] == box2[2] |
59 |
| - and round(box1[3]) == round(box2[1]) |
60 |
| - ): |
61 |
| - min_i = min(i1, i2) |
62 |
| - max_i = max(i1, i2) |
63 |
| - cid1 = obj2cid[min_i] |
64 |
| - cid2 = obj2cid[max_i] |
65 |
| - for obj_iter in cid2obj[cid2]: |
66 |
| - cid2obj[cid1].add(obj_iter) |
67 |
| - obj2cid[obj_iter] = cid1 |
68 |
| - cid2obj[cid2] = set() |
69 |
| - if prev_clusters == obj2cid: |
70 |
| - break |
71 |
| - prev_clusters = obj2cid |
72 |
| - clusters = [[boxes[i] for i in cluster] for cluster in filter(bool, cid2obj)] |
73 |
| - if ( |
74 |
| - len(clusters) == 1 |
75 |
| - and clusters[0][0].bbox[0] < -0.0 |
76 |
| - and clusters[0][0].bbox[1] <= 0 |
77 |
| - and abs(clusters[0][0].bbox[2] - page_width) <= 5 |
78 |
| - and abs(clusters[0][0].bbox[3] - page_height) <= 5 |
79 |
| - ): |
80 |
| - return True |
81 |
| - return False |
82 |
| - |
83 |
| - def parse(self): |
84 |
| - is_scanned = False |
85 |
| - lin_seg_present = False |
86 |
| - for page_num, layout in enumerate(analyze_pages(self.pdf_file)): |
87 |
| - page_num += 1 # indexes start at 1 |
88 |
| - elems, font_stat = normalize_pdf(layout, scaler=1) |
89 |
| - self.elems[page_num] = elems |
90 |
| - self.font_stats[page_num] = font_stat |
91 |
| - # code to detect if the page is scanned |
92 |
| - if len(elems.segments) > 0: |
93 |
| - lin_seg_present = True |
94 |
| - for fig in elems.figures: |
95 |
| - if ( |
96 |
| - fig.bbox[0] <= 0.0 |
97 |
| - and fig.bbox[1] <= 0.0 |
98 |
| - and round(fig.bbox[2]) == round(elems.layout.width) |
99 |
| - and round(fig.bbox[3]) == round(elems.layout.height) |
100 |
| - ): |
101 |
| - logger.debug( |
102 |
| - "{} is scanned because of full-page figure.".format( |
103 |
| - self.pdf_file |
104 |
| - ) |
105 |
| - ) |
106 |
| - is_scanned = True |
107 |
| - page_scanned = self.identify_scanned_page( |
108 |
| - elems.figures, |
109 |
| - elems.layout.bbox, |
110 |
| - elems.layout.width, |
111 |
| - elems.layout.height, |
112 |
| - ) |
113 |
| - # doc is scanned if any page is scanned |
114 |
| - if page_scanned: |
115 |
| - logger.debug( |
116 |
| - "{} is scanned one of its pages is scanned.".format(self.pdf_file) |
117 |
| - ) |
118 |
| - is_scanned = True |
119 |
| - if is_scanned or not lin_seg_present: |
120 |
| - self.scanned = True |
121 |
| - |
122 |
| - def get_scanned(self): |
123 |
| - if len(self.elems) == 0: |
124 |
| - self.parse() |
125 |
| - return self.scanned |
126 | 28 |
|
127 | 29 | def get_candidates(self):
|
128 | 30 | if len(self.elems) == 0:
|
@@ -181,43 +83,6 @@ def get_candidates_and_features_page_num(self, page_num):
|
181 | 83 | )
|
182 | 84 | return boxes, features
|
183 | 85 |
|
184 |
| - def get_candidates_lines(self, page_num, elems): |
185 |
| - page_width, page_height = int(elems.layout.width), int(elems.layout.height) |
186 |
| - lines = reorder_lines(elems.segments) |
187 |
| - vertical_lines, horizontal_lines = get_vertical_and_horizontal(lines) |
188 |
| - extended_vertical_lines = extend_vertical_lines(horizontal_lines) |
189 |
| - extended_horizontal_lines = extend_horizontal_lines(vertical_lines) |
190 |
| - vertical_lines = merge_vertical_lines( |
191 |
| - sorted(extended_vertical_lines + vertical_lines) |
192 |
| - ) |
193 |
| - horizontal_lines = merge_horizontal_lines( |
194 |
| - sorted(extended_horizontal_lines + horizontal_lines) |
195 |
| - ) |
196 |
| - rectangles = get_rectangles(sorted(vertical_lines), sorted(horizontal_lines)) |
197 |
| - return [(page_num, page_width, page_height) + bbox for bbox in rectangles] |
198 |
| - |
199 |
| - def get_candidates_alignments(self, page_num, elems): |
200 |
| - page_width, page_height = int(elems.layout.width), int(elems.layout.height) |
201 |
| - font_stat = self.font_stats[page_num] |
202 |
| - try: |
203 |
| - nodes, features = parse_layout(elems, font_stat) |
204 |
| - except Exception: |
205 |
| - nodes, features = [], [] |
206 |
| - return ( |
207 |
| - [ |
208 |
| - (page_num, page_width, page_height) |
209 |
| - + (node.y0, node.x0, node.y1, node.x1) |
210 |
| - for node in nodes |
211 |
| - ], |
212 |
| - features, |
213 |
| - ) |
214 |
| - |
215 |
| - def get_elems(self): |
216 |
| - return self.elems |
217 |
| - |
218 |
| - def get_font_stats(self): |
219 |
| - return self.font_stats |
220 |
| - |
221 | 86 | def get_labels(self, gt_tables):
|
222 | 87 | """
|
223 | 88 | :param gt_tables: dict, keys are page number and values are list of
|
|
0 commit comments