Skip to content

Commit 0686a18

Browse files
authored
Honor pdftotree's loglevel when calling tabula (#103)
1 parent 3381cc2 commit 0686a18

File tree

5 files changed

+29
-4
lines changed

5 files changed

+29
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
- Embed base64-encoded images inline. Support starting with JPEG and BMP.
1111
([#99](https://github.com/HazyResearch/pdftotree/pull/99), [@HiromuHota][HiromuHota])
1212

13+
### Changed
14+
- Suppress tabula-java's log messages unless pdftotree's logger is set to logging.DEBUG.
15+
([#103](https://github.com/HazyResearch/pdftotree/pull/103), [@HiromuHota][HiromuHota])
16+
1317
### Fixed
1418
- List a missing "ocrx_line" in the ocr-capabilities metadata field.
1519
([#94](https://github.com/HazyResearch/pdftotree/issues/94), [@HiromuHota][HiromuHota])

README.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,11 @@ pdftotree as a Python package
4343
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4444

4545
.. code:: python
46+
# Uncomment the following lines if tabula should not be silent.
47+
# import logging
48+
# logging.getLogger("pdftotree").setLevel(logging.DEBUG)
4649
4750
import pdftotree
48-
4951
pdftotree.parse(pdf_file, html_path=None, model_type=None, model_path=None, visualize=False):
5052
5153
pdftotree
@@ -73,7 +75,7 @@ This takes a PDF file as input and produces an hOCR file as output::
7375
printed to stdout.
7476
-V, --visualize Whether to output visualization images
7577
-v, --verbose Output INFO level logging.
76-
-vv, --veryverbose Output DEBUG level logging.
78+
-vv, --veryverbose Output DEBUG level logging. Use this if tabula should not be silent.
7779

7880
extract\_tables
7981
~~~~~~~~~~~~~~~

bin/pdftotree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ if __name__ == "__main__":
5858
"--veryverbose",
5959
dest="debug",
6060
action="store_true",
61-
help="Output DEBUG level logging.",
61+
help="Output DEBUG level logging. Use this if tabula should not be silent.",
6262
)
6363
parser.set_defaults(visualize=False)
6464
args = parser.parse_args()

pdftotree/TreeExtract.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,8 +446,13 @@ def get_html_table(self, table: List[float], page_num) -> Optional[Element]:
446446
:return: DOM element for a table
447447
"""
448448
logger.debug(f"Calling tabula at page: {page_num} and area: {table}.")
449+
loglevel = logging.getLogger("pdftotree").getEffectiveLevel()
449450
table_json = tabula.read_pdf(
450-
self.pdf_file, pages=page_num, area=table, output_format="json"
451+
self.pdf_file,
452+
pages=page_num,
453+
area=table,
454+
output_format="json",
455+
silent=False if loglevel <= logging.DEBUG else True,
451456
)
452457
logger.debug(f"Tabula recognized {len(table_json)} table(s).")
453458
if len(table_json) == 0:

tests/test_basic.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import os
23
from subprocess import PIPE, Popen
34
from typing import Optional
@@ -73,6 +74,19 @@ def test_no_out_of_order(caplog):
7374
assert "Out of order" not in caplog.text
7475

7576

77+
def test_tabula_warning_suppressed(caplog):
78+
"""Test if tabula warnings are suppressed."""
79+
# Warnings suppressed by default
80+
pdftotree.parse("tests/input/112823.pdf")
81+
assert "org.apache.pdfbox" not in caplog.text
82+
83+
# Not to suppress warnings
84+
log = logging.getLogger("pdftotree")
85+
log.setLevel(logging.DEBUG)
86+
pdftotree.parse("tests/input/112823.pdf")
87+
assert "org.apache.pdfbox" in caplog.text
88+
89+
7690
def test_visualize_output(tmp_path):
7791
"""Test if an output can be visualzied."""
7892
html_path = os.path.join(tmp_path, "md.html")

0 commit comments

Comments
 (0)