Skip to content

Commit 91676b8

Browse files
Merge pull request #77 from nipunsadvilkar/npn-pdf-mode-exceptions
2 parents 9069997 + fc61aef commit 91676b8

File tree

6 files changed

+43
-9
lines changed

6 files changed

+43
-9
lines changed

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
name: Bug report
33
about: Create a report & testcase to help us improve
4-
title: "\U0001F41B"
4+
title: <Appropriate title>
55
labels: ''
66
assignees: ''
77

@@ -26,3 +26,7 @@ Example:
2626

2727
**Additional context**
2828
Add any other context about the problem here.
29+
30+
<details>
31+
<summary>Paste Error Traceback here, if any</summary>
32+
</details>

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# v0.3.2
2+
- 🐛 ✅ Enforce clean=True when doc_type="pdf" - \#75
3+
14
# v0.3.1
25
- 🚑 ✅ Handle Newline character & update tests
36

pysbd/about.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
33

44
__title__ = "pysbd"
5-
__version__ = "0.3.1"
5+
__version__ = "0.3.2"
66
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
77
__uri__ = "http://nipunsadvilkar.github.io/"
88
__author__ = "Nipun Sadvilkar"

pysbd/processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import re
3-
from pysbd.utils import Text, TextSpan
3+
from pysbd.utils import Text
44
from pysbd.lists_item_replacer import ListItemReplacer
55
from pysbd.exclamation_words import ExclamationWords
66
from pysbd.between_punctuation import BetweenPunctuation

pysbd/segmenter.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
3131
self.clean = clean
3232
self.doc_type = doc_type
3333
self.char_span = char_span
34+
if self.clean and self.char_span:
35+
raise ValueError("char_span must be False if clean is True. "
36+
"Since `clean=True` will modify original text.")
37+
# when doctype is pdf then force user to clean the text
38+
# char_span functionality won't be provided with pdf doc_type either
39+
elif self.doc_type == 'pdf' and not self.clean:
40+
raise ValueError("`doc_type='pdf'` should have `clean=True` & "
41+
"`char_span` should be False since original"
42+
"text will be modified.")
3443

3544
def cleaner(self, text):
3645
if hasattr(self.language_module, "Cleaner"):
@@ -71,11 +80,10 @@ def segment(self, text):
7180
self.original_text = text
7281
if not text:
7382
return []
74-
if self.clean and self.char_span:
75-
raise ValueError("char_span must be False if clean is True. "
76-
"Since `clean=True` will modify original text.")
77-
elif self.clean:
83+
84+
if self.clean or self.doc_type == 'pdf':
7885
text = self.cleaner(text).clean()
86+
7987
postprocessed_sents = self.processor(text).process()
8088
sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
8189
if self.char_span:

tests/test_segmenter.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,27 @@ def test_exception_with_both_clean_and_span_true():
6868
"""
6969
with pytest.raises(ValueError) as e:
7070
seg = pysbd.Segmenter(language="en", clean=True, char_span=True)
71-
text = "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>"
72-
seg.segment(text)
71+
assert str(e.value) == "char_span must be False if clean is True. "\
72+
"Since `clean=True` will modify original text."
73+
74+
def test_exception_with_doc_type_pdf_and_clean_false():
75+
"""
76+
Test to force clean=True when doc_type="pdf"
77+
"""
78+
with pytest.raises(ValueError) as e:
79+
seg = pysbd.Segmenter(language="en", clean=False, doc_type='pdf')
80+
assert str(e.value) == ("`doc_type='pdf'` should have `clean=True` & "
81+
"`char_span` should be False since original"
82+
"text will be modified.")
83+
84+
def test_exception_with_doc_type_pdf_and_both_clean_char_span_true():
85+
"""
86+
Test to raise ValueError exception when doc_type="pdf" and
87+
both clean=True and char_span=True
88+
"""
89+
with pytest.raises(ValueError) as e:
90+
seg = pysbd.Segmenter(language="en", clean=True,
91+
doc_type='pdf', char_span=True)
7392
assert str(e.value) == "char_span must be False if clean is True. "\
7493
"Since `clean=True` will modify original text."
7594

0 commit comments

Comments
 (0)