Skip to content

Commit a565060

Browse files
🐛 Handle text with only punctuations & ! at EOL (#38)
🐛 Handle text with only punctuations & ! at EOL
2 parents 05edfdb + 3e20318 commit a565060

File tree

6 files changed

+18
-4
lines changed

6 files changed

+18
-4
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,8 @@
2121
# v0.1.4
2222

2323
- ✨ ✅ Handle intermittent punctuation - \#34
24+
25+
# v0.1.5
26+
27+
- 🐛 Handle text with only punctuation - \#36
28+
- 🐛 Handle exclamation marks at EOL - \#37

pysbd/about.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
33

44
__title__ = "pysbd"
5-
__version__ = "0.1.4"
5+
__version__ = "0.1.5"
66
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
77
__uri__ = "http://nipunsadvilkar.github.io/"
88
__author__ = "Nipun Sadvilkar"

pysbd/lang/common/numbers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ class Common(object):
77

88
# added special case: r"[。..!!?].*" to handle intermittent dots, exclamation, etc.
99
# TODO: above special cases group can be updated as per developer needs
10-
SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!?].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"
10+
SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"
11+
1112
# # Rubular: http://rubular.com/r/NqCqv372Ix
1213
QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'
1314

pysbd/lang/standard.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class DoublePunctuationRules(object):
3939
SecondRule = Rule(r'!\?', '☈')
4040
ThirdRule = Rule(r'\?\?', '☇')
4141
ForthRule = Rule(r'!!', '☄')
42+
DoublePunctuation = r'\?!|!\?|\?\?|!!'
4243
All = [FirstRule, SecondRule, ThirdRule, ForthRule]
4344

4445

pysbd/processor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,10 @@ def process_text(self, txt):
143143
txt += 'ȸ'
144144
txt = ExclamationWords.apply_rules(txt)
145145
txt = BetweenPunctuation(txt).replace()
146-
txt = Text(txt).apply(*DoublePunctuationRules.All,
147-
Standard.QuestionMarkInQuotationRule,
146+
# handle text having only doublepunctuations
147+
if not re.match(DoublePunctuationRules.DoublePunctuation, txt):
148+
txt = Text(txt).apply(*DoublePunctuationRules.All)
149+
txt = Text(txt).apply(Standard.QuestionMarkInQuotationRule,
148150
*ExclamationPointRules.All)
149151
txt = ListItemReplacer(txt).replace_parens()
150152
txt = self.sentence_boundary_punctuation(txt)
@@ -176,6 +178,8 @@ def sentence_boundary_punctuation(self, txt):
176178
if hasattr(self.language_module, 'ReplaceNonSentenceBoundaryCommaRule'):
177179
txt = Text(txt).apply(
178180
self.language_module.ReplaceNonSentenceBoundaryCommaRule)
181+
# retain exclamation mark if it is an ending character of a given text
182+
txt = re.sub(r'&ᓴ&$', '!', txt)
179183
txt = re.findall(Common.SENTENCE_BOUNDARY_REGEX, txt)
180184
return txt
181185

tests/regression/test_issues.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
('#34', '..', ['..']),
2121
('#34', '. . .', ['. . .']),
2222
('#34', '! ! !', ['! ! !']),
23+
('#36', '??', ['??']),
24+
('#37', "As an example of a different special-purpose mechanism, we have introduced a methodology for letting donors make their donations to charities conditional on donations by other donors (who, in turn, can make their donations conditional) [70]. We have used this mechanism to collect money for Indian Ocean Tsunami and Hurricane Katrina victims. We have also introduced a more general framework for negotiation when one agent's actions have a direct effect (externality) on the other agents' utilities [69]. Both the charities and externalities methodologies require the solution of NP-hard optimization problems in general, but there are some natural tractable cases as well as effective MIP formulations. Recently, Ghosh and Mahdian [86] at Yahoo! Research extended our charities work, and based on this a web-based system for charitable donations was built at Yahoo!",
25+
['As an example of a different special-purpose mechanism, we have introduced a methodology for letting donors make their donations to charities conditional on donations by other donors (who, in turn, can make their donations conditional) [70].', 'We have used this mechanism to collect money for Indian Ocean Tsunami and Hurricane Katrina victims.', "We have also introduced a more general framework for negotiation when one agent's actions have a direct effect (externality) on the other agents' utilities [69].", 'Both the charities and externalities methodologies require the solution of NP-hard optimization problems in general, but there are some natural tractable cases as well as effective MIP formulations.', 'Recently, Ghosh and Mahdian [86] at Yahoo! Research extended our charities work, and based on this a web-based system for charitable donations was built at Yahoo!'])
2326
]
2427

2528
@pytest.mark.parametrize('issue_no,text,expected_sents', TEST_ISSUE_DATA)

0 commit comments

Comments
 (0)