Skip to content

Commit a2bb451

Browse files
✨Add char_span functionality (#40)
✨Add `char_span` functionality
2 parents a565060 + 8d1c94d commit a2bb451

19 files changed

+206
-58
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,9 @@
2626

2727
- 🐛 Handle text with only punctuation - \#36
2828
- 🐛 Handle exclamation marks at EOL - \#37
29+
30+
# v0.2.0
31+
32+
- ✨Add optional `char_span` parameter to get each sentence and its (start, end) char offsets from the original text
33+
- ✨pySBD as a spaCy component example
34+
- 🐛 Fix double question mark swallow bug - \#39

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ print(seg.segment(text))
2424
# ['My name is Jonas E. Smith.', 'Please turn to p. 55.']
2525
```
2626

27+
- Use `pysbd` as a [spaCy](https://spacy.io/usage/processing-pipelines) pipeline component (recommended).<br>Please refer to the example [pysbd\_as\_spacy\_component.py](https://github.com/nipunsadvilkar/pySBD/blob/master/examples/pysbd_as_spacy_component.py)
28+
2729
## Contributing
2830

2931
If you find a text that is incorrectly segmented using pySBD, please submit an issue.

examples/pysbd_as_spacy_component.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""
2+
Example of pySBD as a sentencizer component for spaCy
3+
4+
Installation:
5+
pip install spacy
6+
"""
7+
import pysbd
8+
import spacy
9+
10+
def pysbd_sentence_boundaries(doc):
11+
seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
12+
sents_char_spans = seg.segment(doc.text)
13+
char_spans = [doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans]
14+
start_token_ids = [span[0].idx for span in char_spans if span is not None]
15+
for token in doc:
16+
token.is_sent_start = True if token.idx in start_token_ids else False
17+
return doc
18+
19+
if __name__ == "__main__":
20+
text = "My name is Jonas E. Smith. Please turn to p. 55."
21+
nlp = spacy.blank('en')
22+
23+
# add as a spacy pipeline component
24+
nlp.add_pipe(pysbd_sentence_boundaries)
25+
26+
doc = nlp(text)
27+
print('sent_id', 'sentence', sep='\t|\t')
28+
for sent_id, sent in enumerate(doc.sents, start=1):
29+
print(sent_id, sent.text, sep='\t|\t')

pysbd/abbreviation_replacer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import re
3-
from pysbd.rules import Text
3+
from pysbd.utils import Text
44
# TODO: SENTENCE_STARTERS should be lang specific
55
from pysbd.lang.standard import Abbreviation, SENTENCE_STARTERS
66
from pysbd.lang.common.numbers import (Common, SingleLetterAbbreviationRules,
@@ -93,5 +93,5 @@ def scan_for_replacements(self, txt, am, ind, char_array):
9393

9494

9595
if __name__ == "__main__":
96-
s = "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia."
96+
s = "fig. ??"
9797
print(AbbreviationReplacer(s).replace())

pysbd/about.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
33

44
__title__ = "pysbd"
5-
__version__ = "0.1.5"
5+
__version__ = "0.2.0"
66
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
77
__uri__ = "http://nipunsadvilkar.github.io/"
88
__author__ = "Nipun Sadvilkar"

pysbd/clean/rules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# -*- coding: utf-8 -*-
2-
from pysbd.rules import Rule
2+
from pysbd.utils import Rule
33

44

55
class CleanRules(object):

pysbd/cleaner.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import re
3-
from pysbd.rules import Text
3+
from pysbd.utils import Text
44
from pysbd.clean.rules import PDF, HTML, CleanRules as cr
55
from pysbd.lang.standard import Abbreviation
66

@@ -10,7 +10,6 @@ class Cleaner(object):
1010
def __init__(self, text, language='common', doc_type=None):
1111
self.text = text
1212
self.language = language
13-
# self.language_module = Language.get_language_code(language)
1413
self.doc_type = doc_type
1514

1615
def clean(self):
@@ -116,7 +115,6 @@ def clean_consecutive_characters(self):
116115

117116

118117
if __name__ == "__main__":
119-
# text = "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot."
120-
text = "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early"
118+
text = "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot."
121119
c = Cleaner(text)
122120
print(c.clean())

pysbd/exclamation_words.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,5 @@ def apply_rules(cls, text):
1818

1919

2020
if __name__ == "__main__":
21-
# text = "Hello .World work for Yahoo! company"
2221
text = "\"Dinah'll miss me very much to-night, I should think!\"ȸ"
2322
print(ExclamationWords.apply_rules(text))

pysbd/lang/common/ellipsis.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
# -*- coding: utf-8 -*-
2-
from pysbd.rules import Rule
2+
from pysbd.utils import Rule
33

44

55
class EllipsisRules(object):
66

7+
# the rules below differ from the original rules of pragmatic segmenter
8+
# modification: spaces replaced with same number of symbols
79
# Rubular: http://rubular.com/r/i60hCK81fz
8-
ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏.')
10+
ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏.')
911

1012
# Rubular: http://rubular.com/r/Hdqpd90owl
11-
FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪ')
13+
FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')
1214

1315
# Rubular: http://rubular.com/r/YBG1dIHTRu
14-
ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟')
16+
ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')
1517

1618
# Rubular: http://rubular.com/r/2VvZ8wRbd8
17-
FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝')
19+
FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')
1820

19-
OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪ')
21+
OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')
2022

2123
All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
2224
ThreeConsecutiveRule, OtherThreePeriodRule]

pysbd/lang/common/numbers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import re
3-
from pysbd.rules import Rule
3+
from pysbd.utils import Rule
44

55

66
class Common(object):

0 commit comments

Comments
 (0)