Skip to content

Commit 0f48212

Browse files
📝 🥚 Add example script - pysbd as a spaCy component
1 parent 17f274c commit 0f48212

File tree

1 file changed

+29
-0
lines changed

1 file changed

+29
-0
lines changed

examples/pysbd_as_spacy_component.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""
2+
Example of pySBD as a sentencizer component for spaCy
3+
4+
Installation:
5+
pip install spacy pysbd
6+
"""
7+
import pysbd
8+
import spacy
9+
10+
def pysbd_sentence_boundaries(doc):
    """Set sentence starts on ``doc`` from pySBD's segmentation of its text.

    Written to be used as a spaCy pipeline component (this script registers
    it with ``nlp.add_pipe``).

    Args:
        doc: The spaCy ``Doc`` to annotate; it is modified in place.

    Returns:
        The same ``doc``, with ``is_sent_start`` assigned on every token:
        ``True`` when the token's character offset equals the start offset
        of a pySBD sentence span, ``False`` otherwise.
    """
    # char_span=True makes segment() yield spans exposing .start/.end
    # character offsets; presumably (with clean=False) these index into the
    # unmodified doc.text -- the offset lookup below relies on that.
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)

    # Only the START offsets are compared against token.idx. Mapping whole
    # spans through doc.char_span() (which returns None on any misalignment)
    # would silently discard a sentence start whenever just the span's END
    # misses a token boundary, e.g. if the span includes trailing whitespace.
    # A set keeps the per-token membership test constant-time.
    sentence_starts = {sent_span.start for sent_span in seg.segment(doc.text)}

    for token in doc:
        token.is_sent_start = token.idx in sentence_starts
    return doc
18+
19+
if __name__ == "__main__":
    # Demo input: contains abbreviation periods ("E.", "p.") as well as
    # sentence-final ones.
    sample = "My name is Jonas E. Smith. Please turn to p. 55."

    # Blank English pipeline with the pySBD component as its only pipe.
    # NOTE(review): passing a bare callable to add_pipe looks like the
    # spaCy v2 API -- confirm against the spaCy version in use.
    pipeline = spacy.blank("en")
    pipeline.add_pipe(pysbd_sentence_boundaries)

    parsed = pipeline(sample)

    # Print a header row, then one numbered row per detected sentence.
    column_sep = "\t|\t"
    print("sent_id", "sentence", sep=column_sep)
    for number, sentence in enumerate(parsed.sents, 1):
        print(number, sentence.text, sep=column_sep)

0 commit comments

Comments
 (0)