File tree Expand file tree Collapse file tree 1 file changed +29
-0
lines changed Expand file tree Collapse file tree 1 file changed +29
-0
lines changed Original file line number Diff line number Diff line change
1
+ """
2
+ Example of pySBD as a sentencizer component for spaCy
3
+
4
+ Installation:
5
+ pip install spacy pysbd
6
+ """
7
+ import pysbd
8
+ import spacy
9
+
10
def pysbd_sentence_boundaries(doc):
    """Mark sentence starts on a spaCy ``Doc`` using pySBD segmentation.

    The document text is segmented with an English pySBD ``Segmenter``
    (``clean=False``, ``char_span=True``). Each returned character span is
    mapped back onto the document with ``doc.char_span``; the first token of
    every span that could be mapped is flagged as a sentence start, and every
    other token is explicitly flagged as not starting a sentence.

    Args:
        doc: The document to annotate. It is modified in place.

    Returns:
        The same ``doc`` object, so the function can be registered as a
        pipeline component.
    """
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    sents_char_spans = seg.segment(doc.text)
    char_spans = (
        doc.char_span(sent_span.start, sent_span.end)
        for sent_span in sents_char_spans
    )
    # Spans for which ``doc.char_span`` returned None are skipped
    # (presumably offsets that do not line up with token boundaries -- confirm
    # against the spaCy version in use).
    # A set keeps the per-token membership test below O(1) instead of scanning
    # a list once per token.
    start_token_ids = {span[0].idx for span in char_spans if span is not None}
    for token in doc:
        token.is_sent_start = token.idx in start_token_ids
    return doc
18
+
19
if __name__ == "__main__":
    # Demo: build a blank English pipeline, plug in the pySBD-based boundary
    # component, and print each detected sentence with a 1-based id.
    pipeline = spacy.blank('en')

    # add as a spacy pipeline component
    pipeline.add_pipe(pysbd_sentence_boundaries)

    sample = "My name is Jonas E. Smith. Please turn to p. 55."
    parsed = pipeline(sample)

    column_sep = '\t |\t '
    print('sent_id', 'sentence', sep=column_sep)
    sent_id = 0
    for sentence in parsed.sents:
        sent_id += 1
        print(sent_id, sentence.text, sep=column_sep)
You can’t perform that action at this time.
0 commit comments