✨Add char_span functionality (#40)

nipunsadvilkar · web-flow · commit a2bb4510e65b · 2019-10-25T16:36:48.000+05:30
✨Add `char_span` functionality
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,3 +26,9 @@
 
 -   🐛 Handle text with only punctuations - \#36
 -   🐛 Handle exclamation marks at EOL- \#37
+
+# v0.2.0
+
+-   ✨Add `char_span` parameter (optional) to get sentence & its (start, end) char offsets from original text
+-   ✨pySBD as a spaCy component example
+-   🐛 Fix double question mark swallow bug - \#39
diff --git a/README.md b/README.md
@@ -24,6 +24,8 @@ print(seg.segment(text))
 # ['My name is Jonas E. Smith.', 'Please turn to p. 55.']
 ```
 
+-   Use `pysbd` as a [spaCy](https://spacy.io/usage/processing-pipelines) pipeline component. (recommended)<br>Please refer to the example [pysbd\_as\_spacy\_component.py](https://github.com/nipunsadvilkar/pySBD/blob/master/examples/pysbd_as_spacy_component.py)
+
 ## Contributing
 
 If you find a text that is incorrectly segmented using pySBD, please submit an issue.
diff --git a/examples/pysbd_as_spacy_component.py b/examples/pysbd_as_spacy_component.py
@@ -0,0 +1,29 @@
+"""
+Example of pySBD as a sentencizer component for spaCy
+
+Installation:
+pip install spacy
+"""
+import pysbd
+import spacy
+
+def pysbd_sentence_boundaries(doc):
+    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
+    sents_char_spans = seg.segment(doc.text)
+    char_spans = [doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans]
+    start_token_ids = [span[0].idx for span in char_spans if span is not None]
+    for token in doc:
+        token.is_sent_start = True if token.idx in start_token_ids else False
+    return doc
+
+if __name__ == "__main__":
+    text = "My name is Jonas E. Smith.          Please turn to p. 55."
+    nlp = spacy.blank('en')
+
+    # add as a spacy pipeline component
+    nlp.add_pipe(pysbd_sentence_boundaries)
+
+    doc = nlp(text)
+    print('sent_id', 'sentence', sep='\t|\t')
+    for sent_id, sent in enumerate(doc.sents, start=1):
+        print(sent_id, sent.text, sep='\t|\t')
diff --git a/pysbd/abbreviation_replacer.py b/pysbd/abbreviation_replacer.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
-from pysbd.rules import Text
+from pysbd.utils import Text
 # TODO: SENTENCE_STARTERS should be lang specific
 from pysbd.lang.standard import Abbreviation, SENTENCE_STARTERS
 from pysbd.lang.common.numbers import (Common, SingleLetterAbbreviationRules,
@@ -93,5 +93,5 @@ def scan_for_replacements(self, txt, am, ind, char_array):
 
 
 if __name__ == "__main__":
-    s = "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia."
+    s = "fig. ??"
     print(AbbreviationReplacer(s).replace())
diff --git a/pysbd/about.py b/pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.1.5"
+__version__ = "0.2.0"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"
diff --git a/pysbd/clean/rules.py b/pysbd/clean/rules.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from pysbd.rules import Rule
+from pysbd.utils import Rule
 
 
 class CleanRules(object):
diff --git a/pysbd/cleaner.py b/pysbd/cleaner.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
-from pysbd.rules import Text
+from pysbd.utils import Text
 from pysbd.clean.rules import PDF, HTML, CleanRules as cr
 from pysbd.lang.standard import Abbreviation
 
@@ -10,7 +10,6 @@ class Cleaner(object):
     def __init__(self, text, language='common', doc_type=None):
         self.text = text
         self.language = language
-        # self.language_module = Language.get_language_code(language)
         self.doc_type = doc_type
 
     def clean(self):
@@ -116,7 +115,6 @@ def clean_consecutive_characters(self):
 
 
 if __name__ == "__main__":
-    # text = "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot."
-    text = "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:  \n\n1. Organise your pregnancy care early"
+    text = "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot."
     c = Cleaner(text)
     print(c.clean())
diff --git a/pysbd/exclamation_words.py b/pysbd/exclamation_words.py
@@ -18,6 +18,5 @@ def apply_rules(cls, text):
 
 
 if __name__ == "__main__":
-    # text = "Hello .World work for Yahoo! company"
     text = "\"Dinah'll miss me very much to-night, I should think!\"ȸ"
     print(ExclamationWords.apply_rules(text))
diff --git a/pysbd/lang/common/ellipsis.py b/pysbd/lang/common/ellipsis.py
@@ -1,22 +1,24 @@
 # -*- coding: utf-8 -*-
-from pysbd.rules import Rule
+from pysbd.utils import Rule
 
 
 class EllipsisRules(object):
 
+    # NOTE: the rules below differ from the original pragmatic segmenter rules.
+    # modification: each replacement uses as many placeholder symbols as characters replaced
     # Rubular: http://rubular.com/r/i60hCK81fz
-    ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏.')
+    ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')
 
     # Rubular: http://rubular.com/r/Hdqpd90owl
-    FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪ')
+    FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')
 
     # Rubular: http://rubular.com/r/YBG1dIHTRu
-    ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟')
+    ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')
 
     # Rubular: http://rubular.com/r/2VvZ8wRbd8
-    FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝')
+    FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')
 
-    OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪ')
+    OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')
 
     All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
            ThreeConsecutiveRule, OtherThreePeriodRule]
diff --git a/pysbd/lang/common/numbers.py b/pysbd/lang/common/numbers.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
-from pysbd.rules import Rule
+from pysbd.utils import Rule
 
 
 class Common(object):
diff --git a/pysbd/lang/standard.py b/pysbd/lang/standard.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-from pysbd.rules import Rule
+from pysbd.utils import Rule
 
 
 class Standard(object):
@@ -24,8 +24,8 @@ class Standard(object):
 
 class Abbreviation(object):
     """Defines the abbreviations for each language (if available)"""
-    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
-    PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
+    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig']
+    PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig']
     NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
 
 
@@ -82,10 +82,12 @@ class SubSymbolsRules(object):
 
 
 class ReinsertEllipsisRules(object):
-    SubThreeConsecutivePeriod = Rule(r'ƪ', '...')
-    SubThreeSpacePeriod = Rule(r'♟', ' . . . ')
-    SubFourSpacePeriod = Rule(r'♝', '. . . .')
-    SubTwoConsecutivePeriod = Rule(r'☏', '..')
+    # NOTE: the rules below differ from the original pragmatic segmenter rules.
+    # modification: each run of placeholder symbols is replaced with an ellipsis of the same length
+    SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
+    SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
+    SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
+    SubTwoConsecutivePeriod = Rule(r'☏☏', '..')
     SubOnePeriod = Rule(r'∮', '.')
     All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod,
            SubTwoConsecutivePeriod, SubOnePeriod]
diff --git a/pysbd/lists_item_replacer.py b/pysbd/lists_item_replacer.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import string
 import re
-from pysbd.rules import Rule, Text
+from pysbd.utils import Rule, Text
 from functools import partial
 
 
diff --git a/pysbd/processor.py b/pysbd/processor.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
 import os
-from pysbd.rules import Text
+from pysbd.utils import Text, TextSpan
 from pysbd.lists_item_replacer import ListItemReplacer
 from pysbd.languages import Language
 from pysbd.lang.standard import (Standard, Abbreviation,
@@ -17,10 +17,23 @@
 
 class Processor(object):
 
-    def __init__(self, text, language='common'):
+    def __init__(self, text, language='common', char_span=False):
+        """Process a text - do pre and post processing - to get proper sentences
+
+        Parameters
+        ----------
+        text : str
+            Original text
+        language : str, optional
+            by default "common" i.e., english text preprocessing
+        char_span : bool, optional
+            Get start & end character offsets of each sentence
+            within the original text, by default False
+        """
         self.language = language
         self.language_module = Language.get_language_code(language)
         self.text = text
+        self.char_span = char_span
 
     def process(self):
         if not self.text:
@@ -72,28 +85,48 @@ def split_into_segments(self):
             Text(s).apply(Standard.SingleNewLineRule, *EllipsisRules.All)
             for s in sents
         ]
-        sents = [self.check_for_punctuation(s) for s in sents]
+        sents_w_spans = [self.check_for_punctuation(s) for s in sents]
         # flatten list of list of sentences
-        sents = self.rm_none_flatten(sents)
-        sents = [
-            Text(s).apply(*SubSymbolsRules.All)
-            for s in sents
-        ]
-        post_process_sents = [self.post_process_segments(s) for s in sents]
-        post_process_sents = self.rm_none_flatten(post_process_sents)
-        post_process_sents = [
-            Text(s).apply(Standard.SubSingleQuoteRule)
-            for s in post_process_sents
-        ]
-        return post_process_sents
+        sents_w_spans = self.rm_none_flatten(sents_w_spans)
+        new_spans = []
+        for sent_span in sents_w_spans:
+            if sent_span.sent.endswith('ȸ'):
+                sent_span.end = sent_span.end - 1
+            sent_span.sent = Text(sent_span.sent).apply(*SubSymbolsRules.All)
+            post_process_sent = self.post_process_segments(sent_span.sent)
+            if post_process_sent and isinstance(post_process_sent, str):
+                sent_span.sent = post_process_sent
+                new_spans.append(sent_span)
+            elif isinstance(post_process_sent, list):
+                tmp_char_start = sent_span.start
+                for pps in post_process_sent:
+                    new_spans.append(TextSpan(pps, tmp_char_start, tmp_char_start + len(pps)))
+                    tmp_char_start += len(pps)
+        for ns in new_spans:
+            ns.sent = Text(ns.sent).apply(Standard.SubSingleQuoteRule)
+        if self.char_span:
+            return new_spans
+        else:
+            return [s.sent for s in new_spans]
 
     def post_process_segments(self, txt):
         if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
             return txt
-        if self.consecutive_underscore(txt) or len(txt) < 2:
+
+        # the condition below is present in pragmatic segmenter;
+        # its significance is not known yet.
+        # if self.consecutive_underscore(txt) or len(txt) < 2:
+        #     return txt
+
+        if re.match(r'\t', txt):
             pass
-        txt = Text(txt).apply(*ReinsertEllipsisRules.All,
-                              Standard.ExtraWhiteSpaceRule)
+
+        # TODO:
+        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule
+        # removed to retain original text spans
+        # txt = Text(txt).apply(*ReinsertEllipsisRules.All,
+        #                       Standard.ExtraWhiteSpaceRule)
+        txt = Text(txt).apply(*ReinsertEllipsisRules.All)
         if re.search(Common.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
             txt = re.split(
                 Common.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
@@ -136,7 +169,8 @@ def check_for_punctuation(self, txt):
             return sents
         else:
             # NOTE: next steps of check_for_punctuation will unpack this list
-            return [txt]
+            return TextSpan(txt, 0, len(txt))
+            # return [txt]
 
     def process_text(self, txt):
         if txt[-1] not in Standard.Punctuations:
@@ -180,7 +214,10 @@ def sentence_boundary_punctuation(self, txt):
                 self.language_module.ReplaceNonSentenceBoundaryCommaRule)
         # retain exclamation mark if it is an ending character of a given text
         txt = re.sub(r'&ᓴ&$', '!', txt)
-        txt = re.findall(Common.SENTENCE_BOUNDARY_REGEX, txt)
+        txt = [
+            TextSpan(m.group(), m.start(), m.end())
+            for m in re.finditer(Common.SENTENCE_BOUNDARY_REGEX, txt)
+            ]
         return txt
 
 
diff --git a/pysbd/punctuation_replacer.py b/pysbd/punctuation_replacer.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
-from pysbd.rules import Rule, Text
+from pysbd.utils import Rule, Text
 
 
 class EscapeRegexReservedCharacters(object):
diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py
@@ -6,27 +6,47 @@
 
 class Segmenter(object):
 
-    def __init__(self, language="en", clean=False, doc_type=None):
+    def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
+        """Segments a text into a list of sentences
+        with or without character offsets from original text
+
+        Parameters
+        ----------
+        language : str, optional
+            language of the text, given as its two-character ISO 639-1 code,
+            by default "en"
+        clean : bool, optional
+            cleans original text, by default False
+        doc_type : str, optional
+            Normal text or OCRed text, by default None
+            set to `pdf` for OCRed text
+        char_span : bool, optional
+            Get start & end character offsets of each sentence
+            within the original text, by default False
+        """
         self.language = language
         self.language_module = Language.get_language_code(language)
         self.clean = clean
         self.doc_type = doc_type
+        self.char_span = char_span
 
     def segment(self, text):
         if not text:
             return []
-        if self.clean:
+        if self.clean and self.char_span:
+            raise ValueError("char_span must be False if clean is True. "
+                             "Since `clean=True` will modify original text.")
+        elif self.clean:
             text = Cleaner(text, doc_type=self.doc_type).clean()
-        processor = Processor(text)
+        processor = Processor(text, char_span=self.char_span)
         segments = processor.process()
         return segments
 
 
 if __name__ == "__main__":
-    # text = "Proof. First let v ∈ V be incident to at least three leaves and suppose there is a minimum power dominating set S of G that does not contain v. If S excludes two or more of the leaves of G incident to v, then those leaves cannot be dominated or forced at any step. Thus, S excludes at most one leaf incident to v, which means S contains at least two leaves ℓ 1 and ℓ 2 incident to v. Then, (S\{ℓ 1 , ℓ 2 }) ∪ {v} is a smaller power dominating set than S, which is a contradiction. Now consider the case in which v ∈ V is incident to exactly two leaves, ℓ 1 and ℓ 2 , and suppose there is a minimum power dominating set S of G such that {v, ℓ 1 , ℓ 2 } ∩ S = ∅. Then neither ℓ 1 nor ℓ 2 can be dominated or forced at any step, contradicting the assumption that S is a power dominating set. If S is a power dominating set that contains ℓ 1 or ℓ 2 , say ℓ 1 , then (S\{ℓ 1 }) ∪ {v} is also a power dominating set and has the same cardinality. Applying this to every vertex incident to exactly two leaves produces the minimum power dominating set required by (3). Definition 3.4. Given a graph G = (V, E) and a set X ⊆ V , define ℓ r (G, X) as the graph obtained by attaching r leaves to each vertex in X. If X = {v 1 , . . . , v k }, we denote the r leaves attached to vertex v i as ℓ"
-    text = "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012)."
+    text = "My name is Jonas E. Smith. Please turn to p. 55."
     print("Input String:\n{}".format(text))
-    seg = Segmenter(language="en", clean=False)
+    seg = Segmenter(language="en", clean=False, char_span=True)
     segments = seg.segment(text)
     print("\n################## Processing #######################\n")
     print("Number of sentences: {}\n".format(len(segments)))
diff --git a/pysbd/utils.py b/pysbd/utils.py
diff --git a/tests/regression/test_issues.py b/tests/regression/test_issues.py
diff --git a/tests/test_char_span.py b/tests/test_char_span.py
diff --git a/tests/test_english_clean.py b/tests/test_english_clean.py