google · jayvdb · Sep 22, 2020 · thatch · Sep 22, 2020 · jayvdb
diff --git a/sre_yield/__init__.py b/sre_yield/__init__.py
@@ -59,6 +59,8 @@ def Not(chars):
     sre_constants.CATEGORY_NOT_DIGIT: Not(string.digits),
     sre_constants.CATEGORY_SPACE: string.whitespace,
     sre_constants.CATEGORY_NOT_SPACE: Not(string.whitespace),
+    sre_constants.CATEGORY_LINEBREAK: "\n",
+    sre_constants.CATEGORY_NOT_LINEBREAK: Not("\n"),
 }
 
 # This constant varies between builds of Python; this is the lower value.
@@ -380,14 +382,15 @@ def in_values(self, items):
         # Special case which distinguishes branch from charset operator
         if items and items[0][0] == sre_constants.NEGATE:
             items = self.branch_values(None, items[1:])
-            return [item for item in self.charset if item not in items]
+            charset = self.category(sre_constants.CATEGORY_NOT_LINEBREAK)
+            return [item for item in charset if item not in items]
         return self.branch_values(None, items)
 
     def not_literal(self, y):
         return self.in_values(((sre_constants.NEGATE,), (sre_constants.LITERAL, y)))
 
     def category(self, y):
-        return CATEGORIES[y]
+        return self.categories[y]
 
     def groupref(self, n):
         self.has_groupref = True
@@ -497,9 +500,19 @@ def __init__(
         if not isinstance(pattern, sre_parse.SubPattern):
             pattern = sre_parse.parse(pattern, flags)
         self.matcher = sre_compile.compile(pattern, flags)
-        if not flags & re.DOTALL:
-            charset = "".join(c for c in charset if c != "\n")
-        self.charset = charset
+        self.categories = CATEGORIES.copy()
+        if flags & re.DOTALL:
+            self.categories[sre_constants.CATEGORY_LINEBREAK] = ""
+            self.categories[sre_constants.CATEGORY_NOT_LINEBREAK] = CHARSET
+
+        if isinstance(charset, dict):
+            self.categories.update(charset)
+
+        elif charset != CHARSET:
+            if not flags & re.DOTALL:
+                charset = "".join(c for c in charset if c != "\n")
+            self.categories[sre_constants.CATEGORY_NOT_LINEBREAK] = charset
+
         self.relaxed = relaxed
 
         self.named_group_lookup = self.matcher.groupindex

diff --git a/sre_yield/tests/test_sre_yield.py b/sre_yield/tests/test_sre_yield.py
@@ -17,6 +17,7 @@
 
 import io
 import re
+import sre_constants
 import sre_parse
 import sys
 import unittest
@@ -46,6 +47,9 @@ def testOtherCases(self):
             300 * 26 * 257,
         )
         self.assertEqual(len(sre_yield.AllStrings("..", charset="0123456789")), 100)
+        categories = sre_yield.CATEGORIES.copy()
+        categories[sre_constants.CATEGORY_WORD] = "0123456789"
+        self.assertEqual(len(sre_yield.AllStrings(r"\w\w", charset=categories)), 100)
         self.assertEqual(len(sre_yield.AllStrings("0*")), 65536)
         # For really big lists, we can't use the len() function any more
         with self.assertRaises(OverflowError):