Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions sre_yield/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def Not(chars):
sre_constants.CATEGORY_NOT_DIGIT: Not(string.digits),
sre_constants.CATEGORY_SPACE: string.whitespace,
sre_constants.CATEGORY_NOT_SPACE: Not(string.whitespace),
sre_constants.CATEGORY_LINEBREAK: "\n",
sre_constants.CATEGORY_NOT_LINEBREAK: Not("\n"),
}

# This constant varies between builds of Python; this is the lower value.
Expand Down Expand Up @@ -380,14 +382,15 @@ def in_values(self, items):
# Special case which distinguishes branch from charset operator
if items and items[0][0] == sre_constants.NEGATE:
items = self.branch_values(None, items[1:])
return [item for item in self.charset if item not in items]
charset = self.category(sre_constants.CATEGORY_NOT_LINEBREAK)
return [item for item in charset if item not in items]
return self.branch_values(None, items)

def not_literal(self, y):
return self.in_values(((sre_constants.NEGATE,), (sre_constants.LITERAL, y)))

def category(self, y):
return CATEGORIES[y]
return self.categories[y]

def groupref(self, n):
self.has_groupref = True
Expand Down Expand Up @@ -497,9 +500,19 @@ def __init__(
if not isinstance(pattern, sre_parse.SubPattern):
pattern = sre_parse.parse(pattern, flags)
self.matcher = sre_compile.compile(pattern, flags)
if not flags & re.DOTALL:
charset = "".join(c for c in charset if c != "\n")
self.charset = charset
self.categories = CATEGORIES.copy()
if flags & re.DOTALL:
self.categories[sre_constants.CATEGORY_LINEBREAK] = ""
self.categories[sre_constants.CATEGORY_NOT_LINEBREAK] = CHARSET
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be `= charset` (lowercase, the constructor argument) rather than `= CHARSET` (the module-level constant).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Otherwise, does this approach seem reasonable, and worth polishing?


if isinstance(charset, dict):
self.categories.update(charset)

elif charset != CHARSET:
if not flags & re.DOTALL:
charset = "".join(c for c in charset if c != "\n")
self.categories[sre_constants.CATEGORY_NOT_LINEBREAK] = charset

self.relaxed = relaxed

self.named_group_lookup = self.matcher.groupindex
Expand Down
4 changes: 4 additions & 0 deletions sre_yield/tests/test_sre_yield.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import io
import re
import sre_constants
import sre_parse
import sys
import unittest
Expand Down Expand Up @@ -46,6 +47,9 @@ def testOtherCases(self):
300 * 26 * 257,
)
self.assertEqual(len(sre_yield.AllStrings("..", charset="0123456789")), 100)
categories = sre_yield.CATEGORIES.copy()
categories[sre_constants.CATEGORY_WORD] = "0123456789"
self.assertEqual(len(sre_yield.AllStrings(r"\w\w", charset=categories)), 100)
self.assertEqual(len(sre_yield.AllStrings("0*")), 65536)
# For really big lists, we can't use the len() function any more
with self.assertRaises(OverflowError):
Expand Down