You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
67 lines
2.0 KiB
67 lines
2.0 KiB
""" |
|
Iterator based sre token scanner |
|
""" |
|
import re |
|
from re import VERBOSE, MULTILINE, DOTALL |
|
import sre_parse |
|
import sre_compile |
|
import sre_constants |
|
from sre_constants import BRANCH, SUBPATTERN |
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = ['Scanner', 'pattern']
|
|
|
# Default regex flags, used by both Scanner and the ``pattern`` decorator.
FLAGS = (VERBOSE | MULTILINE | DOTALL)


class Scanner(object):
    """Match a string against several token patterns in one compiled regex.

    Each lexicon entry is a callable that carries a ``pattern`` attribute
    holding regular-expression source.  The patterns are merged into a
    single alternation in which entry ``i`` is wrapped in group ``i + 1``;
    the ``lastindex`` of a match is then used to look up the entry to call.

    NOTE(review): this is built on the private ``sre_parse`` /
    ``sre_compile`` helpers.  Newer CPython releases renamed
    ``sre_parse.Pattern`` and changed the SUBPATTERN node layout, so
    confirm the target interpreter before relying on this constructor.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile every ``token.pattern`` in *lexicon* into one scanner.

        lexicon -- iterable of callables with a ``pattern`` attribute; they
                   are later invoked as ``token(match, context)`` and must
                   return a ``(rval, next_pos)`` pair.
        flags   -- regex flags used to parse every pattern.

        Errors raised while parsing a pattern (e.g. ``sre_constants.error``)
        propagate to the caller unchanged.
        """
        # Slot 0 is a placeholder: group numbers, and therefore action
        # indexes, start at 1.
        self.actions = [None]
        # Combine phrases into a compound pattern
        state = sre_parse.Pattern()
        state.flags = flags
        branches = []
        for idx, token in enumerate(lexicon):
            parsed = sre_parse.parse(token.pattern, flags)
            branches.append(
                sre_parse.SubPattern(state, [(SUBPATTERN, (idx + 1, parsed))]))
            self.actions.append(token)

        state.groups = len(branches) + 1  # NOTE(guido): Added to make SRE validation work
        compound = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
        self.scanner = sre_compile.compile(compound)

    def iterscan(self, string, idx=0, context=None):
        """Yield ``(rval, end_idx)`` for each match that has an action.

        string  -- text to scan.
        idx     -- index at which scanning starts.
        context -- opaque value handed to every action as its second
                   argument.

        Scanning stops when nothing matches at the current position, or
        when a match ends where the previous step ended (presumably a guard
        against zero-width matches).  Matches whose action slot is ``None``
        are consumed without yielding anything.
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner: the action consumed input
                    # beyond the match, so restart matching from there.
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
|
|
|
|
|
def pattern(pattern, flags=FLAGS):
    """Return a decorator that tags a function with a regex.

    The decorated function gets two attributes: ``pattern`` (the regex
    source given here) and ``regex`` (that source compiled with *flags*).
    The function itself is returned unchanged otherwise.
    """
    def decorator(func):
        # Store the raw source first, then the compiled form.
        setattr(func, 'pattern', pattern)
        setattr(func, 'regex', re.compile(pattern, flags))
        return func

    return decorator