faster implementation of ldap.schema.tokenizer.split_tokens()
stroeder committed Feb 20, 2017
1 parent 909b2ea commit adb6b47
Showing 2 changed files with 44 additions and 42 deletions.
4 changes: 3 additions & 1 deletion CHANGES
@@ -2,6 +2,8 @@
Released 2.4.33 2017-02-xx

Lib/
+* faster implementation of ldap.schema.tokenizer.split_tokens()
+  (thanks to Christian Heimes)
* removed unused 2nd argument of ldap.schema.tokenizer.split_tokens()

Tests/
@@ -1345,4 +1347,4 @@ Released 2.0.0pre02 2002-02-01
----------------------------------------------------------------
Released 1.10alpha3 2000-09-19

-$Id: CHANGES,v 1.411 2017/02/19 12:36:21 stroeder Exp $
+$Id: CHANGES,v 1.412 2017/02/20 10:25:47 stroeder Exp $
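
For context, a quick usage sketch of the function this commit rewrites; the
input string and the expected output are illustrative assumptions, not part
of the commit:

    # Hypothetical demo: split_tokens() strips quotes and drops "$"
    # separators while keeping parentheses as their own tokens.
    from ldap.schema.tokenizer import split_tokens

    print(split_tokens("( 1.2.3 NAME 'x' SUP ( a $ b ) )"))
    # -> ['(', '1.2.3', 'NAME', 'x', 'SUP', '(', 'a', 'b', ')', ')']
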
82 changes: 41 additions & 41 deletions Lib/ldap/schema/tokenizer.py
@@ -3,50 +3,51 @@
See http://www.python-ldap.org/ for details.
-\$Id: tokenizer.py,v 1.14 2017/02/18 15:32:01 stroeder Exp $
+\$Id: tokenizer.py,v 1.15 2017/02/20 10:25:47 stroeder Exp $
"""

import re

+TOKENS_FINDALL = re.compile(
+    r"(\()"           # opening parenthesis
+    r"|"              # or
+    r"(\))"           # closing parenthesis
+    r"|"              # or
+    r"([^'$()\s]+)"   # string of length >= 1 without '$() or whitespace
+    r"|"              # or
+    r"('.*?'(?!\w))"  # any string or empty string surrounded by single quotes
+                      # except if right quote is succeeded by alphanumeric char
+    r"|"              # or
+    r"([^\s]+?)",     # residue, all non-whitespace strings
+).findall


def split_tokens(s):
"""
Returns list of syntax elements with quotes and spaces
stripped.
"""
result = []
result_append = result.append
s_len = len(s)
i = 0
while i<s_len:
start = i
while i<s_len and s[i]!="'":
if s[i]=="(" or s[i]==")":
if i>start:
result_append(s[start:i])
result_append(s[i])
i +=1 # Consume parentheses
start = i
elif s[i]==" " or s[i]=="$":
if i>start:
result_append(s[start:i])
i +=1
# Consume more space chars
while i<s_len and s[i]==" ":
i +=1
start = i
else:
i +=1
if i>start:
result_append(s[start:i])
i +=1
if i>=s_len:
break
start = i
while i<s_len and s[i]!="'":
i +=1
if i>=start:
result_append(s[start:i])
i +=1
return result # split_tokens()
"""
Returns list of syntax elements with quotes and spaces
stripped.
"""
parts = []
parens = 0
for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
if unquoted:
parts.append(unquoted)
elif quoted:
parts.append(quoted[1:-1])
elif opar:
parens += 1
parts.append(opar)
elif cpar:
parens -= 1
parts.append(cpar)
elif residue == '$':
if not parens:
raise ValueError("'$' outside parenthesis")
else:
raise ValueError(residue, s)
if parens:
raise ValueError('Unbalanced parenthesis in %r' % s)
return parts


def extract_tokens(l,known_tokens):
@@ -82,4 +83,3 @@ def extract_tokens(l,known_tokens):
    else:
      i += 1 # Consume unrecognized item
  return result
-
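
The rewritten function also validates its input: a "$" outside parentheses or
an unbalanced parenthesis now raises ValueError, cases the character-scanning
loop it replaces accepted silently. As for the speedup claimed in the commit
message, here is a rough timing sketch (not from the commit; the input string
and iteration count are arbitrary choices):

    # Hypothetical micro-benchmark; run it before and after this commit
    # to compare the old and new implementations.
    import timeit

    SETUP = "from ldap.schema.tokenizer import split_tokens"
    STMT = "split_tokens(\"( 2.5.4.3 NAME ( 'cn' 'commonName' ) SUP name )\")"
    print(timeit.timeit(STMT, setup=SETUP, number=100000))
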
