import re

# One pass over the schema element string splits it into tokens.  The
# alternatives are tried left to right, so the common cases come first;
# anything that falls through to the last alternative is a single
# unexpected character ("residue") handled explicitly in split_tokens().
TOKENS_FINDALL = re.compile(
    r"(\()"           # opening parenthesis
    r"|"              # or
    r"(\))"           # closing parenthesis
    r"|"              # or
    r"([^'$()\s]+)"   # unquoted string without ', $, parentheses or whitespace
    r"|"              # or
    r"('.*?'(?!\w))"  # single-quoted (possibly empty) string; the lookahead
                      # rejects a closing quote glued to an alphanumeric char
    r"|"              # or
    r"(\S)",          # residue: one single unexpected non-whitespace char
).findall


def split_tokens(s):
    """
    Split schema element string *s* into a list of syntax elements.

    Surrounding single quotes of quoted strings are stripped, whitespace
    and '$' separators are dropped, and parentheses are returned as their
    own '(' / ')' tokens.

    Raises ValueError on a '$' outside parentheses, on an unexpected
    character, or on unbalanced or misordered parentheses.
    """
    parts = []
    parens = 0
    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
        if unquoted:
            parts.append(unquoted)
        elif quoted:
            # strip the surrounding single quotes
            parts.append(quoted[1:-1])
        elif opar:
            parens += 1
            parts.append(opar)
        elif cpar:
            # Reject a ')' with no matching '(' immediately; merely
            # decrementing would let misordered input like ") (" slip
            # through the final balance check (count returns to zero).
            if not parens:
                raise ValueError('Unbalanced parenthesis in %r' % s)
            parens -= 1
            parts.append(cpar)
        elif residue == '$':
            # '$' is a separator only inside parentheses and is dropped
            if not parens:
                raise ValueError("'$' outside parenthesis in %r" % s)
        else:
            raise ValueError('Unexpected character %r in %r' % (residue, s))
    if parens:
        raise ValueError('Unbalanced parenthesis in %r' % s)
    return parts