Skip to content

Commit

Permalink
Refactoring LDIFParser, especially added parsing of change records
Browse files Browse the repository at this point in the history
  • Loading branch information
stroeder committed Jun 20, 2015
1 parent 0b665e2 commit 65cc6fd
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 78 deletions.
12 changes: 11 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ Lib/
ldapurl, more information in some exceptions.
* ldap.ldapobject.LDAPObject:
New convenience methods for SASL GSSAPI or EXTERNAL binds
* Refactored parts in ldif.LDIFParser:
- New instance attributes line_counter and byte_counter contain the
number of lines and bytes of LDIF data read so far
- Renamed some internally used methods
- Added support for parsing change records currently limited to
changetype: modify
- New separate methods parse_entry_records() (also called by parse())
and parse_change_records()
- Stricter order checking of dn:, changetype:, etc.
- Removed non-existent 'AttrTypeandValueLDIF' from ldif.__all__

----------------------------------------------------------------
Released 2.4.19 2015-01-10
Expand Down Expand Up @@ -1158,4 +1168,4 @@ Released 2.0.0pre02 2002-02-01
----------------------------------------------------------------
Released 1.10alpha3 2000-09-19

$Id: CHANGES,v 1.343 2015/06/11 15:13:44 stroeder Exp $
$Id: CHANGES,v 1.344 2015/06/20 14:09:45 stroeder Exp $
274 changes: 197 additions & 77 deletions Lib/ldif.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
See http://www.python-ldap.org/ for details.
$Id: ldif.py,v 1.80 2015/06/05 21:04:58 stroeder Exp $
$Id: ldif.py,v 1.81 2015/06/20 14:09:45 stroeder Exp $
Python compatibility note:
Tested with Python 2.0+, but should work with Python 1.5.2+.
Expand All @@ -15,7 +15,7 @@
# constants
'ldif_pattern',
# functions
'AttrTypeandValueLDIF','CreateLDIF','ParseLDIF',
'CreateLDIF','ParseLDIF',
# classes
'LDIFWriter',
'LDIFParser',
Expand All @@ -40,7 +40,9 @@
ldif_pattern = '^((dn(:|::) %(dn_pattern)s)|(%(attrtype_pattern)s(:|::) .*)$)+' % vars()

MOD_OP_INTEGER = {
'add':0,'delete':1,'replace':2
'add' :0, # ldap.MOD_ADD
'delete' :1, # ldap.MOD_DELETE
'replace':2, # ldap.MOD_REPLACE
}

MOD_OP_STR = {
Expand Down Expand Up @@ -98,7 +100,7 @@ def __init__(self,output_file,base64_attrs=None,cols=76,line_sep='\n'):
self._line_sep = line_sep
self.records_written = 0

def _unfoldLDIFLine(self,line):
def _unfold_lines(self,line):
"""
Write string line as one or more folded lines
"""
Expand All @@ -117,7 +119,7 @@ def _unfoldLDIFLine(self,line):
self._output_file.write(line[pos:min(line_len,pos+self._cols-1)])
self._output_file.write(self._line_sep)
pos = pos+self._cols-1
return # _unfoldLDIFLine()
return # _unfold_lines()

def _needs_base64_encoding(self,attr_type,attr_value):
"""
Expand All @@ -138,9 +140,9 @@ def _unparseAttrTypeandValue(self,attr_type,attr_value):
"""
if self._needs_base64_encoding(attr_type,attr_value):
# Encode with base64
self._unfoldLDIFLine(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')]))
self._unfold_lines(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')]))
else:
self._unfoldLDIFLine(': '.join([attr_type,attr_value]))
self._unfold_lines(': '.join([attr_type,attr_value]))
return # _unparseAttrTypeandValue()

def _unparseEntryRecord(self,entry):
Expand Down Expand Up @@ -240,17 +242,6 @@ class and override method handle() to implement something meaningful.
Counter for records processed so far
"""

def _stripLineSep(self,s):
"""
Strip trailing line separators from s, but no other whitespaces
"""
if s[-2:]=='\r\n':
return s[:-2]
elif s[-1:]=='\n':
return s[:-1]
else:
return s

def __init__(
self,
input_file,
Expand Down Expand Up @@ -280,46 +271,61 @@ def __init__(
self._process_url_schemes = list_dict([s.lower() for s in (process_url_schemes or [])])
self._ignored_attr_types = list_dict([a.lower() for a in (ignored_attr_types or [])])
self._line_sep = line_sep
self.line_counter = 0
self.byte_counter = 0
self.records_read = 0
self._line = self._readline()

def handle(self,dn,entry):
    """
    Hook for processing a single content LDIF record, given its
    distinguished name dn and its entry dictionary.

    The default implementation does nothing; applications using
    LDIFParser should override this method.
    """
    return None

def _readline(self):
s = self._input_file.readline()
self.line_counter = self.line_counter + 1
self.byte_counter = self.byte_counter + len(s)
if s[-2:]=='\r\n':
return s[:-2]
elif s[-1:]=='\n':
return s[:-1]
else:
return s

def _unfoldLDIFLine(self):
def _unfold_lines(self):
"""
Unfold several folded lines (continuation lines start with a single space) into one line
"""
unfolded_lines = [ self._stripLineSep(self._line) ]
self._line = self._input_file.readline()
unfolded_lines = [ self._line ]
self._line = self._readline()
while self._line and self._line[0]==' ':
unfolded_lines.append(self._stripLineSep(self._line[1:]))
self._line = self._input_file.readline()
unfolded_lines.append(self._line[1:])
self._line = self._readline()
return ''.join(unfolded_lines)

def _parseAttrTypeandValue(self):
def _next_key_and_value(self):
"""
Parse a single attribute type and value pair from one or
more lines of LDIF data
"""
# Reading new attribute line
unfolded_line = self._unfoldLDIFLine()
unfolded_line = self._unfold_lines()
# Ignore comments which can also be folded
while unfolded_line and unfolded_line[0]=='#':
unfolded_line = self._unfoldLDIFLine()
if not unfolded_line or unfolded_line=='\n' or unfolded_line=='\r\n':
return None,None
try:
colon_pos = unfolded_line.index(':')
except ValueError:
# Treat malformed lines without colon as non-existent
unfolded_line = self._unfold_lines()
if not unfolded_line:
return None,None
if unfolded_line=='-':
return '-',None
colon_pos = unfolded_line.index(':')
attr_type = unfolded_line[0:colon_pos]
# if needed attribute value is BASE64 decoded
value_spec = unfolded_line[colon_pos:colon_pos+2]
if value_spec=='::':
if value_spec==': ':
attr_value = unfolded_line[colon_pos+2:]
elif value_spec=='::':
# attribute value needs base64-decoding
attr_value = base64.decodestring(unfolded_line[colon_pos+2:])
elif value_spec==':<':
Expand All @@ -332,60 +338,154 @@ def _parseAttrTypeandValue(self):
attr_value = urllib.urlopen(url).read()
elif value_spec==':\r\n' or value_spec=='\n':
attr_value = ''
else:
attr_value = unfolded_line[colon_pos+2:].lstrip()
return attr_type,attr_value

def parse(self):
def parse_entry_records(self):
"""
Continously read and parse LDIF records
Continuously read and parse LDIF entry records
"""
self._line = self._input_file.readline()
k,v = self._next_key_and_value()
if k=='version':
self.version = v
k,v = self._next_key_and_value()
if k==v==None:
k,v = self._next_key_and_value()
else:
self.version = None

while self._line and \
# Loop for processing whole records
while k!=None and \
(not self._max_entries or self.records_read<self._max_entries):

# Reset record
version = None; dn = None; changetype = None; modop = None; entry = {}

attr_type,attr_value = self._parseAttrTypeandValue()

while attr_type!=None and attr_value!=None:
if attr_type=='dn':
# attr type and value pair was DN of LDIF record
if dn!=None:
raise ValueError('Two lines starting with dn: in one record.')
if not is_dn(attr_value):
raise ValueError('No valid string-representation of distinguished name %s.' % (repr(attr_value)))
dn = attr_value
elif attr_type=='version' and dn is None:
version = 1
elif attr_type=='changetype':
# attr type and value pair was DN of LDIF record
if dn is None:
raise ValueError('Read changetype: before getting valid dn: line.')
if changetype!=None:
raise ValueError('Two lines starting with changetype: in one record.')
if not valid_changetype_dict.has_key(attr_value):
raise ValueError('changetype value %s is invalid.' % (repr(attr_value)))
changetype = attr_value
elif attr_value!=None and \
not self._ignored_attr_types.has_key(attr_type.lower()):
# Add the attribute to the entry if not ignored attribute
if entry.has_key(attr_type):
entry[attr_type].append(attr_value)
else:
entry[attr_type]=[attr_value]

# Read the next line within an entry
attr_type,attr_value = self._parseAttrTypeandValue()

# Consume first line which must start with "dn: "
if k!='dn':
raise ValueError('Line %d: First line of record does not start with "dn:": %s' % (self.line_counter,repr(k)))
if not is_dn(v):
raise ValueError('Line %d: Not a valid string-representation for dn: %s.' % (self.line_counter,repr(v)))
dn = v
entry = {}
# Consume second line of record
k,v = self._next_key_and_value()

# Loop for reading the attributes
while k!=None and \
not k.lower() in self._ignored_attr_types:
# Add the attribute to the entry if not ignored attribute
try:
entry[k].append(v)
except KeyError:
entry[k]=[v]
# Read the next line within the record
k,v = self._next_key_and_value()
# Consume empty separator line
k,v = self._next_key_and_value()
if entry:
# append entry to result list
self.handle(dn,entry)
self.records_read = self.records_read+1
self.records_read = self.records_read + 1

return # parse()
return # parse_entry_records()

def parse(self):
    """
    Backward-compatible entry point: delegates to
    LDIFParser.parse_entry_records() and returns its result.
    """
    result = self.parse_entry_records()
    return result # parse()

def handle_change_modify(self,dn,modops,controls=None):
    """
    Process a single LDIF record representing a single modify operation.
    This method should be implemented by applications using LDIFParser.

    dn
        distinguished name of the entry to be modified
    modops
        list of modifications; parse_change_records() builds it as
        (mod_op,attr_type,attr_values) tuples where attr_values is
        None if no values were given
    controls
        optional list of controls; parse_change_records() builds it as
        (control_type,criticality,control_value) tuples

    The default implementation does nothing and returns None.
    """
    # The former statement "controls = [] or None" always evaluated to
    # None and merely overwrote the argument, so it has been dropped.
    return None

def parse_change_records(self):
    """
    Continuously read and parse LDIF change records.

    An optional leading "version:" line is stored in self.version
    (None if there is no such line).  Every record must start with a
    "dn:" line which may be followed by "control:" lines and by a
    "changetype:" line; changetype: modify is assumed if the latter
    is missing.

    Records with changetype: modify are converted into a list of
    (mod_op,attr_type,attr_values) tuples and handed to the modify
    handler together with the DN and the list of
    (control_type,criticality,control_value) tuples.  Records with any
    other valid changetype are read and skipped.

    The number of records per changetype is collected in
    self.changetype_counter and self.records_read is incremented for
    every record.  Parsing stops when no further record is found or
    when the maximum number of entries (if set) has been reached.

    Raises ValueError if a record does not start with a valid dn: line,
    if the changetype: value is invalid or if a mod-op keyword is
    unknown.
    """
    # The original code called self.handle_modify() which this class
    # does not define.  Use such a method if a subclass provides it,
    # otherwise fall back to the hook handle_change_modify().
    modify_handler = getattr(self,'handle_modify',self.handle_change_modify)

    self.changetype_counter = {}
    k,v = self._next_key_and_value()
    if k=='version':
        self.version = v
        k,v = self._next_key_and_value()
        if k is None and v is None:
            # Skip the empty line between version: and the first record
            k,v = self._next_key_and_value()
    else:
        self.version = None

    # Loop for processing whole records
    while k is not None and \
          (not self._max_entries or self.records_read<self._max_entries):

        # Consume first line which must start with "dn: "
        if k!='dn':
            raise ValueError('Line %d: First line of record does not start with "dn:": %s' % (self.line_counter,repr(k)))
        if not is_dn(v):
            raise ValueError('Line %d: Not a valid string-representation for dn: %s' % (self.line_counter,repr(v)))
        dn = v

        # Read "control:" lines
        controls = []
        k,v = self._next_key_and_value()
        while k=='control':
            # Value is "<control-type> [<criticality> [<control-value>]]";
            # parts which are not present are stored as None.
            control_parts = v.split(' ',2)
            control_parts.extend([None]*(3-len(control_parts)))
            controls.append(tuple(control_parts))
            k,v = self._next_key_and_value()

        # Determine changetype first, assuming changetype: modify as default
        changetype = 'modify'
        if k=='changetype':
            if v not in valid_changetype_dict:
                raise ValueError('Invalid changetype: %s' % repr(v))
            changetype = v
            k,v = self._next_key_and_value()

        if changetype=='modify':

            # From here we assume a change record is read with changetype: modify
            modops = []

            # Loop for reading the list of modifications
            while k is not None:
                # Extract attribute mod-operation (add, delete, replace)
                try:
                    modop = MOD_OP_INTEGER[k]
                except KeyError:
                    raise ValueError('Line %d: Invalid mod-op string: %s' % (self.line_counter,repr(k)))
                # we now have the attribute name to be modified
                modattr = v
                modvalues = []
                k,v = self._next_key_and_value()
                while k==modattr:
                    modvalues.append(v)
                    k,v = self._next_key_and_value()
                modops.append((modop,modattr,modvalues or None))
                # NOTE(review): the line which ended the value loop above
                # (expected to be the "-" separator) is discarded here
                # without being checked - confirm that this is intended.
                k,v = self._next_key_and_value()
                if k=='-':
                    # Consume next line
                    k,v = self._next_key_and_value()

            if modops:
                # Pass the modify operation to the handler
                modify_handler(dn,modops,controls)

        else:

            # Consume the unhandled change record
            while k is not None:
                k,v = self._next_key_and_value()

        # The loops above stopped at the record separator (k is None),
        # so this reads the first line of the next record.
        k,v = self._next_key_and_value()

        # Increment record counters
        self.changetype_counter[changetype] = self.changetype_counter.get(changetype,0) + 1
        self.records_read = self.records_read + 1

    return # parse_change_records()


class LDIFRecordList(LDIFParser):
Expand Down Expand Up @@ -451,3 +551,23 @@ def ParseLDIF(f,ignore_attrs=None,maxentries=0):
)
ldif_parser.parse()
return ldif_parser.all_records


if __name__ == '__main__':
    # Simple command-line timing test:
    #   ldif.py <parser class name> <parser method name> <LDIF file> [...]
    # Each LDIF file is parsed with the given method of a fresh instance
    # of the given parser class and some statistics are printed.
    import sys,os,time,pprint
    parser_class_name = sys.argv[1]
    # At module level vars() is the module namespace, so the class is
    # looked up by name among the names defined in this module.
    parser_class = vars()[parser_class_name]
    parser_method_name = sys.argv[2]
    for input_file_name in sys.argv[3:]:
        input_file_size = os.stat(input_file_name).st_size
        input_file = open(input_file_name,'rb')
        try:
            ldif_parser = parser_class(input_file)
            parser_method = getattr(ldif_parser,parser_method_name)
            start_time = time.time()
            parser_method()
            end_time = time.time()
        finally:
            # Close the input file even if parsing raised an exception
            input_file.close()
        # Single-argument print(...) yields the same output with the
        # print statement of Python 2 and the print function of Python 3.
        print('***Time needed: %s seconds' % (end_time-start_time))
        print('***Records read: %s' % (ldif_parser.records_read))
        print('***Lines read: %s' % (ldif_parser.line_counter))
        print('***Bytes read: %s of %s' % (ldif_parser.byte_counter,input_file_size))

0 comments on commit 65cc6fd

Please sign in to comment.