From 65cc6fdb4d8caa83e70ae2b393615236697542ed Mon Sep 17 00:00:00 2001
From: stroeder <stroeder>
Date: Sat, 20 Jun 2015 14:09:45 +0000
Subject: [PATCH] Refactoring LDIFParser, especially added parsing of change
 records

---
 CHANGES     |  12 ++-
 Lib/ldif.py | 274 +++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 208 insertions(+), 78 deletions(-)
diff --git a/CHANGES b/CHANGES
index 86614c8..d1eddd5 100644
--- a/CHANGES
+++ b/CHANGES
@@ -11,6 +11,16 @@ Lib/
   ldapurl, more information in some exceptions.
 * ldap.ldapobject.LDAPObject:
   New convenience methods for SASL GSSAPI or EXTERNAL binds
+* Refactored parts in ldif.LDIFParser:
+  - New instance attributes line_counter and byte_counter contain
+    the amount of LDIF data read so far
+  - Renamed some internally used methods
+  - Added support for parsing change records (currently limited to
+    changetype: modify)
+  - New separate methods parse_entry_records() (also called by parse())
+    and parse_change_records()
+  - Stricter order checking of dn:, changetype:, etc.
+  - Removed non-existent 'AttrTypeandValueLDIF' from ldif.__all__
 
 ----------------------------------------------------------------
 Released 2.4.19 2015-01-10
@@ -1158,4 +1168,4 @@ Released 2.0.0pre02 2002-02-01
 ----------------------------------------------------------------
 Released 1.10alpha3 2000-09-19
 
-$Id: CHANGES,v 1.343 2015/06/11 15:13:44 stroeder Exp $
+$Id: CHANGES,v 1.344 2015/06/20 14:09:45 stroeder Exp $
diff --git a/Lib/ldif.py b/Lib/ldif.py
index bc47040..cbe741e 100644
--- a/Lib/ldif.py
+++ b/Lib/ldif.py
@@ -3,7 +3,7 @@
 
 See http://www.python-ldap.org/ for details.
 
-$Id: ldif.py,v 1.80 2015/06/05 21:04:58 stroeder Exp $
+$Id: ldif.py,v 1.81 2015/06/20 14:09:45 stroeder Exp $
 
 Python compability note:
 Tested with Python 2.0+, but should work with Python 1.5.2+.
@@ -15,7 +15,7 @@
   # constants
   'ldif_pattern',
   # functions
-  'AttrTypeandValueLDIF','CreateLDIF','ParseLDIF',
+  'CreateLDIF','ParseLDIF',
   # classes
   'LDIFWriter',
   'LDIFParser',
@@ -40,7 +40,9 @@
 ldif_pattern = '^((dn(:|::) %(dn_pattern)s)|(%(attrtype_pattern)s(:|::) .*)$)+' % vars()
 
 MOD_OP_INTEGER = {
-  'add':0,'delete':1,'replace':2
+  'add'    :0, # ldap.MOD_ADD
+  'delete' :1, # ldap.MOD_DELETE
+  'replace':2, # ldap.MOD_REPLACE
 }
 
 MOD_OP_STR = {
@@ -98,7 +100,7 @@ def __init__(self,output_file,base64_attrs=None,cols=76,line_sep='\n'):
     self._line_sep = line_sep
     self.records_written = 0
 
-  def _unfoldLDIFLine(self,line):
+  def _unfold_lines(self,line):
     """
     Write string line as one or more folded lines
     """
@@ -117,7 +119,7 @@ def _unfoldLDIFLine(self,line):
         self._output_file.write(line[pos:min(line_len,pos+self._cols-1)])
         self._output_file.write(self._line_sep)
         pos = pos+self._cols-1
-    return # _unfoldLDIFLine()
+    return # _unfold_lines()
 
   def _needs_base64_encoding(self,attr_type,attr_value):
     """
@@ -138,9 +140,9 @@ def _unparseAttrTypeandValue(self,attr_type,attr_value):
     """
     if self._needs_base64_encoding(attr_type,attr_value):
       # Encode with base64
-      self._unfoldLDIFLine(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')]))
+      self._unfold_lines(':: '.join([attr_type,base64.encodestring(attr_value).replace('\n','')]))
     else:
-      self._unfoldLDIFLine(': '.join([attr_type,attr_value]))
+      self._unfold_lines(': '.join([attr_type,attr_value]))
     return # _unparseAttrTypeandValue()
 
   def _unparseEntryRecord(self,entry):
@@ -240,17 +242,6 @@ class and override method handle() to implement something meaningful.
         Counter for records processed so far
   """
 
-  def _stripLineSep(self,s):
-    """
-    Strip trailing line separators from s, but no other whitespaces
-    """
-    if s[-2:]=='\r\n':
-      return s[:-2]
-    elif s[-1:]=='\n':
-      return s[:-1]
-    else:
-      return s
-
   def __init__(
     self,
     input_file,
@@ -280,46 +271,61 @@ def __init__(
     self._process_url_schemes = list_dict([s.lower() for s in (process_url_schemes or [])])
     self._ignored_attr_types = list_dict([a.lower() for a in (ignored_attr_types or [])])
     self._line_sep = line_sep
+    self.line_counter = 0
+    self.byte_counter = 0
     self.records_read = 0
+    self._line = self._readline()
 
   def handle(self,dn,entry):
     """
     Process a single content LDIF record. This method should be
     implemented by applications using LDIFParser.
     """
+    pass
+
+  def _readline(self):
+    s = self._input_file.readline()
+    self.line_counter = self.line_counter + 1
+    self.byte_counter = self.byte_counter + len(s)
+    if s[-2:]=='\r\n':
+      return s[:-2]
+    elif s[-1:]=='\n':
+      return s[:-1]
+    else:
+      return s
 
-  def _unfoldLDIFLine(self):
+  def _unfold_lines(self):
     """
     Unfold several folded lines with trailing space into one line
     """
-    unfolded_lines = [ self._stripLineSep(self._line) ]
-    self._line = self._input_file.readline()
+    unfolded_lines = [ self._line ]
+    self._line = self._readline()
     while self._line and self._line[0]==' ':
-      unfolded_lines.append(self._stripLineSep(self._line[1:]))
-      self._line = self._input_file.readline()
+      unfolded_lines.append(self._line[1:])
+      self._line = self._readline()
     return ''.join(unfolded_lines)
 
-  def _parseAttrTypeandValue(self):
+  def _next_key_and_value(self):
     """
     Parse a single attribute type and value pair from one or
     more lines of LDIF data
     """
     # Reading new attribute line
-    unfolded_line = self._unfoldLDIFLine()
+    unfolded_line = self._unfold_lines()
     # Ignore comments which can also be folded
     while unfolded_line and unfolded_line[0]=='#':
-      unfolded_line = self._unfoldLDIFLine()
-    if not unfolded_line or unfolded_line=='\n' or unfolded_line=='\r\n':
-      return None,None
-    try:
-      colon_pos = unfolded_line.index(':')
-    except ValueError:
-      # Treat malformed lines without colon as non-existent
+      unfolded_line = self._unfold_lines()
+    if not unfolded_line:
       return None,None
+    if unfolded_line=='-':
+      return '-',None
+    colon_pos = unfolded_line.index(':')
     attr_type = unfolded_line[0:colon_pos]
     # if needed attribute value is BASE64 decoded
     value_spec = unfolded_line[colon_pos:colon_pos+2]
-    if value_spec=='::':
+    if value_spec==': ':
+      attr_value = unfolded_line[colon_pos+2:]
+    elif value_spec=='::':
       # attribute value needs base64-decoding
       attr_value = base64.decodestring(unfolded_line[colon_pos+2:])
     elif value_spec==':<':
@@ -332,60 +338,154 @@ def _parseAttrTypeandValue(self):
           attr_value = urllib.urlopen(url).read()
     elif value_spec==':\r\n' or value_spec=='\n':
       attr_value = ''
-    else:
-      attr_value = unfolded_line[colon_pos+2:].lstrip()
     return attr_type,attr_value
 
-  def parse(self):
+  def parse_entry_records(self):
     """
-    Continously read and parse LDIF records
+    Continuously read and parse LDIF entry records
     """
-    self._line = self._input_file.readline()
+    k,v = self._next_key_and_value()
+    if k=='version':
+      self.version = v
+      k,v = self._next_key_and_value()
+      if k==v==None:
+        k,v = self._next_key_and_value()
+    else:
+      self.version = None
 
-    while self._line and \
+    # Loop for processing whole records
+    while k!=None and \
           (not self._max_entries or self.records_read<self._max_entries):
-
-      # Reset record
-      version = None; dn = None; changetype = None; modop = None; entry = {}
-
-      attr_type,attr_value = self._parseAttrTypeandValue()
-
-      while attr_type!=None and attr_value!=None:
-        if attr_type=='dn':
-          # attr type and value pair was DN of LDIF record
-          if dn!=None:
-            raise ValueError('Two lines starting with dn: in one record.')
-          if not is_dn(attr_value):
-            raise ValueError('No valid string-representation of distinguished name %s.' % (repr(attr_value)))
-          dn = attr_value
-        elif attr_type=='version' and dn is None:
-          version = 1
-        elif attr_type=='changetype':
-          # attr type and value pair was DN of LDIF record
-          if dn is None:
-            raise ValueError('Read changetype: before getting valid dn: line.')
-          if changetype!=None:
-            raise ValueError('Two lines starting with changetype: in one record.')
-          if not valid_changetype_dict.has_key(attr_value):
-            raise ValueError('changetype value %s is invalid.' % (repr(attr_value)))
-          changetype = attr_value
-        elif attr_value!=None and \
-             not self._ignored_attr_types.has_key(attr_type.lower()):
-          # Add the attribute to the entry if not ignored attribute
-          if entry.has_key(attr_type):
-            entry[attr_type].append(attr_value)
-          else:
-            entry[attr_type]=[attr_value]
-
-        # Read the next line within an entry
-        attr_type,attr_value = self._parseAttrTypeandValue()
-
+      # Consume first line which must start with "dn: "
+      if k!='dn':
+        raise ValueError('Line %d: First line of record does not start with "dn:": %s' % (self.line_counter,repr(k)))
+      if not is_dn(v):
+        raise ValueError('Line %d: Not a valid string-representation for dn: %s.' % (self.line_counter,repr(v)))
+      dn = v
+      entry = {}
+      # Consume second line of record
+      k,v = self._next_key_and_value()
+
+      # Loop for reading the attributes until the empty separator line
+      while k!=None:
+        # Skip ignored attribute types but keep reading the record
+        if not k.lower() in self._ignored_attr_types:
+          try:
+            entry[k].append(v)
+          except KeyError:
+            entry[k]=[v]
+        # Read the next line within the record
+        k,v = self._next_key_and_value()
+      # Separator line was consumed above, read first line of next record
+      k,v = self._next_key_and_value()
       if entry:
         # append entry to result list
         self.handle(dn,entry)
-        self.records_read = self.records_read+1
+      self.records_read = self.records_read + 1
 
-    return # parse()
+    return # parse_entry_records()
+
+  def parse(self):
+    """
+    Invokes LDIFParser.parse_entry_records() for backward compatibility
+    """
+    return self.parse_entry_records() # parse()
+
+  def handle_change_modify(self,dn,modops,controls=None):
+    """
+    Process a single LDIF record representing a single modify operation.
+    This method should be implemented by applications using LDIFParser.
+    modops is a list of (mod_op,attr_type,values or None) tuples, controls
+    a list of (control_type,criticality,control_value) tuples or None.
+    """
+
+  def parse_change_records(self):
+    """
+    Continuously read and parse LDIF change records
+    """
+    self.changetype_counter = {}
+    k,v = self._next_key_and_value()
+    if k=='version':
+      self.version = v
+      k,v = self._next_key_and_value()
+      if k==v==None:
+        k,v = self._next_key_and_value()
+    else:
+      self.version = None
+
+    # Loop for processing whole records
+    while k!=None and \
+          (not self._max_entries or self.records_read<self._max_entries):
+
+      # Consume first line which must start with "dn: "
+      if k!='dn':
+        raise ValueError('Line %d: First line of record does not start with "dn:": %s' % (self.line_counter,repr(k)))
+      if not is_dn(v):
+        raise ValueError('Line %d: Not a valid string-representation for dn: %s' % (self.line_counter,repr(v)))
+      dn = v
+      # Read "control:" lines
+      controls = []
+      k,v = self._next_key_and_value()
+      while k=='control':
+        try:
+          control_type,criticality,control_value = v.split(' ',2)
+        except ValueError:
+          control_value = None
+          control_type,criticality = v.split(' ',1)
+        controls.append((control_type,criticality,control_value))
+        k,v = self._next_key_and_value()
+      # Determine changetype first, assuming changetype: modify as default
+      changetype = 'modify'
+      # Consume second line of record
+      if k=='changetype':
+        if not v in valid_changetype_dict:
+          raise ValueError('Invalid changetype: %s' % repr(v))
+        changetype = v
+        k,v = self._next_key_and_value()
+
+      if changetype=='modify':
+
+        # From here we assume a change record is read with changetype: modify
+        modops = []
+
+        # Loop for reading the list of modifications
+        while k!=None:
+          # Extract attribute mod-operation (add, delete, replace)
+          try:
+            modop = MOD_OP_INTEGER[k]
+          except KeyError:
+            raise ValueError('Line %d: Invalid mod-op string: %s' % (self.line_counter,repr(k)))
+          # we now have the attribute name to be modified
+          modattr = v
+          modvalues = []
+          k,v = self._next_key_and_value()
+          while k==modattr:
+            modvalues.append(v)
+            k,v = self._next_key_and_value()
+          modops.append((modop,modattr,modvalues or None))
+          k,v = self._next_key_and_value()
+          if k=='-':
+            # Consume next line
+            k,v = self._next_key_and_value()
+
+        if modops:
+          # pass the collected modifications to the application's handler
+          self.handle_change_modify(dn,modops,controls)
+
+      else:
+
+        # Consume the unhandled change record
+        while k!=None:
+          k,v = self._next_key_and_value()
+
+      # Separator line was consumed above, read first line of next record
+      k,v = self._next_key_and_value()
+
+      # Increment record counters
+      self.changetype_counter[changetype] = self.changetype_counter.get(changetype,0) + 1
+      self.records_read = self.records_read + 1
+
+    return # parse_change_records()
 
 
 class LDIFRecordList(LDIFParser):
@@ -451,3 +551,23 @@ def ParseLDIF(f,ignore_attrs=None,maxentries=0):
   )
   ldif_parser.parse()
   return ldif_parser.all_records
+
+
+if __name__ == '__main__':
+  import sys,os,time,pprint
+  parser_class_name = sys.argv[1]
+  parser_class = vars()[parser_class_name]
+  parser_method_name = sys.argv[2]
+  for input_file_name in sys.argv[3:]:
+    input_file_size = os.stat(input_file_name).st_size
+    input_file = open(input_file_name,'rb')
+    ldif_parser = parser_class(input_file)
+    parser_method = getattr(ldif_parser,parser_method_name)
+    start_time = time.time()
+    parser_method()
+    end_time = time.time()
+    input_file.close()
+    print '***Time needed:',end_time-start_time,'seconds'
+    print '***Records read:',ldif_parser.records_read
+    print '***Lines read:',ldif_parser.line_counter
+    print '***Bytes read:',ldif_parser.byte_counter,'of',input_file_size