ECN · campb303 · Aug 2, 2021 · Jun 28, 2021 · Jun 28, 2021 · Jun 28, 2021
diff --git a/setup.py b/setup.py
@@ -39,7 +39,8 @@ def get_all_dependencies():
         # Custom version of python-ldap without SASL requirements
         "python-ldap @ git+https://github.itap.purdue.edu/ECN/python-ldap/@python-ldap-3.3.1",
         "easyad",
-        "dataclasses"
+        "dataclasses",
+        "pyparsing"
     ],
     extras_require={
         "dev": conditional_dependencies["dev"],

diff --git a/src/webqueue2api/parser/errors.py b/src/webqueue2api/parser/errors.py
@@ -6,4 +6,9 @@ def __init__(self, path: str):
 class QueueDoesNotExistError(Exception):
+    # Exception whose message reads "Directory {path} not found.".
+    # The formatted text is also kept on the instance as `self.message`.
     def __init__(self, path: str):
         self.message = f"Directory {path} not found."
+        # Hand the message to Exception so str(error) shows it.
+        super().__init__(self.message)
+
+class ParseError(Exception):
+    """Exception reporting a parsing failure together with its line number.
+
+    Args:
+            line_number (int): Line number appended to the message.
+            message (str): Description of the failure. Defaults to "Unable to parse item."
+
+    The final text, "{message} at line {line_number}", is stored on
+    `self.message` and passed to Exception.
+    """
+    def __init__(self, line_number: int, message: str = "Unable to parse item."):
+        self.message = f"{message} at line {line_number}"
         super().__init__(self.message)
diff --git a/src/webqueue2api/parser/item.py b/src/webqueue2api/parser/item.py
diff --git a/src/webqueue2api/parser/parser.py b/src/webqueue2api/parser/parser.py
@@ -0,0 +1,331 @@
+import pyparsing as pp
+import json
+import string
+import email
+import email.errors
+import datetime
+from .utils import format_date_string
+from .errors import ParseError
+
+
+# Shared accumulator: the parse actions in this module append one dictionary
+# per parsed section (or per parse error) to this list while the grammar runs.
+parsed_item = []
+
+
+
+################################################################################
+# Delimiters
+################################################################################
+# Text that opens and closes an action header line.
+action_start_delimiter = "*** "
+action_end_delimiter = "***"
+
+# Full opening text of each action header, built on action_start_delimiter.
+edit_start_delimiter = action_start_delimiter + "Edited by: "
+status_start_delimiter = action_start_delimiter + "Status updated by: "
+reply_to_user_delimiter = action_start_delimiter + "Replied by: "
+# Banner lines that open and close a reply-from-user section.
+reply_from_user_start_delimiter = "=== Additional information supplied by user ==="
+reply_from_user_end_delimiter = "==============================================="
+
+
+
+################################################################################
+# Parse Actions: Callbacks for Rules
+################################################################################
+def parse_section_by_type(section_type):
+    """Returns a function to parse a section based on the section type.
+
+    Args:
+            section_type (str): The type of section to parse.
+            Can be "reply_to_user" | "edit" | "reply_from_user" | "status" | "directory_information" | "initial_message"
+
+    Returns:
+            function: A PyParsing parse action that converts the matched tokens to a dictionary tagged with section_type and appends it to the module-level parsed_item list.
+    """
+
+    def parse_section(original_string: str, match_start_index: int, tokens: pp.ParseResults) -> None:
+        """Parses section and adds to global section list.
+
+        Args:
+                original_string (string): The original string passed to PyParsing
+                match_start_index (int): The character index where the match starts.
+                tokens (pyparsing.ParseResults): The PyParsing results.
+
+        Raises:
+                ParseError: If unexpected formatting is encountered (malformed or missing reply-from-user headers, or a date that cannot be formatted).
+        """
+        tokens_dictionary = tokens.asDict()
+        tokens_dictionary["type"] = section_type
+
+        # Remove the empty key. pop() is used because deleting entries while
+        # iterating over the dictionary raises RuntimeError.
+        tokens_dictionary.pop("", None)
+
+        # Parse reply-from-user headers
+        if section_type == "reply_from_user":
+            _parse_reply_from_user_headers(original_string, match_start_index, tokens_dictionary)
+
+        # Format date header
+        if "datetime" in tokens_dictionary:
+            raw_date = tokens_dictionary["datetime"]
+            try:
+                formatted_date = format_date_string(raw_date)
+            except (ValueError, OverflowError):
+                # Only the raw date text is used here so this handler works for
+                # every section type, not just those that carry email headers.
+                line_number = _line_number_of(original_string, raw_date, match_start_index)
+                parsed_item.append({
+                    "type": "parse_error",
+                    "datetime": format_date_string(str(datetime.datetime.now())),
+                    "expected": "Expected a date and/or time",
+                    "got": raw_date,
+                    "line_num": line_number
+                })
+                raise ParseError(line_number, "Could not format date header")
+
+            tokens_dictionary["datetime"] = formatted_date
+
+        # Convert content string to list of lines
+        if "content" in tokens_dictionary:
+            stripped_content = tokens_dictionary["content"][0].strip()
+            tokens_dictionary["content"] = stripped_content.splitlines(keepends=True)
+
+        parsed_item.append(tokens_dictionary)
+        return
+
+    return parse_section
+
+
+def _line_number_of(original_string, fragment, search_from=0):
+    """Returns the 1-based line number where fragment first occurs at or after search_from.
+
+    Falls back to the line containing search_from when fragment is not found.
+    """
+    fragment_index = original_string.find(fragment, search_from)
+    if fragment_index == -1:
+        fragment_index = search_from
+    return original_string.count("\n", 0, fragment_index) + 1
+
+
+def _parse_reply_from_user_headers(original_string, match_start_index, tokens_dictionary):
+    """Expands the raw reply-from-user header text of tokens_dictionary in place.
+
+    Adds "datetime", "subject", "from_name", "from_email" and "cc" entries for
+    the headers that are present and replaces "headers" with a list of
+    {"type", "content"} dictionaries.
+
+    Raises:
+            ParseError: If a header line is malformed or no Date header exists.
+    """
+    raw_headers = tokens_dictionary["headers"]
+    headers = email.message_from_string(raw_headers)
+
+    # message_from_string() parses with the default compat32 policy, which
+    # records defects on the message instead of raising them, so the defect
+    # list has to be inspected explicitly.
+    missing_separator = any(
+        isinstance(defect, email.errors.MissingHeaderBodySeparatorDefect)
+        for defect in headers.defects
+    )
+    if missing_separator:
+        for line in raw_headers.splitlines(keepends=True):
+            # Lines starting with a space or tab are folded continuations of
+            # the previous header, not malformed headers.
+            if ":" not in line and not line.startswith((" ", "\t")):
+                line_number = _line_number_of(original_string, line, match_start_index)
+                parsed_item.append({
+                    "type": "parse_error",
+                    "datetime": format_date_string(str(datetime.datetime.now())),
+                    "expected": "Header information with a key/value pair seperated by a colon or a newline to seperate the header from the content",
+                    "got": line,
+                    "line_num": line_number
+                })
+                raise ParseError(line_number, f"{line} is a malfomred header or the start of message content without a newline")
+
+    if "Date" not in headers.keys():
+        content_start = tokens_dictionary["content"][0].strip().split("\n", 1)[0]
+        # Search from the start of this section so identical text earlier in
+        # the item does not produce the wrong line number.
+        line_number = _line_number_of(original_string, content_start, match_start_index)
+        # Record the error like every other failure path so callers that
+        # swallow ParseError still see it in the parsed result.
+        parsed_item.append({
+            "type": "parse_error",
+            "datetime": format_date_string(str(datetime.datetime.now())),
+            "expected": "A Date header in the reply from user section",
+            "got": content_start,
+            "line_num": line_number
+        })
+        raise ParseError(line_number, "Expected a 'Date' header in the reply from user section")
+
+    headers_list = [{"type": key, "content": headers[key]} for key in headers.keys()]
+
+    for header in headers_list:
+        if header["type"] == "Date":
+            tokens_dictionary["datetime"] = header["content"]
+        elif header["type"] == "Subject":
+            tokens_dictionary["subject"] = header["content"]
+        elif header["type"] == "From":
+            user_name, user_email = email.utils.parseaddr(header["content"])
+            tokens_dictionary["from_name"] = user_name
+            tokens_dictionary["from_email"] = user_email
+        elif header["type"].lower() == "cc":
+            tokens_dictionary["cc"] = [
+                {"name": user_name, "email": user_email}
+                for user_name, user_email in email.utils.getaddresses([header["content"]])
+            ]
+
+    tokens_dictionary["headers"] = headers_list
+
+def check_for_nested_action(original_string, match_start_index, tokens):
+    """Rejects reply-from-user content that contains the start of another section.
+
+    Args:
+            original_string (string): The original string passed to PyParsing
+            match_start_index (int): The character index where the match starts.
+            tokens (pyparsing.ParseResults): The PyParsing results.
+
+    Raises:
+            ParseError: If the matched text contains an edit, status, reply-to-user
+            or reply-from-user start delimiter. A parse_error dictionary is
+            appended to parsed_item before raising.
+    """
+    reply_body = tokens[0]
+    nesting_markers = (
+        edit_start_delimiter,
+        status_start_delimiter,
+        reply_to_user_delimiter,
+        reply_from_user_start_delimiter,
+    )
+
+    for marker in nesting_markers:
+        marker_offset = reply_body.find(marker)
+        if marker_offset == -1:
+            continue
+
+        # Line of the marker = lines before the match + lines inside the match
+        # that precede the marker, counted from 1.
+        lines_before_match = original_string[:match_start_index].count("\n")
+        lines_before_marker = reply_body[:marker_offset].count("\n")
+        line_number = 1 + lines_before_match + lines_before_marker
+
+        description = f"Found nested action '{marker}' in reply from user"
+        parsed_item.append({
+            "type": "parse_error",
+            "datetime": format_date_string(str(datetime.datetime.now())),
+            "expected": reply_from_user_end_delimiter,
+            "got": description,
+            "line_num": line_number
+        })
+        raise ParseError(line_number, description)
+    return
+
+def error_handler(original_string, match_start_index, tokens):
+    """Records a parse_error for text no section rule matched, then raises.
+
+    Args:
+            original_string (string): The original string passed to PyParsing
+            match_start_index (int): The character index where the match starts.
+            tokens (pyparsing.ParseResults): The PyParsing results; the unmatched line is read from tokens[0][0].
+
+    Raises:
+            ParseError: Always, after appending a parse_error dictionary to parsed_item.
+    """
+    unmatched_line = tokens[0][0]
+    end_offset = original_string[match_start_index:].find(reply_from_user_end_delimiter)
+
+    if unmatched_line != reply_from_user_start_delimiter:
+        # The line is not a reply-from-user banner: report it as a missing action delimiter.
+        expected = f"Action delimiter starting with '{action_start_delimiter}' and ending with '{action_end_delimiter}' or {reply_from_user_start_delimiter}"
+        got = unmatched_line
+        line_number = original_string[:match_start_index].count('\n') + 1
+        message = "No action start delimiter found"
+    elif end_offset == -1:
+        # The reply was opened but its closing banner never appears.
+        expected = reply_from_user_end_delimiter
+        got = "End of file"
+        line_number = original_string.count('\n') + 1
+        message = "No reply from user end delimiter found"
+    else:
+        # Both banners are present, so the blank line after the headers is what is missing.
+        expected = "\n\n"
+        got = reply_from_user_end_delimiter
+        line_number = original_string[:end_offset + match_start_index].count("\n") + 1
+        message = "No newline found after header information"
+
+    parsed_item.append({
+        "type": "parse_error",
+        "datetime": format_date_string(str(datetime.datetime.now())),
+        "expected": expected,
+        "got": got,
+        "line_num": line_number
+    })
+    raise ParseError(line_number, message)
+
+
+################################################################################
+# Rules
+################################################################################
+# Matches the text before the first blank line; SkipTo stops in front of the
+# "\n\n" without consuming it. No parse action is attached to this rule.
+header_rule = pp.SkipTo("\n\n").leaveWhitespace()
+
+# Optional "Key: value" lines, each tried once in the order listed. Every Group
+# holds the key literal and the rest of that line (the colon is suppressed), and
+# pp.Dict exposes those pairs by key name. The parse action records the result
+# as a "directory_information" section.
+# NOTE(review): pp.White treats its argument as a set of characters, so
+# pp.White("\n\n") presumably matches one or more newlines rather than exactly
+# two — confirm.
+directory_rule = pp.Dict(
+    pp.White("\n").suppress() +
+    pp.Optional(pp.Group("Name" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Login" + pp.Literal(":").suppress() +  pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Computer" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Location" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Email" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Phone" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Office" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("UNIX Dir" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Zero Dir" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("User ECNDB" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Host ECNDB" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.Optional(pp.Group("Subject" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
+    pp.White("\n\n").suppress()
+).setParseAction(parse_section_by_type("directory_information"))
+
+# Captures, under the results name "content", everything up to the first
+# reply-from-user banner or action start delimiter; if neither occurs, the text
+# through the end of the string. Recorded as an "initial_message" section.
+initial_message_rule = pp.Group(
+    pp.SkipTo(
+        pp.Literal(reply_from_user_start_delimiter)
+        | pp.Literal(action_start_delimiter)
+    ) | pp.SkipTo(pp.StringEnd(), include=True)
+).leaveWhitespace().setResultsName("content").setParseAction(parse_section_by_type("initial_message"))
+
+# A reply from the user, in this order:
+#   1. the start banner and the line ends after it (suppressed),
+#   2. "headers": the text up to the first blank line,
+#   3. "content": the text up to the end banner, checked by
+#      check_for_nested_action for delimiters of other sections,
+#   4. the end banner and any following line ends (suppressed).
+# Recorded as a "reply_from_user" section.
+reply_from_user_rule = (
+    (reply_from_user_start_delimiter + pp.OneOrMore(pp.LineEnd())).suppress() +
+    pp.SkipTo("\n\n").setResultsName("headers") +
+    (pp.Group(pp.SkipTo(reply_from_user_end_delimiter + pp.LineEnd()).setParseAction(check_for_nested_action)).setResultsName("content")) +
+    (pp.Literal(reply_from_user_end_delimiter) + pp.LineEnd()).suppress() +
+    pp.ZeroOrMore(pp.LineEnd()).suppress()
+).leaveWhitespace().setParseAction(parse_section_by_type("reply_from_user"))
+
+# A line starting with "*** Replied by: ", followed by:
+#   "by"       - a run of alphanumeric characters,
+#   " at: "    - suppressed,
+#   "datetime" - a run of digits, "/", "-", ":" and spaces,
+#   "***" and the line end - suppressed,
+#   "content"  - the text up to the next line that starts a reply from the
+#                user or another action, or through the end of the string.
+# Recorded as a "reply_to_user" section.
+reply_to_user_rule = (
+    pp.LineStart() +
+    pp.Literal(reply_to_user_delimiter).suppress() + 
+    pp.Word(pp.alphanums).setResultsName("by")+
+    pp.Literal(" at: ").suppress() +
+    pp.Word(pp.nums + "/-: ").setResultsName("datetime") +
+    (pp.Literal(action_end_delimiter) + pp.LineEnd()).suppress() +
+    pp.Group(
+        pp.SkipTo(
+            (pp.LineStart() + pp.Literal(reply_from_user_start_delimiter))
+            | (pp.LineStart() + pp.Literal(action_start_delimiter))
+        ) | pp.SkipTo(pp.StringEnd(), include=True)
+    ).setResultsName("content") 
+).leaveWhitespace().setParseAction(parse_section_by_type("reply_to_user"))
+
+# A line starting with "*** Edited by: ", followed by "by" (alphanumeric run),
+# " at: ", "datetime" (digits, "/", "-", ":" and spaces), the closing "***" and
+# line end, then "content" up to the next line that starts a reply from the
+# user or another action, or through the end of the string.
+# Recorded as an "edit" section.
+edit_rule = (
+    pp.LineStart() +
+    pp.Literal(edit_start_delimiter).suppress() + 
+    pp.Word(pp.alphanums).setResultsName("by") +
+    pp.Literal(" at: ").suppress() +
+    pp.Word(pp.nums + "/-: ").setResultsName("datetime") +
+    (pp.Literal(action_end_delimiter) + pp.LineEnd()).suppress() +
+    pp.Group(
+        pp.SkipTo(
+            (pp.LineStart() + pp.Literal(reply_from_user_start_delimiter))
+            | (pp.LineStart() + pp.Literal(action_start_delimiter))
+        ) | pp.SkipTo(pp.StringEnd(), include=True)
+    ).setResultsName("content") 
+).leaveWhitespace().setParseAction(parse_section_by_type("edit"))
+
+# A line starting with "*** Status updated by: ", followed by "by"
+# (alphanumeric run), " at: ", "datetime" (digits, "/", "-", ":" and spaces),
+# the closing "***" and line end, then "content" up to the next line that
+# starts a reply from the user or another action, or through the end of the
+# string. Recorded as a "status" section.
+status_rule = (
+    pp.LineStart() +
+    pp.Literal(status_start_delimiter).suppress() + 
+    pp.Word(pp.alphanums).setResultsName("by") +
+    pp.Literal(" at: ").suppress() +
+    pp.Word(pp.nums + "/-: ").setResultsName("datetime") +
+    (pp.Literal(action_end_delimiter) + pp.LineEnd()).suppress() +
+    pp.Group(
+        pp.SkipTo(
+            (pp.LineStart() + pp.Literal(reply_from_user_start_delimiter))
+            | (pp.LineStart() + pp.Literal(action_start_delimiter))
+        ) | pp.SkipTo(pp.StringEnd(), include=True)
+    ).setResultsName("content") 
+).leaveWhitespace().setParseAction(parse_section_by_type("status"))
+
+# Captures the rest of the current line and passes it to error_handler, which
+# records a parse_error and raises ParseError.
+error_rule = pp.Group(
+    pp.SkipTo(pp.LineEnd())
+).setParseAction(error_handler)
+
+# Grammar for a whole item: header text, an optional directory block, the
+# initial message, then any number of reply/edit/status sections tried in the
+# order listed. Whatever remains afterwards is given to error_rule.
+# NOTE(review): .suppress() drops the directory tokens from the parse result;
+# the directory_rule parse action presumably still runs — confirm.
+item_rule = (
+    header_rule + 
+    pp.Optional(directory_rule).suppress() + 
+    initial_message_rule + 
+    pp.ZeroOrMore(
+        reply_from_user_rule 
+        | reply_to_user_rule 
+        | edit_rule 
+        | status_rule
+    ) + pp.Optional(error_rule)
+)
+
+
+
+def parse_item(item_body: str, raise_on_error: bool = False) -> list:
+    """Accepts string of an Item body and returns JSON serializable list of dictionary with formatted action types.
+
+    Args:
+            item_body (str): The string of the item to be parsed.
+            raise_on_error (bool): If true, a ParseError is raised when parsing error encountered. Otherwise, a parse error dictionary is added to the return value before being returned. Defaults to False.
+
+    Returns:
+            list: List of actions as ordered in the item. Does not include initial message metadata or assignments. A new list is returned on every call.
+
+    Raises:
+            ParseError: If raise_on_error is True, raises ParseError when parsing error occurs. Otherwise adds parse_error action to return value.
+    """
+    # The parse actions accumulate into the module-level list, so empty it
+    # first; otherwise sections from earlier calls leak into this result.
+    parsed_item.clear()
+
+    try:
+        item_rule.parseString(item_body)
+    except ParseError:
+        # The parse actions have already recorded a parse_error dictionary, so
+        # swallowing the exception here loses no information.
+        if raise_on_error:
+            raise
+
+    # Return a copy so the next call's clear() cannot empty a list that a
+    # previous caller still holds.
+    return list(parsed_item)
diff --git a/src/webqueue2api/parser/utils.py b/src/webqueue2api/parser/utils.py
@@ -0,0 +1,26 @@
+"""Shared utilities for the parser package"""
+
+from dateutil import parser, tz
+from datetime import datetime
+
+def format_date_string(date: str) -> str:
+    """Returns the date/time formatted as ISO 8601 using "%Y-%m-%dT%H:%M:%S%z".
+    Returns empty string if the string argument passed to the function cannot be parsed as a datetime.
+    See: https://en.wikipedia.org/wiki/ISO_8601
+
+    Args:
+            date (str): Date/time text in any format dateutil can parse.
+
+    Returns:
+            str: Properly formatted date/time received, e.g. YYYY-MM-DDTHH:MM:SS-0400 (the UTC offset is written without a colon), or empty string.
+    """
+    try:
+        # This date is never meant to be used. The default attribute is just to set timezone.
+        # NOTE(review): tz.gettz('EDT') may return None on systems that do not
+        # know that zone name, leaving dates without an offset naive — confirm.
+        parsed_date = parser.parse(date, default=datetime(
+            1970, 1, 1, tzinfo=tz.gettz('EDT')))
+    except Exception:
+        # Any parsing failure maps to the documented empty-string result, but
+        # KeyboardInterrupt and SystemExit are allowed to propagate.
+        return ""
+
+    parsed_date_string = parsed_date.strftime("%Y-%m-%dT%H:%M:%S%z")
+
+    return parsed_date_string
+
+# Manual check: print one formatted sample date when this file is run directly.
+if __name__ == "__main__":
+	print(format_date_string("Tue, 23 Jun 2020 13:30:45 -0400"))