Skip to content

Feature rewrite parser in formal grammar #41

Merged
merged 35 commits into from
Aug 2, 2021
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e634be1
Remove old ECNQueue module
benne238 Jun 28, 2021
2648c8e
Add pyparsing to requirements
benne238 Jun 28, 2021
425b41a
Add ParseError to parser module
benne238 Jun 28, 2021
dc11658
Move Item.__get_formatted_date to webqueue2api.parser.utils and renam…
benne238 Jun 28, 2021
12407b0
Refactor to use new format_date_string function
benne238 Jun 28, 2021
7ee915f
replaced old parser and imported new pyparsing parser in the item class
benne238 Jun 28, 2021
5738191
Fixed bug where the end of the string was not matched
benne238 Jun 28, 2021
c7e0d0a
created function to append the initial message headers to the parsed …
benne238 Jun 29, 2021
0f69c18
modifed __add_initial_message_headers to output the expected format f…
benne238 Jun 29, 2021
42bb135
Moved the content attribute in item to be defined after all the other…
benne238 Jun 29, 2021
e7e7687
add_intitial_message_headers function call from the parseSections fun…
benne238 Jun 29, 2021
b701cda
Revert "add_intitial_message_headers function call from the parseSect…
benne238 Jun 29, 2021
3951832
return initial message section with appropriate headers
benne238 Jun 29, 2021
b9d4945
formatted function names to camel case
benne238 Jun 29, 2021
de2dd76
add function call to __get_sorted_sections to sort the parsed item se…
benne238 Jun 29, 2021
4dfb974
add assignment parsing to item.py
benne238 Jun 29, 2021
d858c0c
modified repy_from_user_end_delimiter to not include an ending newline
benne238 Jun 30, 2021
56798fb
added a condition that checks for an ending reply from user delimiter…
benne238 Jun 30, 2021
f61860e
fixed error message for header information not seperated from content…
benne238 Jun 30, 2021
8289ae7
Add logic to return an error_parse if a header is not formatted corre…
benne238 Jun 30, 2021
f5e5619
moved the email.policy configuration from the parser.py module to the…
benne238 Jun 30, 2021
595b1be
modified action end delimiter and modified the expected value error m…
benne238 Jun 30, 2021
66b65d7
modified the status rule to expect only a single line delimiter and m…
benne238 Jun 30, 2021
5fbc373
modified the edit rule to expect only a single line delimiter and mad…
benne238 Jun 30, 2021
a3fafba
modified the reply to user rule to expect only a single line delimite…
benne238 Jun 30, 2021
1d0358c
added pp.LineStart() to explicity look for reply_from_user delimiter …
benne238 Jun 30, 2021
f7114c2
Added logic to raise a parser error with the appropriate information …
benne238 Jul 1, 2021
a84d8e7
Made expected message relevent: the datetime string doesn't need to b…
benne238 Jul 1, 2021
5383f8f
added docstring to __get_assignments
benne238 Jul 1, 2021
23abdf3
added docstring to __parse_sections
benne238 Jul 1, 2021
c9d04e5
Made initial message rule more similar to the edit, reply-to, and sta…
benne238 Jul 2, 2021
84cb7b4
Remove unused datetime import
campb303 Aug 2, 2021
80cc1cb
Correct __add_initial_message_headers doc block
campb303 Aug 2, 2021
5e07895
Cleanup __get_sorted_sections
campb303 Aug 2, 2021
206ed36
Rename reply to user delimiter
campb303 Aug 2, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def get_all_dependencies():
# Custom version of python-ldap without SASL requirements
"python-ldap @ git+https://github.itap.purdue.edu/ECN/python-ldap/@python-ldap-3.3.1",
"easyad",
"dataclasses"
"dataclasses",
"pyparsing"
],
extras_require={
"dev": conditional_dependencies["dev"],
Expand Down
5 changes: 5 additions & 0 deletions src/webqueue2api/parser/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,9 @@ def __init__(self, path: str):
class QueueDoesNotExistError(Exception):
    """Raised when the directory expected to hold a queue cannot be found.

    Attributes:
        message (str): Human readable description that includes the path.
    """

    def __init__(self, path: str):
        description = f"Directory {path} not found."
        super().__init__(description)
        self.message = description

class ParseError(Exception):
    """Raised when an item cannot be parsed.

    Args:
        line_number (int): The 1-based line of the item where parsing failed.
        message (str): Description of the failure. Defaults to "Unable to parse item."

    Attributes:
        line_number (int): The line number passed in, kept so callers do not
            have to extract it from the message text.
        message (str): The description with " at line <line_number>" appended.
    """

    def __init__(self, line_number: int, message: str = "Unable to parse item."):
        self.line_number = line_number
        self.message = f"{message} at line {line_number}"
        super().__init__(self.message)
923 changes: 83 additions & 840 deletions src/webqueue2api/parser/item.py

Large diffs are not rendered by default.

331 changes: 331 additions & 0 deletions src/webqueue2api/parser/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
import pyparsing as pp
import json
import string
import email
import email.errors
import datetime
from .utils import format_date_string
from .errors import ParseError


# Module-level accumulator shared by every parse action in this module: each
# parsed section (or parse_error dictionary) is appended here, and parse_item()
# returns this list.
# NOTE(review): nothing in this module empties the list between parse_item()
# calls -- confirm callers reset it or only parse one item per process.
parsed_item = []



################################################################################
# Delimiters
################################################################################
# Every action header line starts with action_start_delimiter and ends with
# action_end_delimiter; the rules below match these strings literally.
action_start_delimiter = "*** "
action_end_delimiter = "***"

# Prefixes of the three action header lines. Each one is followed in the item
# by the acting user, " at: ", a date/time and action_end_delimiter.
edit_start_delimiter = action_start_delimiter + "Edited by: "
status_start_delimiter = action_start_delimiter + "Status updated by: "
reply_to_user_delimiter = action_start_delimiter + "Replied by: "
campb303 marked this conversation as resolved.
Show resolved Hide resolved
# Lines that open and close a reply-from-user section. Unlike the actions
# above, these do not use the "*** ... ***" form.
reply_from_user_start_delimiter = "=== Additional information supplied by user ==="
reply_from_user_end_delimiter = "==============================================="



################################################################################
# Parse Actions: Callbacks for Rules
################################################################################
def parse_section_by_type(section_type):
    """Returns a function to parse a section based on the section type.

    Args:
        section_type (str): The type of section to parse.
            Can be "reply_to_user" | "edit" | "reply_from_user" | "status" | "directory_information" | "initial_message"

    Returns:
        function: A PyParsing parse action that formats a matched section and
            appends it to the global section list.
    """

    def line_number_at(original_string: str, index: int) -> int:
        """Returns the 1-based line number of the character at index."""
        return original_string[:index].count("\n") + 1

    def find_in_match(original_string: str, match_start_index: int, text: str) -> int:
        """Returns the index of text at or after the match start, or the match start if not found."""
        index = original_string.find(text, match_start_index)
        return index if index != -1 else match_start_index

    def record_parse_error(expected: str, got: str, line_number: int) -> None:
        """Appends a parse_error entry to the global section list."""
        parsed_item.append({
            "type": "parse_error",
            "datetime": format_date_string(str(datetime.datetime.now())),
            "expected": expected,
            "got": got,
            "line_num": line_number,
        })

    def parse_section(original_string: str, match_start_index: int, tokens: pp.ParseResults) -> None:
        """Parses section and adds to global section list.

        Args:
            original_string (string): The original string passed to PyParsing
            match_start_index (int): The character index where the match starts.
            tokens (pyparsing.ParseResults): The PyParsing results.

        Raises:
            ParseError: If unexpected formatting is encountered. A matching
                parse_error entry is added to the global section list first.
        """
        tokens_dictionary = tokens.asDict()
        tokens_dictionary["type"] = section_type

        # Remove empty keys. pop() is used because deleting from a dictionary
        # while iterating over its keys raises RuntimeError.
        tokens_dictionary.pop("", None)

        # Parse reply-from-user headers
        if section_type == "reply_from_user":
            raw_headers = tokens_dictionary["headers"]
            try:
                headers = email.message_from_string(raw_headers)
            except email.errors.MissingHeaderBodySeparatorDefect:
                # Blame the first line that is neither "key: value" nor an
                # indented continuation line. Fall back to the whole header
                # text so an error is always reported.
                offending_line = next(
                    (
                        line
                        for line in raw_headers.splitlines(keepends=True)
                        if ":" not in line and not line.startswith(" ")
                    ),
                    raw_headers,
                )
                line_number = line_number_at(
                    original_string,
                    find_in_match(original_string, match_start_index, offending_line),
                )
                record_parse_error(
                    "Header information with a key/value pair seperated by a colon or a newline to seperate the header from the content",
                    offending_line,
                    line_number,
                )
                raise ParseError(line_number, f"{offending_line} is a malfomred header or the start of message content without a newline")

            if "Date" not in headers.keys():
                content_start = tokens_dictionary["content"][0].strip().split("\n", 1)[0]
                # Search from the start of this section so identical text
                # earlier in the item cannot produce the wrong line number.
                line_number = line_number_at(
                    original_string,
                    find_in_match(original_string, match_start_index, content_start),
                )
                # Record the error like every other failure path so it is still
                # reported when the caller swallows the ParseError.
                record_parse_error(
                    "A Date header in the reply from user section",
                    content_start,
                    line_number,
                )
                raise ParseError(line_number, "Expected a 'Date' header in the reply from user section")

            headers_list = [
                {"type": key, "content": headers[key]}
                for key in headers.keys()
            ]

            for header in headers_list:
                if header["type"] == "Date":
                    tokens_dictionary["datetime"] = header["content"]
                elif header["type"] == "Subject":
                    tokens_dictionary["subject"] = header["content"]
                elif header["type"] == "From":
                    user_name, user_email = email.utils.parseaddr(header["content"])
                    tokens_dictionary["from_name"] = user_name
                    tokens_dictionary["from_email"] = user_email
                elif header["type"].lower() == "cc":
                    tokens_dictionary["cc"] = [
                        {"name": user_name, "email": user_email}
                        for user_name, user_email in email.utils.getaddresses([header["content"]])
                    ]

            tokens_dictionary["headers"] = headers_list

        # Format date header
        if "datetime" in tokens_dictionary:
            raw_date = tokens_dictionary["datetime"]
            try:
                formatted_date = format_date_string(raw_date)
            except (ValueError, OverflowError):
                # Report the raw date itself; it is available for every section
                # type, not only for reply_from_user.
                line_number = line_number_at(
                    original_string,
                    find_in_match(original_string, match_start_index, raw_date),
                )
                record_parse_error("Expected a date and/or time", raw_date, line_number)
                raise ParseError(line_number, "Could not format date header")

            # NOTE(review): if format_date_string reports bad input by
            # returning "" instead of raising, the handler above never runs and
            # an empty datetime is stored -- confirm that is intended.
            tokens_dictionary["datetime"] = formatted_date

        # Convert content string to list of lines
        if "content" in tokens_dictionary:
            stripped_content = tokens_dictionary["content"][0].strip()
            tokens_dictionary["content"] = stripped_content.splitlines(keepends=True)

        parsed_item.append(tokens_dictionary)
        return

    return parse_section

def check_for_nested_action(original_string, match_start_index, tokens):
    """Rejects reply-from-user content that contains the start of another section.

    Args:
        original_string (string): The original string passed to PyParsing
        match_start_index (int): The character index where the match starts.
        tokens (pyparsing.ParseResults): The PyParsing results.

    Raises:
        ParseError: If an edit, status, reply-to-user or reply-from-user start
            delimiter is found inside the content. A parse_error entry is
            appended to the global section list before raising.
    """
    content = tokens[0]
    newlines_before_match = original_string[:match_start_index].count("\n")

    nesting_markers = (
        edit_start_delimiter,
        status_start_delimiter,
        reply_to_user_delimiter,
        reply_from_user_start_delimiter,
    )

    for marker in nesting_markers:
        marker_offset = content.find(marker)
        if marker_offset == -1:
            continue

        # Line of the marker = lines before the match + lines into the content
        line_number = 1 + newlines_before_match + content[:marker_offset].count("\n")
        parsed_item.append({
            "type": "parse_error",
            "datetime": format_date_string(str(datetime.datetime.now())),
            "expected": reply_from_user_end_delimiter,
            "got": f"Found nested action '{marker}' in reply from user",
            "line_num": line_number
        })
        raise ParseError(line_number, f"Found nested action '{marker}' in reply from user")

    return

def error_handler(original_string, match_start_index, tokens):
    """Records and raises a parse error for a line no section rule matched.

    Args:
        original_string (string): The original string passed to PyParsing
        match_start_index (int): The character index where the match starts.
        tokens (pyparsing.ParseResults): The PyParsing results; tokens[0][0]
            is the unmatched line.

    Raises:
        ParseError: Always. A parse_error entry is appended to the global
            section list before raising.
    """
    unmatched_line = tokens[0][0]

    parse_error = {
        "type": "parse_error",
        "datetime": format_date_string(str(datetime.datetime.now())),
    }

    if unmatched_line != reply_from_user_start_delimiter:
        # The line is not the start of any known section
        expected = f"Action delimiter starting with '{action_start_delimiter}' and ending with '{action_end_delimiter}' or {reply_from_user_start_delimiter}"
        got = unmatched_line
        line_number = (original_string[:match_start_index]).count('\n') + 1
        reason = "No action start delimiter found"
    else:
        remainder = original_string[match_start_index:]
        if reply_from_user_end_delimiter in remainder:
            # The reply is closed, so the blank line after its headers is missing
            expected = "\n\n"
            got = reply_from_user_end_delimiter
            end_index = remainder.find(reply_from_user_end_delimiter) + match_start_index
            line_number = original_string[:end_index].count("\n") + 1
            reason = "No newline found after header information"
        else:
            # The reply is never closed before the end of the item
            expected = reply_from_user_end_delimiter
            got = "End of file"
            line_number = original_string.count('\n') + 1
            reason = "No reply from user end delimiter found"

    parse_error["expected"] = expected
    parse_error["got"] = got
    parse_error["line_num"] = line_number
    parsed_item.append(parse_error)
    raise ParseError(line_number, reason)


################################################################################
# Rules
################################################################################
# Consumes everything up to (not including) the first blank line of the item.
# No parse action is attached, so this text is not added to parsed_item.
header_rule = pp.SkipTo("\n\n").leaveWhitespace()

# Optional block of "Key: value" lines. Every line is individually optional,
# but because the groups are chained with "+" they must appear in the order
# listed here. pp.Dict makes each group's first token (the key) a named result,
# which parse_section_by_type reads through tokens.asDict().
# NOTE(review): pp.White's argument is a set of whitespace characters, so
# pp.White("\n\n") looks equivalent to pp.White("\n") (one or more newlines)
# rather than "exactly two newlines" -- confirm against the PyParsing docs.
directory_rule = pp.Dict(
pp.White("\n").suppress() +
pp.Optional(pp.Group("Name" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Login" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Computer" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Location" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Email" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Phone" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Office" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("UNIX Dir" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Zero Dir" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("User ECNDB" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Host ECNDB" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.Optional(pp.Group("Subject" + pp.Literal(":").suppress() + pp.SkipTo(pp.LineEnd()))) +
pp.White("\n\n").suppress()
).setParseAction(parse_section_by_type("directory_information"))

# The initial message is all text up to the first reply-from-user delimiter or
# action start delimiter, or up to the end of the string when neither occurs.
# The text is wrapped in a Group and named "content", which is why the parse
# action reads it as tokens_dictionary["content"][0].
initial_message_rule = pp.Group(
pp.SkipTo(
pp.Literal(reply_from_user_start_delimiter)
| pp.Literal(action_start_delimiter)
) | pp.SkipTo(pp.StringEnd(), include=True)
).leaveWhitespace().setResultsName("content").setParseAction(parse_section_by_type("initial_message"))

# A reply from the user, in order:
#   1. the start delimiter followed by one or more line ends (suppressed),
#   2. "headers": everything up to the next blank line,
#   3. "content": everything up to the end delimiter line; the
#      check_for_nested_action parse action raises ParseError if the content
#      contains the start of another section,
#   4. the end delimiter line and any trailing line ends (suppressed).
reply_from_user_rule = (
(reply_from_user_start_delimiter + pp.OneOrMore(pp.LineEnd())).suppress() +
pp.SkipTo("\n\n").setResultsName("headers") +
(pp.Group(pp.SkipTo(reply_from_user_end_delimiter + pp.LineEnd()).setParseAction(check_for_nested_action)).setResultsName("content")) +
(pp.Literal(reply_from_user_end_delimiter) + pp.LineEnd()).suppress() +
pp.ZeroOrMore(pp.LineEnd()).suppress()
).leaveWhitespace().setParseAction(parse_section_by_type("reply_from_user"))

# The three action rules below share one shape and differ only in the start
# delimiter and in the section type handed to parse_section_by_type:
#   <start delimiter><by> at: <datetime>***<line end><content>
# "by" is a run of alphanumeric characters, "datetime" is a run of digits and
# the characters "/", "-", ":" and space, and "content" is everything up to the
# next line that starts a reply-from-user or action section (or the end of the
# string when there is none).

# Reply sent to the user ("*** Replied by: ...").
reply_to_user_rule = (
pp.LineStart() +
pp.Literal(reply_to_user_delimiter).suppress() +
pp.Word(pp.alphanums).setResultsName("by")+
pp.Literal(" at: ").suppress() +
pp.Word(pp.nums + "/-: ").setResultsName("datetime") +
(pp.Literal(action_end_delimiter) + pp.LineEnd()).suppress() +
pp.Group(
pp.SkipTo(
(pp.LineStart() + pp.Literal(reply_from_user_start_delimiter))
| (pp.LineStart() + pp.Literal(action_start_delimiter))
) | pp.SkipTo(pp.StringEnd(), include=True)
).setResultsName("content")
).leaveWhitespace().setParseAction(parse_section_by_type("reply_to_user"))

# Edit of the item ("*** Edited by: ...").
edit_rule = (
pp.LineStart() +
pp.Literal(edit_start_delimiter).suppress() +
pp.Word(pp.alphanums).setResultsName("by") +
pp.Literal(" at: ").suppress() +
pp.Word(pp.nums + "/-: ").setResultsName("datetime") +
(pp.Literal(action_end_delimiter) + pp.LineEnd()).suppress() +
pp.Group(
pp.SkipTo(
(pp.LineStart() + pp.Literal(reply_from_user_start_delimiter))
| (pp.LineStart() + pp.Literal(action_start_delimiter))
) | pp.SkipTo(pp.StringEnd(), include=True)
).setResultsName("content")
).leaveWhitespace().setParseAction(parse_section_by_type("edit"))

# Status update ("*** Status updated by: ...").
status_rule = (
pp.LineStart() +
pp.Literal(status_start_delimiter).suppress() +
pp.Word(pp.alphanums).setResultsName("by") +
pp.Literal(" at: ").suppress() +
pp.Word(pp.nums + "/-: ").setResultsName("datetime") +
(pp.Literal(action_end_delimiter) + pp.LineEnd()).suppress() +
pp.Group(
pp.SkipTo(
(pp.LineStart() + pp.Literal(reply_from_user_start_delimiter))
| (pp.LineStart() + pp.Literal(action_start_delimiter))
) | pp.SkipTo(pp.StringEnd(), include=True)
).setResultsName("content")
).leaveWhitespace().setParseAction(parse_section_by_type("status"))

# Catch-all for text that none of the section rules matched: grabs the rest of
# the current line and hands it to error_handler, which always raises
# ParseError after recording a parse_error entry.
error_rule = pp.Group(
pp.SkipTo(pp.LineEnd())
).setParseAction(error_handler)

# Grammar for a whole item: headers, an optional directory information block,
# the initial message, then any number of sections. The section alternatives
# are joined with "|", so they are tried in the order written. error_rule is
# attempted only after the section loop stops matching.
# NOTE(review): the directory block is wrapped in .suppress(); its parse action
# presumably still runs and appends to parsed_item -- confirm.
item_rule = (
header_rule +
pp.Optional(directory_rule).suppress() +
initial_message_rule +
pp.ZeroOrMore(
reply_from_user_rule
| reply_to_user_rule
| edit_rule
| status_rule
) + pp.Optional(error_rule)
)



def parse_item(item_body: str, raise_on_error: bool = False) -> list:
    """Accepts string of an Item body and returns JSON serializable list of dictionaries with formatted action types.

    Args:
        item_body (str): The string of the item to be parsed.
        raise_on_error (bool): If true, a ParseError is raised when a parsing error is encountered. Otherwise, a parse error dictionary is added to the return value before being returned. Defaults to False.

    Returns:
        list: List of actions as ordered in the item. Does not include initial message metadata or assignments.

    Raises:
        ParseError: If raise_on_error is True, raises ParseError when a parsing error occurs. Otherwise adds a parse_error action to the return value.
    """
    # The parse actions append to the module-level list, so empty it first;
    # otherwise sections from a previously parsed item leak into this result.
    parsed_item.clear()

    try:
        item_rule.parseString(item_body)
    except ParseError:
        # The parse_error entry was recorded before the exception was raised,
        # so swallowing it here still reports the failure in the result.
        if raise_on_error:
            raise

    # Return a copy so a later call cannot change a result the caller still holds.
    return list(parsed_item)
26 changes: 26 additions & 0 deletions src/webqueue2api/parser/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Shared utilities for the parser package"""

from dateutil import parser, tz
from datetime import datetime

def format_date_string(date: str) -> str:
    """Returns the date/time formatted as ISO 8601 YYYY-MM-DDTHH:MM:SS+HHMM.

    The UTC offset is written by strftime's %z, i.e. without a colon
    (for example -0400).
    See: https://en.wikipedia.org/wiki/ISO_8601

    Args:
        date (str): The date/time string to format.

    Returns:
        str: Properly formatted date/time received, or an empty string if the
            argument could not be parsed as a date/time.
    """
    try:
        # The default only supplies a timezone (and any fields missing from the
        # input); its 1970-01-01 date is not meant to appear in real results.
        # NOTE(review): 'EDT' is an abbreviation rather than a timezone
        # database name, so tz.gettz may return None on some hosts and leave
        # the result without an offset -- confirm.
        parsed_date = parser.parse(date, default=datetime(
            1970, 1, 1, tzinfo=tz.gettz('EDT')))
    except Exception:
        # Deliberately best-effort: any parsing failure yields "". Unlike a
        # bare except, this lets KeyboardInterrupt and SystemExit propagate.
        return ""

    return parsed_date.strftime("%Y-%m-%dT%H:%M:%S%z")

if __name__ == "__main__":
    # Manual check: print the formatted form of a sample date when this module
    # is run directly as a script.
    sample_date = "Tue, 23 Jun 2020 13:30:45 -0400"
    print(format_date_string(sample_date))
Loading