{
"cells": [
{
"cell_type": "markdown",
"id": "ae8311ba-bb71-4ae9-bca0-7094eaea4a46",
"metadata": {},
"source": [
"# Define input"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7c44fccf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c4aefe3c-9cca-4dab-a8de-9aefc4e1e989",
"metadata": {},
"outputs": [],
"source": [
"def define_variables(type_ra_ext):\n",
"    \"\"\"\n",
"    Define variables based on the type of incident description or post.\n",
"\n",
"    Parameters:\n",
"    type_ra_ext (str): The type of data extraction, either 'description' or 'post'.\n",
"\n",
"    Returns:\n",
"    dict: A dictionary containing the relevant variables.\n",
"    \"\"\"\n",
"\n",
"    variables = {}\n",
"\n",
"    if type_ra_ext == 'description':\n",
"        variables = {\n",
"            'filename_prefix': 'incident',\n",
"            'col_text': 'description',\n",
"            'col_date': 'open_date',\n",
"            # 'data_source': 'RAEV',\n",
"            'col_ra_gpt_from_sample': 'RA_icd_GPT_OA_des',\n",
"            'col_ra_gpt': 'RA_icd_gpt',\n",
"            'col_ra_merge': 'RA_merge_icd_rb_gpt',\n",
"            'col_source_ra_merge': 'source_RA_merge_icd',\n",
"            'sheetnames_mvrfd_ext': ['S9', 'S10', 'S11', 'S12', 'S13', 'S14'],\n",
"            'col_refer': 'id',\n",
"            'col_ra_gpt_oa_text': 'RA_gpt_des_w_oa'\n",
"        }\n",
"\n",
"    elif type_ra_ext == 'post':\n",
"        variables = {\n",
"            'filename_prefix': 'posts',\n",
"            'col_old_id': 'NPost_id',\n",
"            'col_text': 'post content',\n",
"            'col_date': 'post date',\n",
"            # 'data_source': 'Posts',\n",
"            'col_ra_gpt_from_sample': 'RA_post_GPT_OA_des',\n",
"            'col_ra_gpt': 'RA_post_gpt',\n",
"            'col_ra_merge': 'RA_merge_posts_rb_gpt',\n",
"            'col_source_ra_merge': 'source_RA_merge_posts',\n",
"            'sheetnames_mvrfd_ext': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11'],\n",
"            'col_refer': 'NPost id',\n",
"            'col_ra_gpt_oa_text': 'RA_gpt_post_w_oa'\n",
"        }\n",
"\n",
"    variables['col_threat'] = 'threat'\n",
"    variables['col_sub'] = 'commodity'  # alternative: 'risk_sub'\n",
"    # variables['col_des'] = 'description'\n",
"\n",
"    # column names for text without trajectory predictions etc.\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_text_wt_traj'] = 'des_wt_traj'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_text_wt_traj'] = 'post_wt_traj'\n",
"    variables['if_text_has_hypothesis'] = 'trj_prd'\n",
"\n",
"    # column names for OA-related text\n",
"    variables['col_oa_text_from_sample'] = 'OA_des'\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_oa_text'] = 'des_w_oa'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_oa_text'] = 'post_w_oa'\n",
"\n",
"    # column names for rule-based RA extraction\n",
"    # variables['col_real_input'] = variables['col_text_wt_traj']\n",
"    variables['col_ra_rb'] = 'RA_rb_' + variables['col_text_wt_traj']\n",
"    variables['col_oatt_rb'] = 'CA_rb_' + variables['col_text_wt_traj']\n",
"    variables['col_if_tl_plt'] = 'Times_larger_plt'\n",
"    variables['col_if_rls'] = 'Rls_doc'\n",
"    variables['col_if_no_rls'] = 'No_release'\n",
"    if type_ra_ext == 'description':\n",
"        variables['if_do_no_rls_check'] = True\n",
"    elif type_ra_ext == 'post':\n",
"        variables['if_do_no_rls_check'] = False\n",
"\n",
"    # column names for GPT extraction results\n",
"    variables['col_oatt_gpt_ext'] = 'OAtt_GPT_Ext'\n",
"    variables['col_oarv_gpt_ext'] = 'OArv_GPT_Ext'\n",
"    variables['col_if_oa_irlvt_gpt'] = 'OA_irlvt_GPT'\n",
"    variables['col_if_oa_unit_lost'] = 'OA_unit_lost'\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_id_of_sample'] = 'id'\n",
"        variables['col_text_gpt35'] = 'text_gpt35_description'\n",
"        variables['col_ra_gpt_ext'] = 'RA_gpt_des_w_oa'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_id_of_sample'] = 'NPost id'\n",
"        variables['col_text_gpt35'] = 'text_gpt35_post'\n",
"        variables['col_ra_gpt_ext'] = 'RA_gpt_post_w_oa'\n",
"\n",
"    # columns for merging rule-based and GPT results\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_vrf_ra'] = 'RA_des_Manual'\n",
"        variables['col_id_of_vrfd_ra'] = 'id'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_vrf_ra'] = 'RA_post_Manual'\n",
"        variables['col_id_of_vrfd_ra'] = 'old_npost_id_2parts'\n",
"    variables['col_ra_source_of_vrfd_ra'] = 'Source'\n",
"\n",
"    return variables"
]
},
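{
"cell_type": "markdown",
"id": "1f2e3d4c-0001-4a00-8a00-aa0000000001",
"metadata": {},
"source": [
"A quick usage sketch (illustrative, not part of the original pipeline): `define_variables` returns a plain dict, so downstream cells unpack only the keys they need. The values printed below follow directly from the function above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f2e3d4c-0002-4a00-8a00-aa0000000002",
"metadata": {},
"outputs": [],
"source": [
"# Compare the two configurations returned by define_variables\n",
"vars_des = define_variables('description')\n",
"vars_post = define_variables('post')\n",
"\n",
"print(vars_des['col_text'], '->', vars_des['col_ra_rb'])    # description -> RA_rb_des_wt_traj\n",
"print(vars_post['col_text'], '->', vars_post['col_ra_rb'])  # post content -> RA_rb_post_wt_traj\n",
"# the no-release check is only enabled for incident descriptions\n",
"assert vars_des['if_do_no_rls_check'] and not vars_post['if_do_no_rls_check']"
]
},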
{
"cell_type": "markdown",
"id": "dde3c61d-296b-460b-bc02-657bccbc9a53",
"metadata": {},
"source": [
"## Format adjustment"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b5b445f-d9e1-4caf-bc05-d7f0a4542f76",
"metadata": {},
"outputs": [],
"source": [
"# Adjust the formatting without changing the text content.\n",
"# Aim: formatting-only changes, so ideally they need not be mentioned in the paper.\n",
"def adjust_format(text):\n",
"    try:\n",
"        text = str(text)\n",
"        text = text.replace('_x000B_', ' ')  # Excel vertical-tab artifact\n",
"        text = text.replace('•', '')\n",
"        text = text.replace(',OOO', '000')   # OCR-style 'letter O' thousands\n",
"        # ensure spaces around hyphens are consistent\n",
"        if text.count('-') != text.count(' - '):\n",
"            text = text.replace('-', ' - ')\n",
"        # collapse repeated spaces\n",
"        while '  ' in text:\n",
"            text = text.replace('  ', ' ')\n",
"    except Exception:\n",
"        pass\n",
"    return text"
]
},
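{
"cell_type": "markdown",
"id": "2a3b4c5d-0001-4b00-8b00-bb0000000001",
"metadata": {},
"source": [
"A quick before/after on a made-up string (illustrative only): the `,OOO` artifact is restored to `000`, and the lone hyphen gains consistent surrounding spaces, exactly as the replacements above dictate."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a3b4c5d-0002-4b00-8b00-bb0000000002",
"metadata": {},
"outputs": [],
"source": [
"raw = 'Spilled 5,OOO gallons of oil-based mud'\n",
"print(adjust_format(raw))\n",
"assert adjust_format(raw) == 'Spilled 5000 gallons of oil - based mud'"
]
},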
{
"cell_type": "markdown",
"id": "6afc649f-113e-475f-9e3f-b9fc6cfce901",
"metadata": {},
"source": [
"# Extract risk substance name"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "38a62e39-1725-44ba-92dd-f0be142b1ff6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"kws_oil = ['condensate', 'oil', 'diesel', 'heavy fuel', 'lube', 'lubrica',\n",
"           'bunker', 'hydraulic', 'petro', 'gasoline', 'JP',\n",
"           'Jet', 'crude', 'petroleum', 'shale', '#2', 'No. 2', '#4',\n",
"           'number 2', '#6', 'No. 6', '# 6', 'number 6', 'IFO', 'HFO']"
]
},
{
"cell_type": "markdown",
"id": "4198b8ba",
"metadata": {},
"source": [
"# Rule-based RA extraction"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "edb3beea-7cbc-4404-9f84-1f62ce1e6805",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-03-24 19:21:15 INFO: Loading these models for language: en (English):\n",
"========================\n",
"| Processor | Package |\n",
"------------------------\n",
"| tokenize | combined |\n",
"| pos | combined |\n",
"| lemma | combined |\n",
"========================\n",
"\n",
"2024-03-24 19:21:15 INFO: Use device: cpu\n",
"2024-03-24 19:21:15 INFO: Loading: tokenize\n",
"2024-03-24 19:21:15 INFO: Loading: pos\n",
"2024-03-24 19:21:16 INFO: Loading: lemma\n",
"2024-03-24 19:21:16 INFO: Done loading processors!\n"
]
}
],
"source": [
"import stanza\n",
"nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "7fde465a",
"metadata": {},
"outputs": [],
"source": [
"release_words = ['spill', 'release', 'pollution', 'leak', 'damage', 'discharge',\n",
"                 'sheen', 'lose', 'loss', 'spillage', 'pollute', 'seep', 'overflow', 'dump']\n",
"release_phrase = ['in water', 'in the water']\n",
"neg_words = ['no', 'not', 'nothing']\n",
"acccause_words = ['aground', 'ground', 'adrift', 'drift', 'collision', 'allision', 'collide', 'hurricane', 'sink']\n",
"# 'grounded', 'ran aground'\n",
"unit_words = ['gallon', 'barrel', 'barrels', 'bbl', 'bbls', 'gal', 'liter', 'cubicfeet', 'cubicyard']  # 'pound',\n",
"\n",
"amount_words = ['million', 'k']\n",
"avoid_words = ['onboard', 'carry', 'load', 'capacity', 'contain', 'abroad', 'aboard', 'potential']\n",
"fuzzy_carry_words = ['have', 'hold']\n",
"exclude_fzy_carry_words = ['have been', 'has been']\n",
"accsit_words = ['sink', 'fire', 'explode', 'burn', 'explosion', 'capsize']\n",
"accmin_words = ['adrift']  # 're-floated', 'refloat', 'floated', 'afloat', 'hard aground'\n",
"accmin_phrase = ['lost propulsion', 'without propulsion', 'lost power']\n",
"separator_word = [',', '\\\(', '\\\)']\n",
"# recovery keywords\n",
"recover_word = ['recovered']\n",
"transfer_word = ['transfer', 'collect']  # 'remove'"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "a1c50394-18fd-45f0-92ea-428328e5cd75",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Check whether each element of a string Series contains any of the given keywords.\n",
"# Context: county names in NRC are inconsistent, so county and city names from NOAA\n",
"# are used as keywords and matched against the NRC location names.\n",
"# locationName_NRC must be a pandas Series; the return value is a boolean Series.\n",
"def keywords_searching(locationName_NRC, keywords_set, case_sensitivity=False):\n",
"    keywords_pattern = '|'.join(keywords_set)\n",
"    L_kw = locationName_NRC.str.contains(keywords_pattern, case=case_sensitivity, regex=True)\n",
"    return L_kw"
]
},
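{
"cell_type": "markdown",
"id": "3c4d5e6f-0001-4c00-8c00-cc0000000001",
"metadata": {},
"source": [
"A minimal demonstration of `keywords_searching` on a toy Series (made-up location strings, not NRC data); the boolean mask it returns can filter a DataFrame directly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c4d5e6f-0002-4c00-8c00-cc0000000002",
"metadata": {},
"outputs": [],
"source": [
"locations = pd.Series(['Harris County TX', 'off GALVESTON', 'unknown'])\n",
"mask = keywords_searching(locations, ['Harris', 'Galveston'])\n",
"print(mask.tolist())  # [True, True, False] -- matching is case-insensitive by default"
]
},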
{
"cell_type": "markdown",
"id": "04892220-962f-4215-815d-3ee38045576b",
"metadata": {},
"source": [
"## Text pretreatment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a0e3c5ed-fd43-424b-a0d9-3386e3e03256",
"metadata": {},
"outputs": [],
"source": [
"def text_preprocess(text):\n",
"    \"\"\"\n",
"    Preprocess the input text by replacing specific substrings.\n",
"\n",
"    This function takes a text input and performs a series of string replacements\n",
"    to standardize certain terms and correct common formatting issues.\n",
"\n",
"    Parameters:\n",
"    text (str): The text to be processed.\n",
"\n",
"    Returns:\n",
"    str: The processed text with replacements applied.\n",
"    \"\"\"\n",
"\n",
"    try:\n",
"        # Ensure the input is in string format\n",
"        text = str(text)\n",
"\n",
"        # Replacements for various terms and formatting issues\n",
"        text = text.replace('_x000B_', ' ')\n",
"        text = text.replace('cubic feet', 'cubicfeet')\n",
"        text = text.replace('•', '')\n",
"        text = text.replace('approx.', 'approximately')\n",
"        text = text.replace('APPROX.', 'approximately')\n",
"        text = text.replace('gal. of', 'gal of')\n",
"        text = text.replace('bbl. of', 'bbl of')\n",
"        text = text.replace('bbls. of', 'bbls of')\n",
"        text = text.replace('est.', 'estimate')\n",
"        text = text.replace('gals.', 'gallons')\n",
"        text = text.replace(',OOO', '000')\n",
"\n",
"        # Ensure spaces around hyphens are consistent\n",
"        if text.count('-') != text.count(' - '):\n",
"            text = text.replace('-', ' - ')\n",
"\n",
"        # Replace double spaces with single spaces\n",
"        while '  ' in text:\n",
"            text = text.replace('  ', ' ')\n",
"    except Exception:\n",
"        # If an error occurs, return the input unchanged\n",
"        pass\n",
"\n",
"    return text"
]
},
{
"cell_type": "markdown",
"id": "eaa69ab6-30d9-4c65-b6d6-3595688283a2",
"metadata": {},
"source": [
"## Exclude hypothetical sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f23736a",
"metadata": {},
"outputs": [],
"source": [
"pdc_object = ['NOAA', 'SSC', 'USCG']\n",
"pdc_behavior = ['request', 'ask']\n",
"pdc_task = ['trajectory', 'fate']\n",
"\n",
"def trajectory_prediction(df_sent):\n",
"    L_obj = df_sent['lemma'].str.contains('|'.join(pdc_object), case=True).any()\n",
"    L_bhv = df_sent['lemma'].str.contains('|'.join(pdc_behavior), case=False).any()\n",
"    L_task = df_sent['lemma'].str.contains('|'.join(pdc_task), case=False).any()\n",
"\n",
"    # a trajectory-prediction sentence mentions the task plus either the agency or the request verb\n",
"    L_pdc_sent = bool(L_task and (L_obj or L_bhv))\n",
"    return L_pdc_sent"
]
},
{
"cell_type": "markdown",
"id": "cbaf3fb8-fb0f-49fc-a095-9bc95ba40b67",
"metadata": {},
"source": [
"## Remove OA-lacking sentences"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "35cd41fb-f9a9-4d7b-a709-51638185a4ed",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def if_text_contain_oa(df_sent):\n",
"    # RA search: remove commas from lemmas so numbers like 100,000 parse\n",
"    df_sent['lemma'].replace(',', '', regex=True, inplace=True)\n",
"    # Check if there are any numeric tokens in this sentence\n",
"    L_num_sent = (df_sent['upos'] == 'NUM').any()\n",
"    if L_num_sent:\n",
"        list_OA = unit_forward_search(df_sent, unit_words)\n",
"        # Keep the sentence only if it contains an oil amount\n",
"        return len(list_OA) > 0\n",
"    # no numbers at all, so the sentence cannot contain an oil amount\n",
"    return False"
]
},
{
"cell_type": "markdown",
"id": "184d8917-49cf-440d-925e-ebee6f53cc22",
"metadata": {},
"source": [
"## Rule-based NLP"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "96924d08",
"metadata": {},
"outputs": [],
"source": [
"# 20230210 Update: add sequence constraint\n",
"def shortest_distance(L1, L2, sequence='two-way'):\n",
"    L1 = [L1] if isinstance(L1, int) else L1\n",
"    L2 = [L2] if isinstance(L2, int) else L2\n",
"    L1.sort()\n",
"    L2.sort()\n",
"    dist = float(\"inf\")\n",
"    i = j = 0\n",
"    while i < len(L1) and j < len(L2):\n",
"        dis_ij = L2[j] - L1[i]\n",
"        if sequence == 'two-way':\n",
"            dist = min(dist, abs(dis_ij))\n",
"        elif sequence == 'forward':\n",
"            # Here element i from L1 must come before element j from L2.\n",
"            # Useful when the word order is fixed, e.g. amount + unit.\n",
"            if dis_ij > 0:\n",
"                dist = min(dist, dis_ij)\n",
"        if L1[i] < L2[j]:\n",
"            i += 1\n",
"        else:\n",
"            j += 1\n",
"    return dist"
]
},
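{
"cell_type": "markdown",
"id": "4d5e6f7a-0001-4d00-8d00-dd0000000001",
"metadata": {},
"source": [
"A concrete check of the `sequence` argument on toy word indexes: in `'forward'` mode only pairs where the first-list index precedes the second-list index count, so a reversed pair yields `inf`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d5e6f7a-0002-4d00-8d00-dd0000000002",
"metadata": {},
"outputs": [],
"source": [
"print(shortest_distance([2, 9], [5, 14]))            # 3: |5 - 2|\n",
"print(shortest_distance(10, 7))                      # 3 in two-way mode\n",
"print(shortest_distance(10, 7, sequence='forward'))  # inf: index 10 is not before 7"
]
},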
{
"cell_type": "code",
"execution_count": 22,
"id": "bac661ce",
"metadata": {},
"outputs": [],
"source": [
"# Find the nearest pair between two groups of word indexes and check separators.\n",
"# Inputs: (1) df_sent, (2-3) the index lists of the two word groups,\n",
"# (4) separator words (default ','), (5) the effective distance between two words.\n",
"# The function checks:\n",
"# a. whether the distance between the two words is within the effective distance,\n",
"# b. whether any defined separator lies between the two words.\n",
"# If both a and b hold for some pair, True is returned.\n",
"def Separator_check(df_sent, release_ind, neg_ind, separator_word=',', effective_dist=10):\n",
"    dis_ij = 999\n",
"    l_wsparator = []\n",
"    l_wdistance = []\n",
"    L_fseparate = False\n",
"    # Default: the distance between rls and carry is smaller than the effective distance\n",
"    if (release_ind != []) & (neg_ind != []):\n",
"        for i in range(len(release_ind)):\n",
"            for j in range(len(neg_ind)):\n",
"                release_i = release_ind[i]\n",
"                neg_i = neg_ind[j]\n",
"                if dis_ij > abs(release_i - neg_i):\n",
"                    dis_ij = abs(release_i - neg_i)\n",
"                    release_min = release_i\n",
"                    neg_min = neg_i\n",
"\n",
"                if abs(release_i - neg_i) <= effective_dist:\n",
"                    l_wdistance.append(abs(release_i - neg_i) <= effective_dist)\n",
"                    # check whether a separator sits between the nearest neg and release words\n",
"                    df_mid_neg_rls = df_sent.loc[min(neg_i, release_i)+1:max(neg_i, release_i)-1]\n",
"                    separator_word_pattern = '|'.join(separator_word)\n",
"                    l_wsparator.append((df_mid_neg_rls['lemma'].str.contains(separator_word_pattern, case=False)).any())\n",
"\n",
"        L_distance = any(l_wdistance)\n",
"        L_fseparate = any(l_wsparator)\n",
"    # Temporarily ignore the distance restriction (l_wdistance); update 09/24, main program not rerun\n",
"    return L_fseparate  # [L_distance, L_fseparate]"
]
},
{
"cell_type": "markdown",
"id": "02e08f0f-6d6e-44a9-9fe2-c0ded662340f",
"metadata": {},
"source": [
"### Release check"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d5a58c44",
"metadata": {},
"outputs": [],
"source": [
"def release_check(df_sent):\n",
"    pattern_release = '|'.join(release_words)\n",
"    ll_release = df_sent['lemma'].str.fullmatch(pattern_release, case=False)\n",
"    L_rls_word = ll_release.any()\n",
"    return L_rls_word"
]
},
{
"cell_type": "markdown",
"id": "ea2fff75-f0c8-43b2-ab22-12a3add59e12",
"metadata": {},
"source": [
"### Unrelease check"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c3567249",
"metadata": {},
"outputs": [],
"source": [
"# 20230309 Update: use \"shortest_distance\" to replace the previous code;\n",
"# the new logic requires the neg word to come before the rls word.\n",
"\n",
"# Update: detect separator characters between the release word and the negative word\n",
"def Unrelease_check(df_sent, sent, separator_Rstrct=True):\n",
"    L_unrelease = False\n",
"\n",
"    pattern_neg = '|'.join(neg_words)\n",
"    # use text rather than lemma, to avoid noise such as 'No.' lemmatizing to 'no'\n",
"    df_sent['neg_words'] = df_sent['text'].str.fullmatch(pattern_neg, case=False)\n",
"    list_neg_idx = df_sent.index[df_sent['neg_words'] == True].tolist()\n",
"\n",
"    pattern_release = '|'.join(release_words)\n",
"    df_sent['release_words'] = df_sent['lemma'].str.fullmatch(pattern_release, case=False)\n",
"    list_rls_idx = df_sent.index[df_sent['release_words'] == True].tolist()\n",
"\n",
"    # calculate the shortest distance between neg words and release words\n",
"    if (list_rls_idx != []) & (list_neg_idx != []):\n",
"        dist_2min = shortest_distance(list_neg_idx, list_rls_idx, sequence='forward')\n",
"        if dist_2min <= 5:\n",
"            L_unrelease = True\n",
"\n",
"    if L_unrelease == False:\n",
"        ll_release = False\n",
"        for i in release_phrase:  # 'in water', 'in the water'\n",
"            li_release = i in sent.text\n",
"            if li_release == True:\n",
"                ll_release = True\n",
"        ll_neg = df_sent['text'].str.fullmatch(pattern_neg, case=False).any()\n",
"        if (ll_release == True) & (ll_neg == True):\n",
"            L_unrelease = True\n",
"\n",
"    if separator_Rstrct == True:\n",
"        L_fseparate = Separator_check(df_sent, list_rls_idx, list_neg_idx)\n",
"    else:\n",
"        L_fseparate = False\n",
"\n",
"    return [L_unrelease, L_fseparate]"
]
},
{
"cell_type": "markdown",
"id": "487184e5-dd95-4c9c-90de-3a2bd997d7cd",
"metadata": {},
"source": [
"### Carry"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "7227c5ae",
"metadata": {},
"outputs": [],
"source": [
"# Check whether a sentence contains a carry word and does not contain a release word.\n",
"# After testing, this version removed the release checks here, as they were redundant.\n",
"\n",
"def Carry_sub_V2(df_sent, sent):\n",
"    L_carry = False\n",
"    L_exclude_Fzy_carry = False\n",
"    pattern_carry = '|'.join(avoid_words)\n",
"    ll_carry_1 = df_sent['lemma'].str.contains(pattern_carry, case=False)\n",
"    ll_carry = (ll_carry_1.any()) | ('on board' in sent.text)\n",
"    pattern_release = '|'.join(release_words)\n",
"    ll_release = df_sent['lemma'].str.contains(pattern_release, case=False).any()\n",
"    if (ll_carry == True) & (ll_release == False):\n",
"        L_carry = True\n",
"\n",
"    pattern_fcarry = '|'.join(fuzzy_carry_words)\n",
"    ll_fcarry = df_sent['lemma'].str.fullmatch(pattern_fcarry, case=False)\n",
"    L_fuzzy_carry = ll_fcarry.any()\n",
"\n",
"    if any(word in sent.text for word in exclude_fzy_carry_words):\n",
"        L_exclude_Fzy_carry = True\n",
"\n",
"    # For fuzzy carry words like 'have', extra constraints are needed;\n",
"    # currently the release amount and units serve as the constraints.\n",
"    elif (L_fuzzy_carry == True) & (L_exclude_Fzy_carry == False):\n",
"        ll_num = (df_sent['upos'] == 'NUM')\n",
"\n",
"        pattern_units = '|'.join(unit_words)\n",
"        ll_units = df_sent['lemma'].str.fullmatch(pattern_units, case=False)\n",
"        if (ll_num.any() == True) & (ll_units.any() == True):\n",
"            unit_list = ll_units.index[ll_units == True].tolist()\n",
"            carry_list = ll_fcarry.index[ll_fcarry == True].tolist()\n",
"            shortest_dis_2list = shortest_distance(unit_list, carry_list)\n",
"            if shortest_dis_2list <= 3:\n",
"                L_carry = True\n",
"    return L_carry"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "411d4a78",
"metadata": {},
"outputs": [],
"source": [
"def sent2df(sent):\n",
"    # Convert a stanza Sentence into a DataFrame with one row per word\n",
"    df_sent = pd.DataFrame()\n",
"    for word in sent.words:\n",
"        temp = pd.DataFrame.from_dict(word.to_dict(), orient='index')\n",
"        if 'head' in temp.index:\n",
"            temp.rename(index={\"head\": \"head_id\"}, inplace=True)\n",
"            temp.loc['head'] = sent.words[word.head-1].text\n",
"        df_sent = pd.concat([df_sent, temp], axis=1, ignore_index=False)\n",
"    df_sent = df_sent.T.reset_index(drop=True)\n",
"    return df_sent"
]
},
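{
"cell_type": "markdown",
"id": "5e6f7a8b-0001-4e00-8e00-ee0000000001",
"metadata": {},
"source": [
"A small illustration of `sent2df` (runs only after the stanza pipeline cell above): each token becomes a row whose `text`, `lemma`, and `upos` fields the rule-based checks query."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e6f7a8b-0002-4e00-8e00-ee0000000002",
"metadata": {},
"outputs": [],
"source": [
"doc = nlp('About 500 gallons of diesel spilled.')\n",
"df_demo = sent2df(doc.sentences[0])\n",
"print(df_demo[['text', 'lemma', 'upos']])  # '500' is tagged NUM; 'spilled' lemmatizes to 'spill'"
]
},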
{
"cell_type": "code",
"execution_count": 26,
"id": "cf928f21",
"metadata": {},
"outputs": [],
"source": [
"def Isnumber(x):\n",
"    # True if x can be parsed as a float (covers ints and decimals)\n",
"    try:\n",
"        float(x)\n",
"        return True\n",
"    except (TypeError, ValueError):\n",
"        return False"
]
},
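{
"cell_type": "markdown",
"id": "6f7a8b9c-0001-4f00-8f00-ff0000000001",
"metadata": {},
"source": [
"`Isnumber` complements `str.isnumeric`, which rejects decimals; note that comma-separated figures still need the comma stripped first:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c-0002-4f00-8f00-ff0000000002",
"metadata": {},
"outputs": [],
"source": [
"print(Isnumber('42'), Isnumber('3.5'), Isnumber('1,000'), Isnumber('bbl'))\n",
"# True True False False"
]
},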
{
"cell_type": "markdown",
"id": "c557e384-902f-4f94-b113-0813dcc24486",
"metadata": {},
"source": [
"### Oil amount extraction"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "90c47eef",
"metadata": {},
"outputs": [],
"source": [
"# Function: extract volume amounts from one sentence (version 3) 20230309\n",
"# (1) Fix: numbers separated by commas (such as 100,000) could not be lemmatized and treated as numbers\n",
"# Version 2 updates: see unit_forward_search_V2\n",
"\n",
"# load the unit-conversion tables\n",
"# NOTE: RootFile_Name (the folder holding Unit_Converter_NOAA.xlsx) is assumed to be defined upstream\n",
"os.chdir(RootFile_Name)\n",
"unit_converter = pd.read_excel('Unit_Converter_NOAA.xlsx', sheet_name='Convertable', index_col=[0])\n",
"quantifier_converter = pd.read_excel('Unit_Converter_NOAA.xlsx', sheet_name='quantifier', index_col=[0])\n",
"\n",
"from word2number import w2n\n",
"\n",
"def unit_forward_search(df_sent, unit_kw=unit_words):\n",
"    pattern_amount = '|'.join(unit_kw)\n",
"    L_unit = df_sent['lemma'].str.fullmatch(pattern_amount, case=False)\n",
"    df_unit = df_sent.loc[L_unit, ['lemma']].reset_index(drop=True)\n",
"    amount_sent = []\n",
"    for i in range(L_unit.sum()):\n",
"        coeff2gal = unit_converter.loc[df_unit.loc[i, 'lemma'].upper(), 'To_gallon']\n",
"        a2num_tt = 1\n",
"        num_tt = 1\n",
"        unit_index = df_sent[L_unit].index[i]\n",
"        # look at the two tokens immediately before the unit word\n",
"        df_unitforward = df_sent.loc[unit_index-2:unit_index-1].reset_index(drop=True)\n",
"        # create a copy for df.str.contains/fullmatch etc.\n",
"        df_unitforward_c = df_unitforward.copy()\n",
"\n",
"        # stanza sometimes cannot convert numbers with commas into \"numbers\",\n",
"        # and str.isnumeric does not treat numbers with commas as \"numbers\",\n",
"        # so strip the commas and convert to numbers manually\n",
"        L_comma_num = (df_unitforward['upos'] == 'NUM') & (df_unitforward['lemma'].str.contains(','))\n",
"        df_unitforward.loc[L_comma_num, 'lemma'] = df_unitforward.loc[L_comma_num, 'lemma'].str.replace(',', '').astype(float)\n",
"\n",
"        # recognize and mark all types of numbers in the 'lemma' column\n",
"        L_num = (df_unitforward['upos'] == 'NUM') & (df_unitforward['lemma'].str.isnumeric())\n",
"        # str.isnumeric cannot detect decimals;\n",
"        # convert the strings in the 'lemma' column to floats (if applicable)\n",
"        for j in range(df_unitforward.shape[0]):\n",
"            L_dec = Isnumber(df_unitforward.loc[j, 'lemma'])\n",
"            if L_dec == True:\n",
"                df_unitforward.loc[j, 'lemma'] = float(df_unitforward.loc[j, 'lemma'])\n",
"                L_num[j] = True\n",
"\n",
"        # here the alphabetic number must contain \"K\" (for 1000)\n",
"        L_anum = (df_unitforward['upos'] == 'NUM') & (df_unitforward_c['lemma'].str.isalpha())\n",
"\n",
"        if L_num.any() == True:\n",
"            df_num = df_unitforward[L_num].reset_index(drop=True)\n",
"            df_num['lemma'].replace(',', '', regex=True, inplace=True)  # remove commas from words\n",
"            for j in range(df_num.shape[0]):\n",
"                num_i = df_num.loc[j, 'lemma']\n",
"                num_tt = num_tt * float(num_i)\n",
"\n",
"        if L_anum.any() == True:\n",
"            df_anum = df_unitforward[L_anum].reset_index(drop=True)\n",
"            for j in range(df_anum.shape[0]):\n",
"                anum_i = df_anum.loc[j, 'lemma']\n",
"                try:\n",
"                    a2num_i = w2n.word_to_num(anum_i)\n",
"                except ValueError:\n",
"                    # fall back to the project's quantifier table (e.g. 'k' -> 1000)\n",
"                    anum_i = df_anum.loc[j, 'lemma'].lower()\n",
"                    try:\n",
"                        a2num_i = quantifier_converter.loc[anum_i, 'To_number']\n",
"                    except KeyError:\n",
"                        a2num_i = -99999999  # abnormal value to flag an error\n",
"                except Exception:\n",
"                    a2num_i = -99999999  # abnormal value to flag an error\n",
"                a2num_tt = a2num_tt * a2num_i\n",
"\n",
"        amount_sent.append(float(num_tt * a2num_tt * coeff2gal))\n",
"    return amount_sent"
]
},
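{
"cell_type": "markdown",
"id": "7a8b9c0d-0001-4a10-8a10-aa1000000001",
"metadata": {},
"source": [
"An end-to-end sketch of the amount extraction on one made-up sentence. It assumes the stanza pipeline `nlp` and the unit tables above are loaded, and that `Unit_Converter_NOAA.xlsx` maps BBL to 42 gallons (the standard barrel); the exact output depends on that spreadsheet."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a8b9c0d-0002-4a10-8a10-aa1000000002",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: '25 bbl' should come out as 25 * 42 = 1050 gallons\n",
"doc = nlp('The vessel discharged approximately 25 bbl of diesel.')\n",
"for sent in doc.sentences:\n",
"    df_sent = sent2df(sent)\n",
"    print(unit_forward_search(df_sent, unit_words))  # expected: [1050.0]"
]
},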
{
"cell_type": "code",
"execution_count": 31,
"id": "a0b0e83f",
"metadata": {},
"outputs": [],
"source": [
"def carry_index(df_sent, sent, avoid_words):\n",
"    # work on a copy so the caller's keyword list is not mutated\n",
"    avoid_words = list(avoid_words)\n",
"    if 'on board' in sent.text:\n",
"        avoid_words.append('board')\n",
"    pattern_carry = '|'.join(avoid_words)\n",
"    ll_carry = df_sent['lemma'].str.contains(pattern_carry, case=False)\n",
"    carry_ind = ll_carry.index[ll_carry == True].tolist()\n",
"    return carry_ind"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "16199127",
"metadata": {},
"outputs": [],
"source": [
"def kw_plt(df_sent):\n",
"    # flag sentences that mention a 'potential' (maximum possible) amount\n",
"    L_potential = df_sent['lemma'].str.contains('potential').any()\n",
"    return L_potential"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "b4092ab1",
"metadata": {},
"outputs": [],
"source": [
"def NER2df(doc):\n",
"    # Collect the named entities of a stanza Document into a DataFrame\n",
"    df_sent = pd.DataFrame()\n",
"    for ent in doc.ents:\n",
"        temp = pd.DataFrame.from_dict(ent.to_dict(), orient='index')\n",
"        df_sent = pd.concat([df_sent, temp], axis=1, ignore_index=False)\n",
"    df_sent = df_sent.T.reset_index(drop=True)\n",
"    return df_sent"
]
},
{
"cell_type": "markdown",
"id": "3e5c4e73-e89f-42c7-b431-7cfe3bdce7dd",
"metadata": {},
"source": [
"### Rule-based NLP"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b943ce83-8ff7-461b-ae6d-b4c2f2b4ae8c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Main run 9.1\n",
"# a) modify OAtt extraction\n",
"# b) modify the coefficient for identifying RA and OAtt from 0.95 to 1\n",
"# c) modify variable names\n",
"\n",
"# Main run 8.1\n",
"# a) following PFaL, search RA from the paragraph only\n",
"\n",
"def RA_rb_ext(df_noaa, k, col_text,\n",
"              no_rls_check=True,\n",
"              ptl_large_check=True,\n",
"              ratio_oa_ptl=1,\n",
"              label_UCRA=False):\n",
"\n",
"    L_times_larger_plt = False; if_rls = False; if_no_rls = False\n",
"    RA_rb = -1; CA_rb = -1\n",
"    list_RA = []; list_OA_carry = []\n",
"    sents_unrelease = []; list_sents_rls = []\n",
"\n",
"    text = df_noaa.loc[k, col_text]\n",
"    des = text_preprocess(text)\n",
"\n",
"    if des != 'Unknown':\n",
"        plt_max_RA = df_noaa.loc[k, 'max_ptl_release_gallons']\n",
"        try:\n",
"            doc = nlp(des)  # run annotation over the description\n",
"            for sent in doc.sentences:\n",
"                df_sent = sent2df(sent)\n",
"                # This must stay inside the loop over sentences!\n",
"                Ls_unrelease = Unrelease_check(df_sent, sent, separator_Rstrct=False)[0]\n",
"                sents_unrelease.append(Ls_unrelease)\n",
"\n",
"                # check if this sentence has a release word\n",
"                L_rls_word = release_check(df_sent)\n",
"                L_potential = kw_plt(df_sent) if ptl_large_check == True else False\n",
"\n",
"                # label sentences with release behavior, taking No_Rls into account\n",
"                L_rls_sent = (L_rls_word == True) and (Ls_unrelease == False) and (L_potential == False)\n",
"                list_sents_rls.append(L_rls_sent)\n",
"\n",
"                # Start RA extraction:\n",
"                # give priority to RA searching, do the No_Rls assignment later\n",
"                df_sent['lemma'].replace(',', '', regex=True, inplace=True)  # remove commas from words\n",
"\n",
"                # Check if there are any numeric tokens in this sentence\n",
"                L_num_intext = (df_sent['upos'] == 'NUM').any()\n",
"\n",
"                # Check if this sentence has a carry word\n",
"                L_cry_sent = Carry_sub_V2(df_sent, sent)\n",
"\n",
"                if L_num_intext == True:\n",
"                    list_OA = unit_forward_search(df_sent, unit_words)\n",
"                    list_OA.sort(reverse=True)  # descending\n",
"\n",
"                    if len(list_OA) > 0:\n",
"                        if (L_cry_sent == False) & (L_rls_sent == True):\n",
"                            list_RA.append(max(list_OA))\n",
"                        elif (L_cry_sent == True) & (L_rls_sent == True):\n",
"                            for oil_amount_i in list_OA:\n",
"                                if (oil_amount_i/plt_max_RA <= ratio_oa_ptl):\n",
"                                    list_RA.append(oil_amount_i)\n",
"                                else:\n",
"                                    list_OA_carry.append(oil_amount_i)\n",
"                        elif (L_cry_sent == True):\n",
"                            list_OA_carry.append(max(list_OA))\n",
"\n",
"            # Release identification\n",
"            if_rls = any(list_sents_rls)\n",
"\n",
"            # No-release identification\n",
"            if_no_rls = any(sents_unrelease)\n",
"\n",
"            # Carry amount identification\n",
"            if len(list_OA_carry) > 0:\n",
"                CA_rb = max(list_OA_carry)\n",
"\n",
"            # No-release check\n",
"            if (no_rls_check == True) & (if_no_rls == True):\n",
"                RA_rb = 0\n",
"\n",
"            # RA identification\n",
"            if len(list_RA) > 0:\n",
"                RA_rb = max(list_RA)\n",
"\n",
"            # label Times_larger_plt\n",
"            if (plt_max_RA != -1) and (RA_rb > (3*plt_max_RA)):\n",
"                L_times_larger_plt = True\n",
"\n",
"            # label UCRA (unclear release amount)\n",
"            if (label_UCRA == True) and RA_rb <= 0 and any(list_sents_rls):\n",
"                RA_rb = -2\n",
"\n",
"        except ValueError:\n",
"            RA_rb = 'ValueError'\n",
"        except KeyError:\n",
"            RA_rb = 'KeyError'\n",
"    return [RA_rb, CA_rb, L_times_larger_plt, if_rls, if_no_rls]"
]
},
{
"cell_type": "markdown",
"id": "c2ae063d-f8e1-49ff-acc6-ab77d22a9623",
"metadata": {},
"source": [
"## GPT RA Extraction"
]
},
{
"cell_type": "markdown",
"id": "15a29b2a-1390-45c1-9472-37e297426662",
"metadata": {},
"source": [
"### Call GPT"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2cd2b9cd-c584-4aa2-8dc3-4df4407674c4",
"metadata": {},
"outputs": [],
"source": [
"# GPT model selection\n",
"def gpt_model_select(gpt_model_shortname):\n",
"    if gpt_model_shortname == \"gpt4\":\n",
"        gpt_model = \"gpt-4-1106-preview\"\n",
"        col_gpt_output = 'text_gpt4'\n",
"    elif gpt_model_shortname == \"gpt4-0125\":\n",
"        gpt_model = \"gpt-4-0125-preview\"\n",
"        col_gpt_output = 'text_gpt4_0125'\n",
"    elif gpt_model_shortname == \"gpt35\":\n",
"        gpt_model = \"gpt-3.5-turbo\"\n",
"        col_gpt_output = 'text_gpt35'\n",
"    else:\n",
"        raise ValueError(f\"unknown model shortname: {gpt_model_shortname}\")\n",
"    return gpt_model, col_gpt_output\n",
"\n",
"gpt_model_shortname = 'gpt4-0125'\n",
"gpt_model, col_gpt_output = gpt_model_select(gpt_model_shortname)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c11f6dc9-c85f-4b5a-a2af-14b4dc6234e4",
"metadata": {},
"outputs": [],
"source": [
"def gpt_cost_estimate(gpt_model, n_prompt_tokens, n_completion_tokens, n_total_tokens):\n",
"    # rates in USD per 1K tokens\n",
"    if gpt_model == \"gpt-4-1106-preview\" or gpt_model == \"gpt-4-0125-preview\":\n",
"        costs = (n_prompt_tokens*0.01 + n_completion_tokens*0.03)/1000\n",
"    elif gpt_model == \"gpt-3.5-turbo\":\n",
"        costs = (n_total_tokens*0.00200)/1000\n",
"    return round(costs, 2)"
]
},
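{
"cell_type": "markdown",
"id": "8b9c0d1e-0001-4b10-8b10-bb1000000001",
"metadata": {},
"source": [
"A quick sanity check of the pricing arithmetic (the per-1K-token rates hard-coded above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9c0d1e-0002-4b10-8b10-bb1000000002",
"metadata": {},
"outputs": [],
"source": [
"# GPT-4 preview: (2000*0.01 + 1000*0.03)/1000 = 0.05 USD\n",
"print(gpt_cost_estimate('gpt-4-0125-preview', 2000, 1000, 3000))  # 0.05\n",
"# GPT-3.5: 3000*0.002/1000 = 0.006, rounded to 0.01 USD\n",
"print(gpt_cost_estimate('gpt-3.5-turbo', 2000, 1000, 3000))       # 0.01"
]
},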
{
"cell_type": "code",
"execution_count": 1,
"id": "fb918c65-3342-41e7-9242-a78807ea8b71",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from openai import OpenAI\n",
"\n",
"API_KEY = 'xxxxxx'  # replace 'xxxxxx' with the user's API key\n",
"\n",
"def query_gpt(prompt, model=\"gpt-3.5-turbo\"):\n",
"    \"\"\"\n",
"    Query a chat-completion model.\n",
"\n",
"    :param prompt: The prompt to send to the model.\n",
"    :param model: The model to use; the default is 'gpt-3.5-turbo'.\n",
"    :return: The chat-completion response from the model.\n",
"    \"\"\"\n",
"\n",
"    client = OpenAI(api_key=API_KEY)\n",
"\n",
"    chat_completion = client.chat.completions.create(\n",
"        messages=[\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": prompt,\n",
"            }\n",
"        ],\n",
"        model=model,\n",
"    )\n",
"    return chat_completion"
]
},
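{
"cell_type": "markdown",
"id": "9c0d1e2f-0001-4c10-8c10-cc1000000001",
"metadata": {},
"source": [
"Typical usage, left commented out to avoid spending tokens: the reply text lives in `choices[0].message.content`, and the token counts in `usage` feed `gpt_cost_estimate` above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c0d1e2f-0002-4c10-8c10-cc1000000002",
"metadata": {},
"outputs": [],
"source": [
"# response = query_gpt('Extract the released amount: 500 gallons of diesel spilled.', model=gpt_model)\n",
"# answer_text = response.choices[0].message.content\n",
"# cost = gpt_cost_estimate(gpt_model, response.usage.prompt_tokens,\n",
"#                          response.usage.completion_tokens, response.usage.total_tokens)"
]
},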
{
"cell_type": "markdown",
"id": "e2a83bbf-25ee-4157-80dd-b276284e553b",
"metadata": {},
"source": [
"### Extract OA from GPT eval results"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "5284d4c5-ae7b-411f-949d-3488b36f1070",
"metadata": {},
"outputs": [],
"source": [
"# Version 3: extract the OA amount from GPT results 20231218\n",
"# removed unit adding\n",
"# (requires unit_words and unit_forward_search from the rule-based cells above)\n",
"\n",
"def OA_Ext_From_GPT_V3(df_sent, unit_kw=unit_words):\n",
"    pattern_unit = '|'.join(unit_kw)\n",
"    # True when a known unit token is present in the sentence\n",
"    L_unit_lost = df_sent['lemma'].str.fullmatch(pattern_unit, case=False).any()\n",
"    try:\n",
"        list_OA_GPT = unit_forward_search(df_sent, unit_kw)\n",
"    except Exception:\n",
"        list_OA_GPT = []\n",
"    if len(list_OA_GPT) >= 1:  # for most results with units, the existing function is usable\n",
"        list_OA_GPT.sort(reverse=True)\n",
"    return [list_OA_GPT, L_unit_lost]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "f04de39c-231b-4de8-824e-76e331de712a",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt, OArcv 20240324\n",
"# Version 3.1, adapted from Version 2.1\n",
"# removed unit-lost detection and recovered-amount extraction\n",
"# use the last RA that appears instead of the maximum RA\n",
"\n",
"def get_RA_from_gpt_0324(text, RA_field_name='Final RA', prompt_type=\"short\", separator='\\n'):\n",
"    # Initialization\n",
"    signal_words = {\n",
"        \"short\": ('ra_release', 'ra_total'),\n",
"        \"long\": (RA_field_name.lower(), 'OA_total'.lower())\n",
"    }\n",
"    ra_signal_word, oatt_signal_word = signal_words[prompt_type]\n",
"\n",
"    RA_gpt, OAtt_gpt = -1, -1\n",
"    # Preprocess the text and split it into segments\n",
"    pretreat_text = pretreat_for_nlp(text)\n",
"    segments = pretreat_text.split(separator)\n",
"\n",
"    # go through the segments backward (use the latest RA instead of the largest, because of the new prompts)\n",
"    for segment in reversed(segments):\n",
"        segment = segment.strip(' ')\n",
"        # recognize RA = -1 first\n",
"        if segment.startswith('- ' + RA_field_name + ': Unknown') & (RA_gpt == -1):\n",
"            break\n",
"        else:\n",
"            doc = nlp(segment)\n",
"            for sentence in doc.sentences:\n",
"                sentence_df = sent2df(sentence)\n",
"                contains_RA = ra_signal_word in sentence.text.lower()\n",
"                contains_OAtt = oatt_signal_word in sentence.text.lower()\n",
"\n",
"                if (sentence_df['upos'] == 'NUM').any():\n",
"                    OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                    if len(OA_temp) > 0:\n",
"                        if contains_RA and RA_gpt == -1:\n",
"                            RA_gpt = max(OA_temp)\n",
"                        elif contains_OAtt and OAtt_gpt == -1:\n",
"                            OAtt_gpt = max(OA_temp)\n",
"                if (RA_gpt == -1) & (RA_field_name + ': 0' in sentence.text):  # may be removed in the future\n",
"                    RA_gpt = 0\n",
"    return [RA_gpt, OAtt_gpt]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d5afa350-4dc0-478c-8028-4a3848685024",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt, OArcv 20240324\n",
"# Version 3.1, adapted from Version 2.1\n",
"# removed unit-lost detection and recovered-amount extraction\n",
"# use the last RA that appears instead of the maximum RA\n",
"\n",
"def get_RA_from_long_prompt_0324(text, RA_field_name='Actual RA', prompt_type=\"short\", separator='\\n'):\n",
"    # Initialization\n",
"    signal_words = {\n",
"        \"short\": ('ra_release', 'ra_total'),\n",
"        \"long\": (RA_field_name.lower(), 'OA_total'.lower())\n",
"    }\n",
"    ra_signal_word, oatt_signal_word = signal_words[prompt_type]\n",
"\n",
"    RA_gpt = -1\n",
"    No_spill = False\n",
"    # Preprocess the text and split it into segments\n",
"    pretreat_text = pretreat_for_nlp(text)\n",
"    segments = pretreat_text.split(separator)\n",
"\n",
"    # go through the segments in order\n",
"    for segment in segments:\n",
"        segment = segment.strip(' ')\n",
"        # recognize the explicit no-spill flag first\n",
"        if segment.startswith('- No spill: Yes') & (RA_gpt == -1):  # may be removed in the future\n",
"            RA_gpt = 0\n",
"            No_spill = True\n",
"        else:\n",
"            doc = nlp(segment)\n",
"            for sentence in doc.sentences:\n",
"                sentence_df = sent2df(sentence)\n",
"                contains_RA = ra_signal_word in sentence.text.lower()\n",
"                if (sentence_df['upos'] == 'NUM').any():\n",
"                    OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                    if len(OA_temp) > 0:\n",
"                        if contains_RA and RA_gpt == -1:\n",
"                            RA_gpt = max([RA_gpt] + OA_temp)\n",
"    return [RA_gpt, No_spill]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "e28026ce-a391-4b48-ab46-25a0cb50c4d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sanity check: the generalized signal string equals the original hard-coded literal\n",
"RA_field_name = 'Final RA'\n",
"'- ' + RA_field_name + ': Unknown' == '- Final RA: Unknown'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f3152833-ea20-434a-a2c9-b68fb38143de",
"metadata": {},
"outputs": [],
"source": [
"# Pretreatment of GPT results (text) 20240324\n",
"# Version 1, adapted from process_text_from_gpt Version 2\n",
"def pretreat_for_nlp(text):\n",
"    text = text.replace('\\n\\n', '\\n')\n",
"    text = text.replace('cubic yard', 'cubicyard')\n",
"    text = text.replace('• ', '- ')\n",
"    text = text.strip(' ')\n",
"\n",
"    # add a white space before \"bbl\"\n",
"    if 'bbl' in text.lower() and ' bbl' not in text.lower():\n",
"        text = text.replace('bbl', ' bbl')\n",
"        text = text.replace('BBL', ' bbl')\n",
"\n",
"    while '..' in text:\n",
"        text = text.replace('..', '.')\n",
"\n",
"    # Ensure spaces around hyphens are consistent\n",
"    if text.count('-') != text.count(' - '):\n",
"        text = text.replace('-', ' - ')\n",
"\n",
"    while '  ' in text:\n",
"        text = text.replace('  ', ' ')\n",
"\n",
"    return text"
]
},
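{
"cell_type": "markdown",
"id": "0d1e2f3a-0001-4d10-8d10-dd1000000001",
"metadata": {},
"source": [
"What the pretreatment does to a made-up GPT answer: blank lines collapse, bullet characters become hyphens, and `bbl` gains a leading space so stanza tokenizes it as a separate unit word."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d1e2f3a-0002-4d10-8d10-dd1000000002",
"metadata": {},
"outputs": [],
"source": [
"gpt_answer = '• Final RA: 2,000bbl\\n\\n- OA_total: Unknown'\n",
"print(pretreat_for_nlp(gpt_answer))\n",
"# -> ' - Final RA: 2,000 bbl\\n - OA_total: Unknown'"
]
},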
{
"cell_type": "code",
"execution_count": 2,
"id": "4056cc84-e0b4-446a-9d06-68f94f28a234",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt 20240329\n",
"# Version 2.2, adapted from Version 2.1\n",
"# removed unit-lost detection\n",
"# removed recovered-amount extraction\n",
"\n",
"def get_oil_amount_from_short_prompt(text, ra_signal_word='RA_Release', oatt_signal_word='RA_total'):\n",
"    # Initialization\n",
"    max_RA, max_OAtt = -1, -1\n",
"\n",
"    # Text preprocessing\n",
"    processed_text = process_text_from_gpt(text)\n",
"    segments = processed_text.split('FENDUAN')\n",
"\n",
"    for segment in segments:\n",
"        doc = nlp(segment)\n",
"        for sentence in doc.sentences:\n",
"            sentence_df = sent2df(sentence)\n",
"            has_number = (sentence_df['upos'] == 'NUM').any()\n",
"            contains_RA = ra_signal_word in sentence.text\n",
"            contains_OAtt = oatt_signal_word in sentence.text\n",
"\n",
"            if has_number:\n",
"                OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                if contains_RA:\n",
"                    max_RA = max([max_RA] + OA_temp)\n",
"                elif contains_OAtt:\n",
"                    max_OAtt = max([max_OAtt] + OA_temp)\n",
"\n",
"    return [max_RA, max_OAtt]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31820246-58de-46b3-aa63-ed131591ef27",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt, OArcv 20231218\n",
"# Version 2.1, adapted from Version 2, revised by ChatGPT\n",
"# To distinguish it from the long prompt, 'short prompt' was added to the related function names\n",
"# Removed the unit-adding function in OA_Ext_From_GPT_V3 as it is no longer used\n",
"\n",
"def extract_RA_from_gpt(text, prompt_type=\"short\", extract_recovery=False):\n",
"    # Initialization\n",
"    signal_words = {\n",
"        \"short\": ('ra_release', 'ra_total', 'ra_recover'),\n",
"        \"long\": ('Final RA'.lower(), 'OA_total'.lower(), 'OA_recovered'.lower())\n",
"    }\n",
"    ra_signal_word, oatt_signal_word, oarcv_signal_word = signal_words[prompt_type]\n",
"\n",
"    max_RA, max_OAtt, max_OArv = -1, -1, -1\n",
"\n",
"    # Text preprocessing\n",
"    processed_text = process_text_from_gpt(text)\n",
"    segments = processed_text.split('FENDUAN')\n",
"\n",
"    for segment in segments:\n",
"        doc = nlp(segment)\n",
"        for sentence in doc.sentences:\n",
"            sentence_df = sent2df(sentence)\n",
"            has_number = (sentence_df['upos'] == 'NUM').any()\n",
"            contains_RA = ra_signal_word in sentence.text.lower()\n",
"            contains_OAtt = oatt_signal_word in sentence.text.lower()\n",
"            contains_OArv = oarcv_signal_word in sentence.text.lower() if extract_recovery else False\n",
"\n",
"            if has_number:\n",
"                OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                if contains_RA:\n",
"                    RA_unit_lost = OA_unit_lost\n",
"                    max_RA = max([max_RA] + OA_temp)\n",
"                    if (max_RA == -1) & ('Final RA: 0' in sentence.text):\n",
"                        max_RA = 0\n",
"                elif contains_OAtt:\n",
"                    OAtt_unit_lost = OA_unit_lost\n",
"                    max_OAtt = max([max_OAtt] + OA_temp)\n",
"                elif contains_OArv:\n",
"                    OArv_unit_lost = OA_unit_lost\n",
"                    max_OArv = max([max_OArv] + OA_temp)\n",
"\n",
"    return [max_RA, max_OAtt, max_OArv]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d992f239-b2d1-44af-9936-00be2425b8c5",
"metadata": {},
"outputs": [],
"source": [
"# Pretreatment of GPT results (text) 20230309\n",
"# Version 2\n",
"def process_text_from_gpt(text):\n",
"\n",
"    # text pretreatment: 'FENDUAN' is a sentinel token marking segment boundaries\n",
"    text = text.replace('\\n', 'FENDUAN')\n",
"    text = text.replace('cubic yard', 'cubicyard')\n",
"    text = text.strip(' ')\n",
"\n",
"    # add a white space before \"bbl\"\n",
"    if 'bbl' in text.lower() and ' bbl' not in text.lower():\n",
"        text = text.replace('bbl', ' bbl')\n",
"        text = text.replace('BBL', ' bbl')\n",
"\n",
"    while '..' in text:\n",
"        text = text.replace('..', '.')\n",
"    while '  ' in text:\n",
"        text = text.replace('  ', ' ')\n",
"    while 'FENDUANFENDUAN' in text:\n",
"        text = text.replace('FENDUANFENDUAN', 'FENDUAN')\n",
"\n",
"    # Ensure spaces around hyphens are consistent\n",
"    if text.count('-') != text.count(' - '):\n",
"        text = text.replace('-', ' - ')\n",
"\n",
"    return text"
]
},
{
"cell_type": "markdown",
"id": "3d813b08-d691-46ca-85fa-ec6a0729021e",
"metadata": {},
"source": [
"## Merge rule-based RA and GPT RA"
]
},
{
"cell_type": "markdown",
"id": "efe08337-7f8b-4075-81c6-886e45c9f0df",
"metadata": {},
"source": [
"# RA identification"
]
},
{
"cell_type": "markdown",
"id": "2ce4f722-41a9-4485-bfcc-2b4463595d0a",
"metadata": {},
"source": [
"## Group incidents by their ra_icd and ra_posts"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "017f8c2c-6f55-4011-9845-de502f353ae8",
"metadata": {},
"outputs": [],
"source": [
"def select_verified_excel_sheets(excel_file_path, excel_file_name, sheet_names):\n",
"\n",
"    verified_data_excel_file = pd.ExcelFile(excel_file_path + excel_file_name)\n",
"\n",
"    # Determine the sheets to process: use the provided list, or fall back to the default rule\n",
"    if sheet_names is None:\n",
"        # default rule for determining the sheet names\n",
"        sample_sheet_names = verified_data_excel_file.sheet_names\n",
"        sample_sheet_names = ['S5', 'S6'] + [sheet for sheet in sample_sheet_names if sheet.startswith('F')]\n",
"    else:\n",
"        # use the user-provided list of sheets\n",
"        sample_sheet_names = sheet_names\n",
"    print(\"Sheets being processed:\", sample_sheet_names)\n",
"\n",
"    verified_data_excel_file.close()\n",
"    return sample_sheet_names"
]
},
{
"cell_type": "markdown",
"id": "d794a3f9-9bc7-41a1-a9ac-d6a1293f3445",
"metadata": {
"tags": []
},
"source": [
"## Determine final RA by group"
]
},
{
"cell_type": "markdown",
"id": "0f80d2b3-4cb4-4f9d-b68a-1abc6775e57f",
"metadata": {
"tags": []
},
"source": [
"Define initial conditions"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "29efd429-ee52-4c20-862c-72ff20653a35",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"col_RA_icd_Comb_Srch_GPT = 'RA_merge_icd_rb_gpt'\n",
"col_RA_post_Comb_Srch_GPT = 'RA_merge_posts_rb_gpt'\n",
"\n",
"col_RA_Comb_des_post = 'ra_final'\n",
"col_source_RA_comb = 'source_final_ra'\n",
"col_category_icds = 'category_icds'\n",
"col_source_RA_des = 'source_RA_merge_icd'\n",
"col_source_RA_post = 'source_RA_merge_posts'\n",
"col_vrf_RAs_des_post = 'RA_Comb_des_post_Manual'\n",
"n_icd_unvrfd = 0"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "306px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}