{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "0b86191a-960b-48cb-bef0-6ac54eb72e42", | |
"metadata": {}, | |
"source": [ | |
"# Initialization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "934e7ce5-0de1-486f-89c5-70e27212b339", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import os\n", | |
"from tqdm import tqdm\n", | |
"import warnings" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a1b9f02b-8071-4f3b-955a-9360d200155b", | |
"metadata": {}, | |
"source": [ | |
"Replace the folder path with the absolute address of the folder on your local disk. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "a39b698c-9212-41dc-ae4f-e47e6be5f6a7", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"RootFile_Name = 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\\\'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a180235f-948a-4880-a695-6627f9e2bcc7", | |
"metadata": {}, | |
"source": [ | |
"Change folder, run custom functions, define folder paths and filenames" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "4efab6c8-fecb-4aa1-a1f8-91020cd3147a", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-07-06 19:56:41 INFO: Loading these models for language: en (English):\n", | |
"========================\n", | |
"| Processor | Package |\n", | |
"------------------------\n", | |
"| tokenize | combined |\n", | |
"| pos | combined |\n", | |
"| lemma | combined |\n", | |
"========================\n", | |
"\n", | |
"2024-07-06 19:56:41 INFO: Use device: cpu\n", | |
"2024-07-06 19:56:41 INFO: Loading: tokenize\n", | |
"2024-07-06 19:56:41 INFO: Loading: pos\n", | |
"2024-07-06 19:56:41 INFO: Loading: lemma\n", | |
"2024-07-06 19:56:41 INFO: Done loading processors!\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Directory 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\description\\' already exists.\n", | |
"Directory 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\posts\\' already exists.\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(RootFile_Name)\n", | |
"\n", | |
"# Read custom functions used in the RA candidate extraction\n", | |
"%run func_set_general.ipynb\n", | |
"%run func_set_ra_eval.ipynb\n", | |
"\n", | |
"# Better to separately create folders for both descriptions and posts.\n", | |
"# use the created folder path to replace the following one\n", | |
"folder_path_description = RootFile_Name +'description\\\\'\n", | |
"folder_path_posts = RootFile_Name +'posts\\\\'\n", | |
"check_and_create_directory(folder_path_description)\n", | |
"check_and_create_directory(folder_path_posts)\n", | |
"folder_path_manual = RootFile_Name+'manual\\\\'" | |
] | |
}, | |
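{
"cell_type": "markdown",
"id": "f1a2b3c4-d5e6-4f70-8a91-b2c3d4e5f601",
"metadata": {},
"source": [
"`check_and_create_directory` is defined in `func_set_general.ipynb` (loaded via `%run` above). For reference, a minimal sketch of its assumed behavior, matching the messages printed in the output; the real definition may differ:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1a2b3c4-d5e6-4f70-8a91-b2c3d4e5f602",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of check_and_create_directory; the actual definition lives in func_set_general.ipynb\n",
"import os\n",
"\n",
"def check_and_create_directory(path):\n",
"    if os.path.isdir(path):\n",
"        print(f\"Directory '{path}' already exists.\")\n",
"    else:\n",
"        os.makedirs(path)  # create intermediate folders as needed\n",
"        print(f\"Directory '{path}' was created.\")"
]
},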
{ | |
"cell_type": "markdown", | |
"id": "1b1711f9-4941-4b90-a1bd-9634bfad968a", | |
"metadata": {}, | |
"source": [ | |
"# Run the following code twice for description and posts separately\n", | |
"the first time is for processing description (by setting type_ra_ext = 'description')\n", | |
"\n", | |
"the second time is for posts (by setting type_ra_ext = 'post')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "dc0fc3cc-3b0e-4439-8fcb-d70c43c2070f", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Select if you want to process descriptions or posts. Note that you must process descriptions before posts\n", | |
"type_ra_ext = 'description'\n", | |
"# type_ra_ext = 'post'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "62f516c5-7099-4aa5-8cad-4c42753f2830", | |
"metadata": { | |
"collapsed": true, | |
"jupyter": { | |
"outputs_hidden": true | |
}, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'filename_prefix': 'incident',\n", | |
" 'col_text': 'description',\n", | |
" 'col_date': 'open_date',\n", | |
" 'col_ra_gpt_from_sample': 'RA_icd_GPT_OA_des',\n", | |
" 'col_ra_gpt': 'RA_icd_gpt',\n", | |
" 'col_ra_merge': 'RA_merge_icd_rb_gpt',\n", | |
" 'col_source_ra_merge': 'source_RA_merge_icd',\n", | |
" 'sheetnames_mvrfd_ext': ['S9', 'S10', 'S11', 'S12', 'S13', 'S14'],\n", | |
" 'col_refer': 'id',\n", | |
" 'col_ra_gpt_oa_text': 'RA_gpt_des_w_oa',\n", | |
" 'col_threat': 'threat',\n", | |
" 'col_sub': 'commodity',\n", | |
" 'col_text_wt_traj': 'des_wt_traj',\n", | |
" 'if_text_has_hypothesis': 'trj_prd',\n", | |
" 'col_oa_text_from_sample': 'OA_des',\n", | |
" 'col_oa_text': 'des_w_oa',\n", | |
" 'col_ra_rb': 'RA_rb_des_wt_traj',\n", | |
" 'col_oatt_rb': 'CA_rb_des_wt_traj',\n", | |
" 'col_if_tl_plt': 'Times_larger_plt',\n", | |
" 'col_if_rls': 'Rls_doc',\n", | |
" 'col_if_no_rls': 'No_release',\n", | |
" 'if_do_no_rls_check': True,\n", | |
" 'col_oatt_gpt_ext': 'OAtt_GPT_Ext',\n", | |
" 'col_oarv_gpt_ext': 'OArv_GPT_Ext',\n", | |
" 'col_if_oa_irlvt_gpt': 'OA_irlvt_GPT',\n", | |
" 'col_if_oa_unit_lost': 'OA_unit_lost',\n", | |
" 'col_id_of_sample': 'id',\n", | |
" 'col_text_gpt35': 'text_gpt35_description',\n", | |
" 'col_ra_gpt_ext': 'RA_gpt_des_w_oa',\n", | |
" 'col_vrf_ra': 'RA_des_Manual',\n", | |
" 'col_id_of_vrfd_ra': 'id',\n", | |
" 'col_ra_source_of_vrfd_ra': 'Source'}" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# loading variables\n", | |
"if type_ra_ext == 'description':\n", | |
" folder_path = folder_path_description\n", | |
"else:\n", | |
" folder_path = folder_path_posts\n", | |
"\n", | |
"vars = define_variables(type_ra_ext)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "779eeea2-9d86-4bc5-9ad0-13c2fa360b23", | |
"metadata": {}, | |
"source": [ | |
"# Identify oil spill-related incidents/posts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "24b254ba-0318-41e0-b6c1-f85dbfcd5061", | |
"metadata": {}, | |
"source": [ | |
"## Description" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "95258389-1847-48e8-9d6e-728250c3b900", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The size of original dataset is 4473\n", | |
"The columns of original dataset include Index(['id', 'open_date', 'name', 'location', 'lat', 'lon', 'threat', 'tags',\n", | |
" 'commodity', 'measure_skim', 'measure_shore', 'measure_bio',\n", | |
" 'measure_disperse', 'measure_burn', 'max_ptl_release_gallons', 'posts',\n", | |
" 'description'],\n", | |
" dtype='object')\n", | |
"The number of incidents in refined dataset is 3550\n", | |
"The refined incident were written into 1_incident_level_oil_spill_related_1.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"# read original dataset\n", | |
"os.chdir(folder_path_description)\n", | |
"df_noaa = pd.read_csv('incidents_20231207.csv',parse_dates = ['open_date'], encoding='latin1')\n", | |
"# replace 'incidents_20231207.csv' with new raw incident data downloaded from NOAA ORR (https://incidentnews.noaa.gov/raw/index) if you want to update the dataset\n", | |
"\n", | |
"print('The size of original dataset is',df_noaa.shape[0])\n", | |
"print('The columns of original dataset include',df_noaa.columns)\n", | |
"\n", | |
"# identify oil spill relevance by column \"threat\" and \"commodity\"\n", | |
"\n", | |
"# fillna nan values in the columns to be filtered in advance\n", | |
"df_noaa.loc[:,[vars['col_threat'],vars['col_sub']]] = df_noaa.loc[:,[vars['col_threat'],vars['col_sub']]].fillna('Unknown')\n", | |
"\n", | |
"# filter out the incidents related to keywords-topic\n", | |
"# Search in \"threat\" column\n", | |
"L_threat = keywords_searching(df_noaa[vars['col_threat']], kws_oil, case_sentivity = False)\n", | |
"\n", | |
"# Search in \"commodity\" column\n", | |
"L_risk_sub = keywords_searching(df_noaa[vars['col_sub']], kws_oil, case_sentivity = False)\n", | |
"L_Condi = L_threat | L_risk_sub\n", | |
"\n", | |
"df_noaa = df_noaa[L_Condi].reset_index(drop = True);\n", | |
"print(\"The number of incidents in refined dataset is\", df_noaa.shape[0]);\n", | |
"\n", | |
"if type_ra_ext == 'description':\n", | |
" # write in\n", | |
" os.chdir(folder_path)\n", | |
" filename_oil_related = '1_' + vars['filename_prefix'] + '_level_oil_spill_related_1.xlsx'\n", | |
" df_noaa.to_excel(filename_oil_related, index=False)\n", | |
" print('The refined ' + vars['filename_prefix'] + ' were written into',filename_oil_related)\n", | |
"else:\n", | |
" df_incidents = df_noaa.copy()" | |
] | |
}, | |
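{
"cell_type": "markdown",
"id": "0aa1bb2c-c3dd-44ee-9ff0-a1b2c3d4e511",
"metadata": {},
"source": [
"`keywords_searching` and the keyword list `kws_oil` come from the `%run` function notebooks. A minimal sketch of the assumed behavior: it returns a boolean mask marking rows whose text contains any keyword (the parameter spelling `case_sentivity` is kept to match the calls above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0aa1bb2c-c3dd-44ee-9ff0-a1b2c3d4e512",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of keywords_searching; the real definition and kws_oil are in func_set_general.ipynb\n",
"import pandas as pd\n",
"\n",
"def keywords_searching(text_series, keywords, case_sentivity=False):\n",
"    # True where any keyword occurs in that row's text\n",
"    mask = pd.Series(False, index=text_series.index)\n",
"    for kw in keywords:\n",
"        mask |= text_series.str.contains(kw, case=case_sentivity, regex=False)\n",
"    return mask"
]
},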
{ | |
"cell_type": "markdown", | |
"id": "7d93e47d-cdeb-4d09-8496-73ef51473391", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"## Posts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "54316e2f-fe60-40ea-962f-440a52194f2b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"if type_ra_ext == 'post':\n", | |
"\n", | |
" # read the original post\n", | |
" os.chdir(folder_path_posts)\n", | |
" df_noaa = pd.read_excel('posts_raw_all.xlsx',parse_dates = ['post date'])\n", | |
" # replace 'posts_raw_all.xlsx' with new raw post data if you want to update the dataset\n", | |
" print('The size of original dataset is',df_noaa.shape[0])\n", | |
" print('The columns of original dataset include',df_noaa.columns)\n", | |
"\n", | |
" # Oil spill incidents need to be finished before conducting this filter\n", | |
" IF_post_is_oil = pd.Series([False] * df_noaa.shape[0])\n", | |
" for k in tqdm(range(df_noaa.shape[0])):\n", | |
" id_noaa_i = df_noaa.loc[k,'noaa id']\n", | |
" if id_noaa_i in df_incidents['id'].values:\n", | |
" IF_post_is_oil[k] = True;\n", | |
" df_noaa = df_noaa.loc[IF_post_is_oil,:]\n", | |
" print(\"The number of posts in refined dataset is\", df_noaa.shape[0])\n", | |
" \n", | |
" # write in\n", | |
" os.chdir(folder_path_posts)\n", | |
" # Assuming df is your DataFrame and 'url_column' is the name of the column with long URLs\n", | |
" # df_noaa[vars['col_text']] = \"'\" + df_noaa[vars['col_text']].astype(str)\n", | |
"\n", | |
" filename_oil_related = '1_' + vars['filename_prefix'] + '_level_oil_spill_related_1.xlsx'\n", | |
" # Now write to Excel\n", | |
" df_noaa.to_excel(filename_oil_related, engine='openpyxl', index=False)\n", | |
" \n", | |
" # df_noaa.to_excel(filename_oil_related, index=False)\n", | |
" print('The refined ' + vars['filename_prefix'] + ' were written into:',filename_oil_related)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9087429e-5b2f-4633-a0a4-5d2aeddeff2d", | |
"metadata": {}, | |
"source": [ | |
"# Rule-based RA extraction" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "af01a8f2-c8c2-4c25-8277-3dc1d048d270", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"## Pretreatment: remove hypothetical sentences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "3f5b4ec3-5d82-49a4-8778-837fdbafaa5a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Start rule-based RA candidates for posts\n", | |
"Size of df: 21934 9\n", | |
"Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts']\n", | |
"Memory Usage (Total in MB): 34.96\n" | |
] | |
} | |
], | |
"source": [ | |
"# read the post data\n", | |
"print('Start rule-based RA candidates for ' + vars['filename_prefix'])\n", | |
"os.chdir(folder_path)\n", | |
"df_noaa = pd.read_excel(filename_oil_related,parse_dates = [vars['col_date']])\n", | |
"\n", | |
"# Generate the summary\n", | |
"df_summary = summarize_large_dataframe(df_noaa)" | |
] | |
}, | |
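{
"cell_type": "markdown",
"id": "1bc2de3f-a4b5-46c7-88d9-e0f1a2b3c421",
"metadata": {},
"source": [
"`summarize_large_dataframe` prints the size, column names, and memory footprint shown above. A minimal sketch of the assumed behavior; the actual helper (including what it returns) is defined in the function notebooks:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bc2de3f-a4b5-46c7-88d9-e0f1a2b3c422",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of summarize_large_dataframe, matching the printed summary above\n",
"import pandas as pd\n",
"\n",
"def summarize_large_dataframe(df):\n",
"    print('Size of df:', df.shape[0], df.shape[1])\n",
"    print('Column Names:', list(df.columns))\n",
"    print('Memory Usage (Total in MB):', round(df.memory_usage(deep=True).sum() / 1024**2, 2))\n",
"    # returning a per-column dtype table is an assumption about the real helper\n",
"    return pd.DataFrame({'dtype': df.dtypes.astype(str)})"
]
},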
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "8759762c-302c-44a8-9247-477b271686d3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The column 'post content' exists in the DataFrame.\n", | |
"The column 'trj_prd' does not exist in the DataFrame. Creating and filling it with the specified default value.\n", | |
"The column 'post_wt_traj' does not exist in the DataFrame. Creating and filling it with the specified default value.\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_text'], False)\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['if_text_has_hypothesis'], False)\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_text_wt_traj'], np.nan)" | |
] | |
}, | |
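{
"cell_type": "markdown",
"id": "2cd3ef4a-b5c6-47d8-99ea-f1a2b3c4d531",
"metadata": {},
"source": [
"`ensure_column_exists` adds a column filled with a default value when it is missing, as the messages above indicate. A minimal sketch of the assumed behavior:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cd3ef4a-b5c6-47d8-99ea-f1a2b3c4d532",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of ensure_column_exists, matching the messages printed above\n",
"def ensure_column_exists(df, col_name, default_value):\n",
"    if col_name in df.columns:\n",
"        print(f\"The column '{col_name}' exists in the DataFrame.\")\n",
"    else:\n",
"        print(f\"The column '{col_name}' does not exist in the DataFrame. \"\n",
"              'Creating and filling it with the specified default value.')\n",
"        df[col_name] = default_value\n",
"    return df"
]
},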
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "35c8b19f-eb60-426d-b9c3-52792e8b1939", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Exluding the hypothetical sentences...\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████████████████████████████████████████████████████████████████████| 21934/21934 [2:01:59<00:00, 3.00it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Done.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# exclude hypothetical sentences\n", | |
"print('Exluding the hypothetical sentences...')\n", | |
"for k in tqdm(range(df_noaa.shape[0])):\n", | |
" if pd.isna(df_noaa.loc[k,vars['col_text_wt_traj']]):\n", | |
" str_real_sents = 'Unknown'\n", | |
" des = adjust_format(df_noaa.loc[k, vars['col_text']])\n", | |
" doc = nlp(des)\n", | |
" for sent in doc.sentences:\n", | |
" df_sent = sent2df(sent)\n", | |
" # skip the sentence if it is regarded as traj prediction\n", | |
" L_predict_sent = trajectory_prediction(df_sent)\n", | |
" if L_predict_sent:\n", | |
" df_noaa.loc[k, vars['if_text_has_hypothesis']] = True\n", | |
" continue\n", | |
" # Take the sentences that don't contain Traj prediction\n", | |
" str_real_sent_i = sent.text\n", | |
" str_real_sents = str_real_sent_i if str_real_sents == 'Unknown' else str_real_sents + ' ' + str_real_sent_i;\n", | |
" df_noaa.loc[k, vars['col_text_wt_traj']] = str_real_sents\n", | |
"print(\"Done.\")" | |
] | |
}, | |
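{
"cell_type": "markdown",
"id": "3de4fa5b-c6d7-48e9-8afb-a2b3c4d5e641",
"metadata": {},
"source": [
"The loop above depends on the stanza pipeline `nlp` (whose loading log appears earlier) and two helpers from the function notebooks. A hedged sketch of how they are assumed to fit together; the keyword set in `trajectory_prediction` below is purely illustrative, and the real rules are in `func_set_ra_eval.ipynb`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3de4fa5b-c6d7-48e9-8afb-a2b3c4d5e642",
"metadata": {},
"outputs": [],
"source": [
"# Assumed setup of the stanza pipeline whose loading log appears above\n",
"import stanza\n",
"import pandas as pd\n",
"\n",
"# stanza.download('en')  # one-time model download\n",
"nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False)\n",
"\n",
"def sent2df(sent):\n",
"    # One row per word, with the token-level annotations the rules inspect (sketch)\n",
"    return pd.DataFrame([{'text': w.text, 'lemma': w.lemma, 'upos': w.upos} for w in sent.words])\n",
"\n",
"def trajectory_prediction(df_sent):\n",
"    # Illustrative placeholder: flag sentences that read like trajectory forecasts\n",
"    hypothetical_lemmas = {'will', 'would', 'forecast', 'predict', 'trajectory'}\n",
"    return bool(set(df_sent['lemma'].dropna().str.lower()) & hypothetical_lemmas)"
]
},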
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "99ccf4d4-7b16-4564-bce4-93dc230f14aa", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Size of df: 21934 11\n", | |
"Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj']\n", | |
"Memory Usage (Total in MB): 63.53\n", | |
"The refined posts were written into: 2_posts_level_exclude_hypo_sents_1.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"# Write in\n", | |
"os.chdir(folder_path)\n", | |
"filename = '2_' + vars['filename_prefix'] + '_level_exclude_hypo_sents_1.xlsx'\n", | |
"\n", | |
"# Now write to Excel\n", | |
"df_noaa.to_excel(filename, engine='openpyxl', index=False)\n", | |
"\n", | |
"# df_noaa.to_excel(filename, index=False, urlbox=False)\n", | |
"df_summary = summarize_large_dataframe(df_noaa)\n", | |
"print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b5bf21d6-54d3-4795-8afd-9cc0eec835fd", | |
"metadata": {}, | |
"source": [ | |
"## Utilize rule-based algorithm to extract oil spill amount" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "1add3b76-2365-438b-98e9-b1dfb5f41114", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The column 'Times_larger_plt' does not exist in the DataFrame. Creating and filling it with the specified default value.\n", | |
"The column 'Rls_doc' does not exist in the DataFrame. Creating and filling it with the specified default value.\n", | |
"The column 'No_release' does not exist in the DataFrame. Creating and filling it with the specified default value.\n", | |
"The column 'CA_rb_post_wt_traj' does not exist in the DataFrame. Creating and filling it with the specified default value.\n", | |
"The column 'RA_rb_post_wt_traj' does not exist in the DataFrame. Creating and filling it with the specified default value.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Check existence of required columns and create them if needed\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_if_tl_plt'], False)\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_if_rls'], False)\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_if_no_rls'], False)\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_oatt_rb'], np.nan)\n", | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_ra_rb'], np.nan)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "3565c74a-fc8b-4cb0-ab63-043b8e1f574c", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ptl_max_RA was added.\n" | |
] | |
} | |
], | |
"source": [ | |
"# remove column \"max_ptl_release_gallons\" in df_noaa\n", | |
"if 'ptl_max_RA' in df_noaa.columns:\n", | |
" df_noaa.drop(columns = ['ptl_max_RA'],inplace = True)\n", | |
"\n", | |
"# migrate potential maximum release amount\n", | |
"# For posts: migrate potential maximum release amount\n", | |
"if 'max_ptl_release_gallons' not in df_noaa.columns:\n", | |
" os.chdir(folder_path_description);\n", | |
" try:\n", | |
" df_icd = pd.read_csv('incidents_20231207.csv',parse_dates = ['open_date'], encoding='latin1')\n", | |
" except FileNotFoundError:\n", | |
" df_icd = pd.read_excel('incidents_20231207.xlsx',parse_dates = ['open_date'])\n", | |
" df_icd['max_ptl_release_gallons'].fillna(-1,inplace = True)\n", | |
" df_ptl_temp = df_icd[['id','max_ptl_release_gallons']]\n", | |
" df_noaa = pd.merge(df_noaa,\n", | |
" df_ptl_temp,\n", | |
" left_on = 'noaa id',\n", | |
" right_on = 'id',\n", | |
" how = 'left')\n", | |
" df_noaa.drop(columns = ['id'],inplace=True)\n", | |
" print('The column of potential maximum RA was added.')\n", | |
"else:\n", | |
" print('The column of potential maximum RA exists. No need for adding.')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "454c6831-7f5f-4b2a-af73-feadb5e12fe0", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████████████████████████████████████████████████████████████████████| 21934/21934 [2:06:38<00:00, 2.89it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Call rule-based function \n", | |
"for k in tqdm(range(df_noaa.shape[0])): #\n", | |
" if pd.isna(df_noaa.loc[k,vars['col_ra_rb']]):\n", | |
" [RA_rb, CA_rb, L_times_larger_plt,if_rls,if_no_rls] = RA_rb_ext(\n", | |
" df_noaa = df_noaa,\n", | |
" k = k,\n", | |
" col_text = vars['col_text_wt_traj'],\n", | |
" no_rls_check = vars['if_do_no_rls_check'],\n", | |
" ptl_large_check = True,\n", | |
" ratio_oa_ptl = 1,\n", | |
" label_UCRA = False);\n", | |
" \n", | |
" df_noaa.loc[k,vars['col_ra_rb']] = RA_rb\n", | |
" df_noaa.loc[k,vars['col_oatt_rb']] = CA_rb\n", | |
" df_noaa.loc[k,vars['col_if_tl_plt']] = L_times_larger_plt\n", | |
" df_noaa.loc[k,vars['col_if_rls']] = if_rls\n", | |
" df_noaa.loc[k,vars['col_if_no_rls']] = if_no_rls" | |
] | |
}, | |
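{
"cell_type": "markdown",
"id": "4ef5ab6c-d7e8-49fa-9b0c-b3c4d5e6f751",
"metadata": {},
"source": [
"`RA_rb_ext` (called above, defined in `func_set_ra_eval.ipynb`) returns the rule-based release amount (RA), the carried/total amount (CA), and three flags. Purely as an illustration of the kind of pattern such rules build on, here is a minimal quantity-plus-unit extractor; the unit list and conversion factors are assumptions, not the notebook's actual rules. For example, `extract_amounts_gallons('spilled about 5,000 gallons of diesel')` returns `[5000.0]`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ef5ab6c-d7e8-49fa-9b0c-b3c4d5e6f752",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: find 'number + volume unit' mentions and convert them to gallons\n",
"import re\n",
"\n",
"UNIT_TO_GALLONS = {'gallon': 1.0, 'gallons': 1.0, 'barrel': 42.0, 'barrels': 42.0,\n",
"                   'liter': 0.264172, 'liters': 0.264172}\n",
"QTY_PATTERN = re.compile(r'([\\d,]+(?:\\.\\d+)?)\\s+(gallons?|barrels?|liters?)', re.IGNORECASE)\n",
"\n",
"def extract_amounts_gallons(text):\n",
"    amounts = []\n",
"    for num, unit in QTY_PATTERN.findall(text):\n",
"        amounts.append(float(num.replace(',', '')) * UNIT_TO_GALLONS[unit.lower()])\n",
"    return amounts"
]
},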
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "c03ee5fe-dd35-4bc8-91f3-f03a1bcbd252", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Size of df: 21934 17\n", | |
"Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA']\n", | |
"Memory Usage (Total in MB): 64.77\n", | |
"The refined posts were written into: 3_posts_level_rule_based_candidate_1.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"# Write in\n", | |
"os.chdir(folder_path)\n", | |
"filename = '3_' + vars['filename_prefix'] + '_level_rule_based_candidate_1.xlsx'\n", | |
"df_noaa.to_excel(filename, engine='openpyxl', index=False)\n", | |
"# Generate the summary\n", | |
"df_summary = summarize_large_dataframe(df_noaa)\n", | |
"print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f6386db8-a33d-4583-8bdf-f7317bb49ebe", | |
"metadata": { | |
"heading_collapsed": true | |
}, | |
"source": [ | |
"# LLM-based RA extraction" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ae457c3f-d1e0-4927-9419-f01566e605f6", | |
"metadata": {}, | |
"source": [ | |
"## Pretreatment: remove OA-lacking sentences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "8e250662-4b61-492a-a87c-4ebad3ce4121", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = '3_' + vars['filename_prefix'] + '_level_rule_based_candidate_1.xlsx'\n", | |
"os.chdir(folder_path)\n", | |
"df_noaa = pd.read_excel(filename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "4b9613bc-4788-4dc6-bfab-a79e60e8003d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The column 'post_w_oa' does not exist in the DataFrame. Creating and filling it with the specified default value.\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_oa_text'], np.nan)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "64b89a11-1fb8-4ac2-a5c6-772b31350014", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████████████████████████████████████████████████████████████████████| 21934/21934 [2:00:07<00:00, 3.04it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"# a) Extract sentences contains oil amount(OA) from description(or post text) \n", | |
"for k in tqdm(range(df_noaa.shape[0])):\n", | |
" if pd.isna(df_noaa.loc[k,vars['col_oa_text']]):\n", | |
" str_OA_sents = 'Unknown';\n", | |
" text = adjust_format(df_noaa.loc[k, vars['col_text_wt_traj']])\n", | |
" try:\n", | |
" doc = nlp(text) # run annotation over a sentence\n", | |
" for sent in doc.sentences:\n", | |
" df_sent = sent2df(sent)\n", | |
" L_text_contain_oa = if_text_contain_oa(df_sent)\n", | |
" if L_text_contain_oa == True:\n", | |
" str_OA_sent_i = sent.text;\n", | |
" str_OA_sents = str_OA_sent_i if str_OA_sents == 'Unknown' else str_OA_sents + ' ' + str_OA_sent_i;\n", | |
" except ValueError:\n", | |
" print('ValueError happened! id: ','')\n", | |
" df_noaa.loc[k,vars['col_oa_text']] = str_OA_sents\n", | |
"# df_noaa[vars['col_oa_text']].fillna('Unknown',inplace = True)" | |
] | |
}, | |
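{
"cell_type": "markdown",
"id": "5fa6bc7d-e8f9-4a0b-8c1d-c4d5e6f7a861",
"metadata": {},
"source": [
"`if_text_contain_oa` decides whether a parsed sentence mentions an oil amount. A minimal sketch under the assumption that `df_sent` is the word table produced by `sent2df`; the real check in the function notebooks is more elaborate:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fa6bc7d-e8f9-4a0b-8c1d-c4d5e6f7a862",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of if_text_contain_oa: does the sentence contain a number followed by a volume unit?\n",
"OA_UNITS = {'gallon', 'gallons', 'barrel', 'barrels', 'liter', 'liters'}\n",
"\n",
"def if_text_contain_oa(df_sent):\n",
"    words = list(df_sent['text'].str.lower())\n",
"    for prev, curr in zip(words, words[1:]):\n",
"        if curr in OA_UNITS and prev.replace(',', '').replace('.', '').isdigit():\n",
"            return True\n",
"    return False"
]
},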
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "e1b5e463-4e34-4057-90b1-db5f407bb251", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Size of df: 21934 18\n", | |
"Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa']\n", | |
"Memory Usage (Total in MB): 66.39\n", | |
"The refined posts were written into: 4_posts_level_oa_sents_1.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"# Write in\n", | |
"filename = '4_' + vars['filename_prefix'] + '_level_oa_sents_1.xlsx'\n", | |
"df_noaa.to_excel(filename, engine='openpyxl', index=False)\n", | |
"# Generate the summary\n", | |
"df_summary = summarize_large_dataframe(df_noaa)\n", | |
"print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "f870dbc1-3c94-4391-a824-ace9b3a1d9fc", | |
"metadata": {}, | |
"source": [ | |
"## Utilize LLM to extract oil spill amount\n", | |
"\n", | |
"In this section, incident descriptions/posts are processed by a Large Language Model (LLM) to generate LLM-based RA candidates. However, due to the inherent randomness of LLMs, results may vary across incidents. Users can either load our LLM-based RA candidates to ensure repeatability or run the LLM model themselves. We use the GPT-3.5-turbo API for this purpose. To run it, please refer to OpenAI's API instructions (https://platform.openai.com/docs/models) and update the API key (variable name \"API_KEY\") in the \"Call GPT\" section of \"func_set_ra_eval.ipynb\"." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9b8fc095-dfea-4945-90c5-a25ab3364103", | |
"metadata": {}, | |
"source": [ | |
"### Option 1: load the prepared LLM-based RA candidates" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "aa34d56d-78ab-406b-8c5f-62dc38da9137", | |
"metadata": {}, | |
"source": [ | |
"Read in RA_GPT" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "3c265f33-2489-4cd2-8e95-53686e823943", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The RA_GPT data of description has been read\n" | |
] | |
} | |
], | |
"source": [ | |
"# read gpt data (reference)\n", | |
"reference_path_icd = RootFile_Name +'gpted\\\\'\n", | |
"os.chdir(reference_path_icd)\n", | |
"df_gpted = pd.read_excel(type_ra_ext+'_gpted.xlsx')\n", | |
"print('The RA_GPT data of ' + type_ra_ext + ' has been read')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "9c7f4100-45da-4667-8560-4f7a76fe3d50", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Size of df: 21934 4\n", | |
"Column Names: ['NPost id', 'old_npost_id_2parts', 'text_gpt35_post', 'RA_gpt_post_w_oa']\n", | |
"Memory Usage (Total in MB): 4.54\n" | |
] | |
} | |
], | |
"source": [ | |
"# Generate the summary\n", | |
"df_summary = summarize_large_dataframe(df_gpted)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "34164c19-3f8d-4070-92df-20e6df317b47", | |
"metadata": {}, | |
"source": [ | |
"Merge RA_GPT to incident-level data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "be904bca-1282-43a3-a064-18a2cdfa48da", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read incidents/posts\n", | |
"os.chdir(folder_path)\n", | |
"df_noaa = pd.read_excel(filename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "95b8c983-e260-46be-9789-41c35d8f3753", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# df_noaa = ensure_column_exists(df_noaa, vars['col_id_of_sample'], np.nan)\n", | |
"# df_noaa = ensure_column_exists(df_noaa, vars['col_text_gpt35'], np.nan)\n", | |
"# df_noaa = ensure_column_exists(df_noaa, vars['col_ra_gpt_ext'], np.nan)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "087ba90c-8651-459a-b2ae-fd4b4a8a3de8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['NPost id', 'text_gpt35_post', 'RA_gpt_post_w_oa']\n" | |
] | |
} | |
], | |
"source": [ | |
"col_merge_gpt = [vars['col_id_of_sample'], vars['col_text_gpt35'],vars['col_ra_gpt_ext']]\n", | |
"print(col_merge_gpt)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "b742cafd-a86f-41a9-9634-ce0ce9696437", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"21934" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# migrate OA_des and RA_GPT\n", | |
"df_gpted_for_merge = df_gpted[col_merge_gpt]\n", | |
"df_noaa_w_gpt = pd.merge(left = df_noaa,\n", | |
" right = df_gpted_for_merge,\n", | |
" how = 'left',\n", | |
" on = vars['col_id_of_sample']\n", | |
" )\n", | |
"df_noaa_w_gpt.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "c3375965-9225-47d0-9236-0003f79214f4", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 post need GPT extraction.\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df_noaa_w_gpt[vars['col_ra_gpt_ext']].isna().sum(), type_ra_ext + ' need GPT extraction.')\n", | |
"df_noaa = df_noaa_w_gpt.copy()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "74433654-2d43-4b29-ba72-c847c4449e44", | |
"metadata": {}, | |
"source": [ | |
"### Option 2: Call GPT through API to process the text " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "24ac1695-1391-451c-9b24-a3bcdb054ccd", | |
"metadata": {}, | |
"source": [ | |
"Call GPT API to analyze text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "ec56b53a-e7ae-4292-b4aa-7264c86966f9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"os.chdir(folder_path)\n", | |
"df_noaa = df_noaa_w_gpt.copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "3a1c7de1-1cd3-43f6-ae38-e6adb9a6fa8f", | |
"metadata": { | |
"code_folding": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Solely based on the following information, how much oil has been actually released or spilled? Return \"RA_Release = \".Return \"RA_Release = Unknown\" if the spill amount is not able to be found. How much product has been carried or involved? Return \"RA_total = \". Return \"RA_total = Unknown\" if the carry amount is not able to be found.\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# short prompt\n", | |
"Qn_short = \\\n", | |
"'Solely based on the following information, how much oil has been actually released or spilled? \\\n", | |
"Return \\\"RA_Release = \\\".Return \\\"RA_Release = Unknown\\\" if the spill amount is not able to be found.\\\n", | |
" How much product has been carried or involved? Return \\\"RA_total = \\\". \\\n", | |
"Return \\\"RA_total = Unknown\\\" if the carry amount is not able to be found.\\n'\n", | |
"print(Qn_short)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "534157d4-e1ca-47c3-ae17-902866986211", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# gpt model selection\n", | |
"def gpt_model_select(gpt_model_shortname):\n", | |
" if gpt_model_shortname == \"gpt4\":\n", | |
" gpt_model = \"gpt-4-1106-preview\"\n", | |
" gpt_output = 'text_gpt4'\n", | |
" elif gpt_model_shortname == \"gpt3\":\n", | |
" gpt_model = \"gpt-3.5-turbo\"\n", | |
" gpt_output = 'text_gpt35'\n", | |
" return gpt_model,gpt_output\n", | |
"\n", | |
"gpt_model_shortname = \"gpt3\" #\"gpt4\"\n", | |
"gpt_model,gpt_output = gpt_model_select(gpt_model_shortname)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "3f9e43c7-615c-454c-976a-685428ea7348", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def gpt_cost_estimate(gpt_model,n_prompt_tokens,n_completion_tokens,n_total_tokens):\n", | |
" if gpt_model == \"gpt-4-1106-preview\":\n", | |
" costs = (n_prompt_tokens*0.01 + n_completion_tokens*0.03)/1000\n", | |
" elif gpt_model == \"gpt-3.5-turbo\":\n", | |
" costs = (n_total_tokens*0.00200)/1000\n", | |
" return round(costs,2)" | |
] | |
}, | |
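{
"cell_type": "markdown",
"id": "6ab7cd8e-f9a0-4b1c-9d2e-d5e6f7a8b971",
"metadata": {},
"source": [
"`query_gpt` is defined in the \"Call GPT\" section of `func_set_ra_eval.ipynb`. A minimal sketch assuming the OpenAI v1 Python client, consistent with how the returned object is used below (`choices[0].message.content` and the `usage` token counts); the key placeholder is yours to replace:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ab7cd8e-f9a0-4b1c-9d2e-d5e6f7a8b972",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of query_gpt; the real definition (and API_KEY handling) is in func_set_ra_eval.ipynb\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(api_key='API_KEY')  # placeholder: set your own OpenAI API key\n",
"\n",
"def query_gpt(prompt, model='gpt-3.5-turbo'):\n",
"    # Return the full completion object so callers can read both the text and the token usage\n",
"    return client.chat.completions.create(\n",
"        model=model,\n",
"        messages=[{'role': 'user', 'content': prompt}],\n",
"        temperature=0,\n",
"    )"
]
},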
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "9af51a34-9f7b-4b8d-85c6-d5518c7f85ec", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"column text_gpt35_post exists.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 2P for post GPT extraction\n", | |
"n_prompt_tokens = 0\n", | |
"n_completion_tokens = 0\n", | |
"n_total_tokens = 0\n", | |
"n_posts_pcsed = 0\n", | |
"col_gpt_output = gpt_output + '_'+ type_ra_ext\n", | |
"\n", | |
"if col_gpt_output not in df_noaa.columns:\n", | |
" df_noaa[col_gpt_output] = np.nan\n", | |
" print('column',col_gpt_output,\"didn't exist and has been created\")\n", | |
"else:\n", | |
" # df_noaa[col_gpt_output].fillna('Unprocessed',inplace = True)\n", | |
" print('column',col_gpt_output,\"exists.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "a3efd1f2-1e79-4cab-a20f-b7d01d3963ee", | |
"metadata": { | |
"code_folding": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 73037.60it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The number of posts that have been processed is 0\n", | |
"The number of all tokens is 0 The estimated price is $ 0.0\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 4.1 for GPT extraction\n", | |
"# adapted from Version 3.1\n", | |
"\n", | |
"for i in tqdm(range(df_noaa.shape[0])): #\n", | |
" text_input = df_noaa.loc[i, vars['col_oa_text']]\n", | |
" text_output = df_noaa.loc[i, col_gpt_output]\n", | |
" if text_input!='Unknown' and pd.isna(text_output):\n", | |
" des = text_preprocess(df_noaa.loc[i, vars['col_oa_text']]) # col_oa_input, vars['col_text_wt_traj']\n", | |
" QnA = Qn_short + des\n", | |
" chat_completion = query_gpt(QnA, model=gpt_model) # \"gpt-3.5-turbo\"\n", | |
" df_noaa.loc[i, col_gpt_output] = chat_completion.choices[0].message.content.lstrip('\\n')\n", | |
" n_prompt_tokens += chat_completion.usage.prompt_tokens\n", | |
" n_completion_tokens += chat_completion.usage.completion_tokens\n", | |
" n_total_tokens += chat_completion.usage.total_tokens\n", | |
" n_posts_pcsed += 1\n", | |
"gpt_api_cost = gpt_cost_estimate(gpt_model,n_prompt_tokens,n_completion_tokens,n_total_tokens)\n", | |
"print('The number of posts that have been processed is',n_posts_pcsed);\n", | |
"print('The number of all tokens is',n_total_tokens,'The estimated price is $',gpt_api_cost)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "3d6c5c07-6152-4630-9068-e88d16a12c27", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 incidents(posts) need convert text_GPT into RA.\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df_noaa[vars['col_ra_gpt_oa_text']].isna().sum(), 'incidents(posts) need convert text_GPT into RA.')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a3166ac4-67c2-4c8b-af99-9bbbb112a93c", | |
"metadata": {}, | |
"source": [ | |
"Extract LLM-based candidate RA from gpt-generated text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"id": "40081d43-efbb-4f5e-9f8d-89c1714bd7cd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The column 'RA_gpt_post_w_oa' exists in the DataFrame.\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa = ensure_column_exists(df_noaa, vars['col_ra_gpt_oa_text'], np.nan)\n", | |
"# df_noaa = ensure_column_exists(df_noaa, vars['col_oatt_gpt_ext'], np.nan)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "4136cfc8-a7cd-4210-932c-33088ed7a1a6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The RA_GPT column RA_gpt_post_w_oa exists.\n" | |
] | |
} | |
], | |
"source": [ | |
"# if migrated RA_gpt, the column vars['col_ra_gpt'] should exist\n", | |
"if vars['col_ra_gpt_oa_text'] not in df_noaa.columns:\n", | |
" df_noaa[vars['col_ra_gpt_oa_text']] = np.nan\n", | |
" print(\"The RA_GPT column %d didn't exist and has been created with NaN.\" %vars['col_ra_gpt_oa_text'])\n", | |
"else:\n", | |
" # df_noaa[vars['col_ra_gpt']].fillna(-1,inplace = True)\n", | |
" print(\"The RA_GPT column %s exists.\" %vars['col_ra_gpt_oa_text'])\n", | |
"# df_noaa['OAtt_GPT_Ext'] = np.nan" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"id": "e68a26d1-e60b-4260-9075-fd902b97550d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 71940.27it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"for i in tqdm(range(df_noaa.shape[0])): #df_noaa.shape[0]\n", | |
" text_gpt_return = df_noaa.loc[i, col_gpt_output]\n", | |
" ra_gpt_return = df_noaa.loc[i, vars['col_ra_gpt_oa_text']]\n", | |
" if not pd.isna(text_gpt_return) and pd.isna(ra_gpt_return):\n", | |
" text_gpt_output = df_noaa.loc[i, col_gpt_output]\n", | |
" # Extract OAs right after getting GPT results\n", | |
" [RA_GPT_i,OAtt_GPT_i] = get_oil_amount_from_short_prompt(\n", | |
" text_gpt_output, \n", | |
" ra_signal_word = 'RA_Release', \n", | |
" oatt_signal_word = 'RA_total'\n", | |
" )\n", | |
" df_noaa.loc[i, vars['col_ra_gpt_oa_text']] = RA_GPT_i" | |
] | |
}, | |
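{
"cell_type": "markdown",
"id": "7bc8de9f-a0b1-4c2d-8e3f-e6f7a8b9c081",
"metadata": {},
"source": [
"`get_oil_amount_from_short_prompt` pulls the values that follow the two signal words (`RA_Release`, `RA_total`) which the short prompt asks GPT to emit. A minimal sketch; the real helper also normalizes the captured strings into numeric amounts, which is omitted here:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7bc8de9f-a0b1-4c2d-8e3f-e6f7a8b9c082",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of get_oil_amount_from_short_prompt; parsing details are assumptions\n",
"import re\n",
"\n",
"def get_oil_amount_from_short_prompt(text, ra_signal_word='RA_Release', oatt_signal_word='RA_total'):\n",
"    def grab(signal):\n",
"        m = re.search(signal + r'\\s*=\\s*([^\\n]+)', text)\n",
"        return m.group(1).strip() if m else 'Unknown'\n",
"    # the real helper converts these raw strings into numeric amounts\n",
"    return [grab(ra_signal_word), grab(oatt_signal_word)]"
]
},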
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"id": "1a19e12b-e502-4f31-9063-b69196fcadb0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Size of df: 21934 20\n", | |
"Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa', 'text_gpt35_post', 'RA_gpt_post_w_oa']\n", | |
"Memory Usage (Total in MB): 68.81\n", | |
"The refined posts were written into: 5_posts_level_add_llm_ra_1.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"# Generate file name and write in\n", | |
"os.chdir(folder_path)\n", | |
"filename = '5_' + vars['filename_prefix'] + '_level_add_llm_ra_1.xlsx'\n", | |
"df_noaa.to_excel(filename, engine='openpyxl', index=False)\n", | |
"# Generate the summary\n", | |
"df_summary = summarize_large_dataframe(df_noaa)\n", | |
"print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e1bc9c54-a9e0-4b80-af66-02363652a528", | |
"metadata": {}, | |
"source": [ | |
"# Check the concensus of rule-based and LLM-based RA candidates" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "303cc61a-1252-4238-bb02-53d1e6c327f8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'5_posts_level_add_llm_ra_1.xlsx'" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"filename = '5_' + vars['filename_prefix'] + '_level_add_llm_ra_1.xlsx'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "3fa259ff-5950-42de-be7c-1cf29c83f011", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"os.chdir(folder_path)\n", | |
"df_noaa = pd.read_excel(filename)\n", | |
"df_noaa.rename(columns = {'RA_GPT_Ext': vars['col_ra_gpt']},inplace = True)\n", | |
"df_noaa[vars['col_ra_gpt_oa_text']].fillna(-1,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "afd16490-7b54-4b52-a272-dae3811da8c3", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 52423.36it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1335 incidents(posts) have different RA from rule-based and gpt tools.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa[[vars['col_ra_merge'],vars['col_source_ra_merge']]] = [np.nan,'Unknown']\n", | |
"IF_ra_rb_eq_gpt = pd.Series([False] * df_noaa.shape[0]);\n", | |
"for k in tqdm(range(df_noaa.shape[0])):\n", | |
" RA_rb = df_noaa.loc[k, vars['col_ra_rb']]\n", | |
" RA_gpt = df_noaa.loc[k, vars['col_ra_gpt_oa_text']]\n", | |
" if not pd.isna(RA_gpt):\n", | |
" if_ra_rb_eq_gpt = RA_Equality_Compare(RA_rb,RA_gpt,ignore_NoRls = False,ignore_Unk = True)\n", | |
" IF_ra_rb_eq_gpt[k] = if_ra_rb_eq_gpt;\n", | |
"df_noaa.loc[IF_ra_rb_eq_gpt, vars['col_ra_merge']] = df_noaa.loc[IF_ra_rb_eq_gpt, vars['col_ra_rb']]\n", | |
"df_noaa.loc[IF_ra_rb_eq_gpt, vars['col_source_ra_merge']] = 'ra_rb_eq_gpt'\n", | |
"print((~IF_ra_rb_eq_gpt).sum(), 'incidents(posts) have different RA from rule-based and gpt tools.')" | |
] | |
}, | |
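{
"cell_type": "markdown",
"id": "8cd9ef0a-b1c2-4d3e-9f40-f7a8b9c0d191",
"metadata": {},
"source": [
"`RA_Equality_Compare` judges whether the rule-based and GPT candidates agree. In this pipeline negative values appear to encode special cases (e.g. -1 for an unknown amount), so the sketch below is an assumption about how the `ignore_Unk` flag works; the real logic (including `ignore_NoRls`) is in `func_set_ra_eval.ipynb`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cd9ef0a-b1c2-4d3e-9f40-f7a8b9c0d192",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of RA_Equality_Compare; negative codes and flag handling are assumptions\n",
"def RA_Equality_Compare(ra_a, ra_b, ignore_NoRls=False, ignore_Unk=True):\n",
"    # ignore_NoRls would similarly skip 'no release' codes; omitted in this sketch\n",
"    if ignore_Unk and (ra_a < 0 or ra_b < 0):\n",
"        return True  # comparisons involving an unknown amount are not counted as conflicts\n",
"    return ra_a == ra_b"
]
},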
{ | |
"cell_type": "markdown", | |
"id": "ef1c4d4a-37e2-48e1-b8fd-1138aab39838", | |
"metadata": {}, | |
"source": [ | |
"# Manually verify the inconsistent results to get the document-level candidate RA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "b5e9e97f-fa03-4227-b57b-cd8845eb2da7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"column RA_post_Manual doesn't exist\n", | |
"column Source doesn't exist\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"Index(['NPost id', 'post title', 'post date', 'post tags', 'post content',\n", | |
" 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts',\n", | |
" 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release',\n", | |
" 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa',\n", | |
" 'text_gpt35_post', 'RA_gpt_post_w_oa', 'RA_merge_posts_rb_gpt',\n", | |
" 'source_RA_merge_posts'],\n", | |
" dtype='object')" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path)\n", | |
"\n", | |
"# Drop the specified columns from inconsistent_df_noaa because they will be re-created and re-assigned later on\n", | |
"columns_to_drop = [vars['col_vrf_ra'], vars['col_ra_source_of_vrfd_ra']]\n", | |
"for col in columns_to_drop:\n", | |
" try:\n", | |
" df_noaa = df_noaa.drop(columns=col, axis=1)\n", | |
" print('column', col,'has been dropped.')\n", | |
" except KeyError:\n", | |
" print(\"column\",col,\"doesn't exist\")\n", | |
"\n", | |
"# Add id column if needed\n", | |
"if vars['col_id_of_vrfd_ra'] not in df_noaa.columns:\n", | |
" # Step 2: Drop the last two substrings from the unique ID column and store the result in a new column\n", | |
" df_noaa[vars['col_id_of_vrfd_ra']] = df_noaa[vars['col_old_id']].apply(lambda x: \"_\".join(x.split(\"_\")[:2]))\n", | |
" print(\"column of id (%d) doesn't exist and has been created\" %vars['col_id_of_vrfd_ra'])\n", | |
"df_noaa.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "6378b24b-44e7-4fe1-a51b-48d808889b7a", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['RA_des_Manual', 'Source']\n" | |
] | |
} | |
], | |
"source": [ | |
"# create columns for manual verified candidate RA\n", | |
"cols_to_add = [vars['col_vrf_ra'], vars['col_ra_source_of_vrfd_ra']]; print(cols_to_add)\n", | |
"df_noaa[cols_to_add] = np.nan" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"id": "dbe6bdf1-473d-4888-bd46-71c8d98f407a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"verified records are from post where sheets involved are: ['Sheet1']\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 7.84it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of modified rows in df_noaa: 3331\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# add manually verified candidate RAs to df_noaa\n", | |
"filename_manual = 'RA_candidate_' + type_ra_ext + '_manual.xlsx'\n", | |
"os.chdir(folder_path_manual)\n", | |
"df_manual = pd.ExcelFile(filename_manual)\n", | |
"\n", | |
"# List all sheet names and filter those that start with 'S'\n", | |
"sheets_to_read = [sheet_name for sheet_name in df_manual.sheet_names if sheet_name.startswith('S')]\n", | |
"\n", | |
"# sheetnames_mvrfd_ext = df_manual.sheet_names\n", | |
"print(\"verified records are from\",type_ra_ext, \"where sheets involved are:\", sheets_to_read)\n", | |
"\n", | |
"# Initialize a counter for modified rows\n", | |
"modified_rows_count = 0\n", | |
"\n", | |
"for sheet_name in tqdm(sheets_to_read):\n", | |
" df_vrfd_i = df_manual.parse(sheet_name)\n", | |
"\n", | |
" # Remove duplicates in the 'id' column, keeping only the last occurrence\n", | |
" df_vrfd_i = df_vrfd_i.drop_duplicates(subset=vars['col_id_of_vrfd_ra'], keep='last')\n", | |
"\n", | |
" df_vrfd_i.dropna(subset=[vars['col_vrf_ra']], inplace=True)\n", | |
" df_vrfd_i.reset_index(drop=True, inplace=True)\n", | |
" df_vrfd_i_merge = df_vrfd_i[[vars['col_id_of_vrfd_ra']] + cols_to_add]\n", | |
"\n", | |
" # Merge and update df_noaa\n", | |
" before_merge = df_noaa.copy()\n", | |
" df_noaa = pd.merge(df_noaa,\n", | |
" df_vrfd_i_merge,\n", | |
" on=vars['col_id_of_vrfd_ra'],\n", | |
" how='left',\n", | |
" suffixes=('', '_updated')\n", | |
" )\n", | |
"\n", | |
" # Update the values and count modified rows\n", | |
" for col in cols_to_add:\n", | |
" df_noaa[col] = df_noaa[col].combine_first(df_noaa[col + '_updated'])\n", | |
" modified_rows_count += df_noaa[col].notna().sum() - before_merge[col].notna().sum()\n", | |
"\n", | |
" # Drop the temporary '_updated' columns\n", | |
" df_noaa.drop([col + '_updated' for col in cols_to_add], axis=1, inplace=True)\n", | |
"\n", | |
"df_manual.close()\n", | |
"\n", | |
"print(f\"Number of modified rows in df_noaa: {modified_rows_count}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"id": "30a752e5-5bb6-4a83-b685-c9aa8f4fbc0c", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 22647.11it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"All done.\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# merge verified candidate RA with extracted candidate RA\n", | |
"# also note the sources of merged candidate RA\n", | |
"for k in tqdm(range(df_noaa.shape[0])):\n", | |
" source_ra_merge = df_noaa.loc[k, vars['col_source_ra_merge']]\n", | |
" ra_vrfd = df_noaa.loc[k, vars['col_vrf_ra']]\n", | |
" if (source_ra_merge == 'Unknown') and (pd.notna(ra_vrfd)):\n", | |
" df_noaa.loc[k, vars['col_ra_merge']] = ra_vrfd\n", | |
" ra_rb = df_noaa.loc[k, vars['col_ra_rb']]\n", | |
" ra_gpt = df_noaa.loc[k, vars['col_ra_gpt_oa_text']]\n", | |
" # add merged RA source\n", | |
" if (ra_vrfd == ra_rb) or (ra_vrfd<0 and ra_rb<0):\n", | |
" df_noaa.loc[k, vars['col_source_ra_merge']] = 'ra_vrfd_rb'\n", | |
" elif (ra_vrfd == ra_gpt) or (ra_vrfd<0 and ra_gpt<0):\n", | |
" df_noaa.loc[k, vars['col_source_ra_merge']] = 'ra_vrfd_gpt'\n", | |
" else:\n", | |
" df_noaa.loc[k, vars['col_source_ra_merge']] = 'ra_vrfd_other'\n", | |
"\n", | |
"IF_source_is_unk = df_noaa[vars['col_source_ra_merge']] == 'Unknown'\n", | |
"if IF_source_is_unk.sum() ==0:\n", | |
" print('All done.')\n", | |
"else:\n", | |
" print(IF_source_is_unk.sum(), 'incidents or posts still need to be verified')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "6b7a0a5c-8481-4478-9fb5-bfa3cb48bb7c", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"All records has met the criteria or been verified. No verification needed.\n" | |
] | |
} | |
], | |
"source": [ | |
"# check whether all incidents have extracted/verified candidate RAs\n", | |
"# The incidents that need verification (df_unverified) will be copied to clipboard\n", | |
"if IF_source_is_unk.sum() != 0:\n", | |
" df_unverified = df_noaa.loc[IF_source_is_unk,:]\n", | |
" df_unverified.to_clipboard(index = False)\n", | |
" print('The unverified incidents (posts) are written to clipboard.')\n", | |
"else:\n", | |
" print('All records has met the criteria or been verified. No verification needed.')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"id": "7cecb7cc-9667-41b0-bb38-72103896549b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Size of df: 21934 24\n", | |
"Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa', 'text_gpt35_post', 'RA_gpt_post_w_oa', 'RA_merge_posts_rb_gpt', 'source_RA_merge_posts', 'RA_post_Manual', 'Source']\n", | |
"Memory Usage (Total in MB): 71.3\n", | |
"The refined posts were written into: 6_posts_level_ra_candidate_1.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"# Generate file name and write in\n", | |
"os.chdir(folder_path)\n", | |
"filename = '6_' + vars['filename_prefix'] + '_level_ra_candidate_1.xlsx'\n", | |
"df_noaa.to_excel(filename, engine='openpyxl', index=False)\n", | |
"# Generate the summary\n", | |
"df_summary = summarize_large_dataframe(df_noaa)\n", | |
"print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |