Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Extract-oil-spill-amount-from-text/2_RA_identification.ipynb
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
2201 lines (2201 sloc)
77.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "e7a03a88-a812-4ecb-848d-8ebf21826726", | |
"metadata": {}, | |
"source": [ | |
"# Initialization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "b2377955", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import os\n", | |
"from tqdm import tqdm\n", | |
"import warnings" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "20eb5978-9d79-4dff-be37-cf54de14d117", | |
"metadata": {}, | |
"source": [ | |
"Replace the folder path with the absolute address of the folder on your local disk. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "000d86ca-ade2-4817-999f-fb7c6e532fb1", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"RootFile_Name = 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\\\'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a008ab2f-63ef-40dd-a9d7-d850964ae275", | |
"metadata": {}, | |
"source": [ | |
"Change folder, run custom functions, define folder paths and filenames" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "a8c65813-00d5-4bae-b8d7-ef902f0d8d98", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"2024-07-06 19:50:23 INFO: Loading these models for language: en (English):\n", | |
"========================\n", | |
"| Processor | Package |\n", | |
"------------------------\n", | |
"| tokenize | combined |\n", | |
"| pos | combined |\n", | |
"| lemma | combined |\n", | |
"========================\n", | |
"\n", | |
"2024-07-06 19:50:23 INFO: Use device: cpu\n", | |
"2024-07-06 19:50:23 INFO: Loading: tokenize\n", | |
"2024-07-06 19:50:23 INFO: Loading: pos\n", | |
"2024-07-06 19:50:23 INFO: Loading: lemma\n", | |
"2024-07-06 19:50:23 INFO: Done loading processors!\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(RootFile_Name)\n", | |
"\n", | |
"# Read custom functions used in the RA candidate extraction\n", | |
"%run func_set_general.ipynb\n", | |
"%run func_set_ra_eval.ipynb\n", | |
"\n", | |
"# Better to separately create folders for both descriptions and posts.\n", | |
"# use the created folder path to replace the following one\n", | |
"folder_path_description = RootFile_Name +'description\\\\'\n", | |
"folder_path_posts = RootFile_Name +'posts\\\\'\n", | |
"folder_path_manual = RootFile_Name+'manual\\\\'\n", | |
"folder_path_ip = RootFile_Name+'incident_posts\\\\'\n", | |
"check_and_create_directory(folder_path_ip)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "dbde5e3e-7bb4-497c-ac94-119cda56c0f5", | |
"metadata": {}, | |
"source": [ | |
"Here you need to clarify: \n", | |
"(1) the filenames of the RA candidates extracted from descriptions and posts (the last outputs of running \"1_RA_extraction.ipynb\")\n", | |
"(2) the filename of the manual verified incident (in the folder \"manual\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "1c506aed-49ef-4046-914b-09a6926cd4ef", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename_icd = '6_incident_level_ra_candidate_1.xlsx'\n", | |
"filename_post = '6_posts_level_ra_candidate_1.xlsx'\n", | |
"filename_manual = 'final_RA_manual.xlsx'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "a57cffb8-97fe-4c57-81e6-e5227ae7aa81", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# define the strings for naming the output files\n", | |
"tt_ip = 'incidents_posts'\n", | |
"version_ip = '1'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a92f82a7-f379-483e-8cfa-6a5b9e81b9f6", | |
"metadata": {}, | |
"source": [ | |
"# Group incidents by their RA candidates quantity and sources" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "ad8cf36f-6467-4629-9f20-afb2203ba37e", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# read RA candidates data of description and post data \n", | |
"os.chdir(folder_path_description)\n", | |
"df_icds = pd.read_excel(filename_icd)\n", | |
"\n", | |
"os.chdir(folder_path_posts)\n", | |
"try:\n", | |
" df_posts = pd.read_excel(filename_post)\n", | |
"except ValueError:\n", | |
" df_posts = pd.read_csv(filename_post)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "556efd01-6f83-4a45-830a-e06bef1e9c53", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"df_icds: Index(['id', 'open_date', 'name', 'location', 'lat', 'lon', 'threat', 'tags',\n", | |
" 'commodity', 'measure_skim', 'measure_shore', 'measure_bio',\n", | |
" 'measure_disperse', 'measure_burn', 'max_ptl_release_gallons', 'posts',\n", | |
" 'description', 'trj_prd', 'des_wt_traj', 'Times_larger_plt', 'Rls_doc',\n", | |
" 'No_release', 'CA_rb_des_wt_traj', 'RA_rb_des_wt_traj', 'des_w_oa',\n", | |
" 'text_gpt35_description', 'RA_gpt_des_w_oa', 'RA_merge_icd_rb_gpt',\n", | |
" 'source_RA_merge_icd', 'RA_des_Manual', 'Source'],\n", | |
" dtype='object')\n", | |
"df_posts: Index(['NPost id', 'post title', 'post date', 'post tags', 'post content',\n", | |
" 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts',\n", | |
" 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release',\n", | |
" 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa',\n", | |
" 'text_gpt35_post', 'RA_gpt_post_w_oa', 'RA_merge_posts_rb_gpt',\n", | |
" 'source_RA_merge_posts', 'RA_post_Manual', 'Source'],\n", | |
" dtype='object')\n" | |
] | |
} | |
], | |
"source": [ | |
"print('df_icds:',df_icds.columns)\n", | |
"print('df_posts:',df_posts.columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "5bc9d6b5-a9cd-46dc-9d2a-561dc31b7515", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename_parts = [tt_ip,version_ip, \"A\",\"01\", 'groupby_ra',\"xlsx\"]\n", | |
"filename_ip_grpby = generate_filename(filename_parts)\n", | |
"\n", | |
"filename_parts = [tt_ip,version_ip, \"D\",\"01\", 'accept',\"xlsx\"]\n", | |
"filename_ip_acpt = generate_filename(filename_parts)\n", | |
"\n", | |
"filename_parts = [tt_ip,version_ip, \"B\",\"01\", 'NeedVerify',\"xlsx\"]\n", | |
"filename_ip_nd_vrfy = generate_filename(filename_parts)\n", | |
"\n", | |
"filename_parts = [tt_ip,version_ip, \"C\",\"01\", 'Unverify',\"xlsx\"]\n", | |
"filename_ip_unvrfy = generate_filename(filename_parts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "6ee95a49-8f19-4aa3-aff4-0ace656df013", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def drop_columns_from_df(df, columns):\n", | |
" \"\"\"\n", | |
" Drops specified columns from a DataFrame and prints which columns were removed \n", | |
" and which were not found.\n", | |
"\n", | |
" :param df: Pandas DataFrame from which to drop columns\n", | |
" :param columns: List of column names to drop\n", | |
" \"\"\"\n", | |
" columns_found = [col for col in columns if col in df.columns]\n", | |
" columns_not_found = [col for col in columns if col not in df.columns]\n", | |
"\n", | |
" # Drop the columns that are found\n", | |
" df.drop(columns=columns_found, inplace=True)\n", | |
"\n", | |
" # Print the results\n", | |
" if columns_found:\n", | |
" print(f\"Removed columns: {', '.join(columns_found)}\")\n", | |
" if columns_not_found:\n", | |
" print(f\"Columns not found: {', '.join(columns_not_found)}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "59b6d5aa-c70a-43b5-a641-6f471b708ecd", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def find_common_columns(df1, df2):\n", | |
" \"\"\"\n", | |
" Find common column names between two DataFrames.\n", | |
"\n", | |
" :param df1: First DataFrame\n", | |
" :param df2: Second DataFrame\n", | |
" :return: List of common column names\n", | |
" \"\"\"\n", | |
" common_columns = set(df1.columns) & set(df2.columns)\n", | |
" return list(common_columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "f7e219a7-1079-4764-9113-210e872c57ee", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Removed columns: Source\n", | |
"Columns not found: searched_sub, risk_sub, Multi_sub, OA_des\n", | |
"Removed columns: Source, ptl_max_RA, Rls_doc, No_release\n", | |
"Columns not found: OA_des\n", | |
"Common columns: []\n" | |
] | |
} | |
], | |
"source": [ | |
"# drop and rename some columns\n", | |
"drop_columns_from_df(df_icds, ['searched_sub', 'risk_sub', 'Multi_sub', 'Source', 'OA_des'])\n", | |
"df_icds.rename(columns={'Times_larger_plt': 'times_larger_plt'},inplace = True)\n", | |
"\n", | |
"drop_columns_from_df(df_posts, ['Source','ptl_max_RA','Rls_doc', 'No_release','OA_des'])\n", | |
"df_posts.rename(columns={'Times_larger_plt': 'p_times_larger_plt',\n", | |
" 'trj_prd': 'p_trj_prd',\n", | |
" 'OA_des': 'OA_post'},inplace = True)\n", | |
"\n", | |
"common_cols = find_common_columns(df_icds, df_posts)\n", | |
"print(\"Common columns:\", common_cols)\n", | |
"cols_df_noaa = df_icds.columns.to_list() + df_posts.columns.to_list()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "263628be-52d4-4b22-9e3f-c8a67fc91897", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The combined df has 4886 rows (multi_posts incidents included)\n", | |
"Index(['id', 'open_date', 'name', 'location', 'lat', 'lon', 'threat', 'tags',\n", | |
" 'commodity', 'measure_skim', 'measure_shore', 'measure_bio',\n", | |
" 'measure_disperse', 'measure_burn', 'max_ptl_release_gallons', 'posts',\n", | |
" 'description', 'trj_prd', 'des_wt_traj', 'times_larger_plt', 'Rls_doc',\n", | |
" 'No_release', 'CA_rb_des_wt_traj', 'RA_rb_des_wt_traj', 'des_w_oa',\n", | |
" 'text_gpt35_description', 'RA_gpt_des_w_oa', 'RA_merge_icd_rb_gpt',\n", | |
" 'source_RA_merge_icd', 'RA_des_Manual', 'NPost id', 'post title',\n", | |
" 'post date', 'post tags', 'post content', 'attachment availability',\n", | |
" 'noaa id', 'post id', 'old_npost_id_2parts', 'p_trj_prd',\n", | |
" 'post_wt_traj', 'p_times_larger_plt', 'CA_rb_post_wt_traj',\n", | |
" 'RA_rb_post_wt_traj', 'post_w_oa', 'text_gpt35_post',\n", | |
" 'RA_gpt_post_w_oa', 'RA_merge_posts_rb_gpt', 'source_RA_merge_posts',\n", | |
" 'RA_post_Manual'],\n", | |
" dtype='object')\n" | |
] | |
} | |
], | |
"source": [ | |
"# Merge incidents and their associated posts\n", | |
"df_posts_w_ra = df_posts.loc[df_posts['RA_merge_posts_rb_gpt']>-1,:].reset_index(drop=True)\n", | |
"df_ip = df_icds.merge(df_posts_w_ra, \n", | |
" how='left',\n", | |
" left_on='id', \n", | |
" right_on='noaa id',\n", | |
" suffixes=('_des', '_post'))\n", | |
"print('The combined df has',df_ip.shape[0],'rows (multi_posts incidents included)')\n", | |
"print(df_ip.columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "4c11c78d-3b7f-4c23-a690-202c22bad14e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|█████████████████████████████████████████████████████████████████████████████| 3550/3550 [00:28<00:00, 123.58it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"# categorize df into three groups\n", | |
"df_multiple_w_des = pd.DataFrame(columns = df_ip.columns)\n", | |
"df_multiple_wt_des = pd.DataFrame(columns = df_ip.columns)\n", | |
"df_one = pd.DataFrame(columns = df_ip.columns)\n", | |
"df_zero = pd.DataFrame(columns = df_ip.columns)\n", | |
"\n", | |
"# group incident by its ra_icd and ra_post availability\n", | |
"for k in tqdm(range(df_icds.shape[0])):\n", | |
" id_icd = df_icds.loc[k,'id']\n", | |
" df_ip_icd = df_ip.loc[df_ip['id'] == id_icd,:].reset_index(drop=True)\n", | |
" ra_icd = df_ip_icd.loc[0,col_RA_icd_Comb_Srch_GPT]\n", | |
" if df_ip_icd.shape[0] == 1: # 0+0 0+1 1+0 1+1\n", | |
" ra_post = df_ip_icd.loc[0,col_RA_post_Comb_Srch_GPT]\n", | |
" if ra_icd > -1 and pd.notna(ra_post): # 1+1\n", | |
" df_multiple_w_des = pd.concat([df_multiple_w_des,df_ip_icd],ignore_index=True)\n", | |
" elif ra_icd > -1 and pd.isna(ra_post): # 1+0\n", | |
" df_one = pd.concat([df_one,df_ip_icd],ignore_index=True)\n", | |
" elif ra_icd <= -1 and pd.notna(ra_post): # 0+1\n", | |
" df_one = pd.concat([df_one,df_ip_icd],ignore_index=True)\n", | |
" elif ra_icd <= -1 and pd.isna(ra_post): # 0+0\n", | |
" df_zero = pd.concat([df_zero,df_ip_icd],ignore_index=True)\n", | |
" else: # 1+n 0+n\n", | |
" if ra_icd > -1: # 1+n\n", | |
" df_multiple_w_des = pd.concat([df_multiple_w_des,df_ip_icd],ignore_index=True)\n", | |
" elif ra_icd <= -1: # 0+n\n", | |
" df_multiple_wt_des = pd.concat([df_multiple_wt_des,df_ip_icd],ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "70c7e3cf-06cf-46f7-874b-30240a72a904", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The post-combined incidents are written to incidents_posts_1_A_01_groupby_ra.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"# Assume your DataFrames are named accordingly:\n", | |
"# df_one_n, df_one_one, df_one_zero, df_zero_n, df_zero_one, df_zero_zero\n", | |
"# df_multiple_w_des,df_multiple_wt_des,df_one,df_zero\n", | |
"# Suppressing the 'Ignoring URL' warning.\n", | |
"# with warnings.catch_warnings():\n", | |
"# warnings.simplefilter(\"ignore\", category=UserWarning)\n", | |
"print('The post-combined incidents are written to',filename_ip_grpby)\n", | |
"\n", | |
"# Create a Pandas Excel writer using the XlsxWriter as the engine.\n", | |
"with pd.ExcelWriter(filename_ip_grpby, engine='openpyxl') as writer:\n", | |
" # Write each DataFrame to a different worksheet.\n", | |
" df_multiple_w_des.to_excel(writer, sheet_name='multi_des',index = False)\n", | |
" df_multiple_wt_des.to_excel(writer, sheet_name='multi_no_des',index = False)\n", | |
" df_one.to_excel(writer, sheet_name='one',index = False)\n", | |
" df_zero.to_excel(writer, sheet_name='zero',index = False)\n", | |
" # No need to manually save as the with context will handle it." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c4d25f6e-af31-4e8a-bf35-c1843575d3f7", | |
"metadata": {}, | |
"source": [ | |
"So far, we've grouped incident-posts by their RA availabilities." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c2466eff-0b5e-4f50-88cf-cf90d059583e", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"# Determine final RA by group" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "20db7eff-8ac0-4785-a598-2d503bd804c4", | |
"metadata": { | |
"tags": [] | |
}, | |
"source": [ | |
"define initial conditions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "c3277584-e3f3-40e4-8b48-94d2de85ad0c", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"if_write_acpt = True\n", | |
"if_write_nd_vrfy = True\n", | |
"if_write_unvrfd = True" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "53437bab-7940-4bcd-866c-9f10c78f6b5b", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"def write_df_to_excel(df_to_write, file_path, sheet_name_to_add):\n", | |
" # Check if the Excel file already exists\n", | |
" if not os.path.exists(file_path):\n", | |
" print(\"Excel file did't exist and has been created.\") \n", | |
" # File does not exist, create a new Excel file and write the DataFrame\n", | |
" with pd.ExcelWriter(file_path, engine='openpyxl') as writer:\n", | |
" df_to_write.to_excel(writer, sheet_name=sheet_name_to_add, index=False)\n", | |
" else:\n", | |
" print('Excel file exists.')\n", | |
" # File exists, add a new sheet to the existing Excel file\n", | |
" # Ensure openpyxl is installed: pip install openpyxl\n", | |
" with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:\n", | |
" df_to_write.to_excel(writer, sheet_name=sheet_name_to_add, index=False)\n", | |
"\n", | |
" print(f\"DataFrame is written to {file_path} into sheet {sheet_name_to_add}.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "f27d166d-2e0f-4e7f-92c3-530bea3e5ff3", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# Label icds who have already been verified Version 4 - Generated by ChatGPT based on Version 3\n", | |
"# Users can specify sheets for id extraction\n", | |
"def extract_verified_ids(path_vrfd_file, excel_file_name, col_verify, sheet_names=None):\n", | |
" \"\"\"\n", | |
" Extract verified IDs from a specified Excel file.\n", | |
"\n", | |
" :param root_file_name: Root folder name containing Excel file\n", | |
" :param excel_file_name: Excel file name to extract IDs from\n", | |
" :param col_verify: Column name to check for verification status\n", | |
" :param sheet_names: Optional list of sheet names to process. If None, the algorithm determines the sheets.\n", | |
" :return: List of verified IDs\n", | |
" \"\"\"\n", | |
" # select_verified_excel_sheets(path_vrfd_file, excel_file_name, sheet_names=None)\n", | |
"\n", | |
" verified_data_excel_file = pd.ExcelFile(path_vrfd_file + excel_file_name)\n", | |
" sample_sheet_names = [sheet_name for sheet_name in verified_data_excel_file.sheet_names if sheet_name.startswith('S')]\n", | |
" \n", | |
" verified_ids = list()\n", | |
" for sheet_name in sample_sheet_names:\n", | |
" df_sample = verified_data_excel_file.parse(sheet_name) # Read a specific sheet to DataFrame\n", | |
" verified_rows = df_sample[col_verify].notnull()\n", | |
" verified_ids.extend(df_sample.loc[verified_rows, 'id'].tolist())\n", | |
"\n", | |
" verified_data_excel_file.close()\n", | |
" return verified_ids" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "a0321060-4331-49d0-9c58-b16ed372ab0e", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# Compare incidents in df_noaa with verified incidents\n", | |
"# Check if which incidents in df_noaa need manual verification\n", | |
"\n", | |
"def identify_unvrfd_icds(verified_ids,df_noaa,col_for_vrfy):\n", | |
" df_noaa_unvrfy = pd.DataFrame();\n", | |
" n_nd_vrf = 0; n_vrfd_icds = 0; n_acptd = 0; \n", | |
" for i in np.unique(df_noaa['id'].to_numpy()):\n", | |
" df_icd_post_i = df_noaa.loc[df_noaa['id']==i,:];\n", | |
" L_icds_nd_vrf = ((df_icd_post_i[col_for_vrfy]<0).all());\n", | |
" if L_icds_nd_vrf:\n", | |
" n_nd_vrf += 1;\n", | |
" if i in verified_ids:\n", | |
" n_vrfd_icds += 1;\n", | |
" else:\n", | |
" df_noaa_unvrfy = pd.concat([df_noaa_unvrfy,df_icd_post_i]);\n", | |
" else:\n", | |
" n_acptd += 1;\n", | |
"\n", | |
" df_noaa_unvrfy.reset_index(drop = True,inplace = True);\n", | |
" print('%d of incidents are accepted.\\n%d of incidents have been verified.\\n%d more incidents need to be manual verified'\\\n", | |
" %(n_acptd,n_vrfd_icds,n_nd_vrf-n_vrfd_icds))\n", | |
" return df_noaa_unvrfy" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "09dcb8ad-d352-4b56-a1e0-f31546ae1942", | |
"metadata": {}, | |
"source": [ | |
"## Case1: Multiple candidates, from both description and posts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "21163f42-e206-4f28-b086-d3d9f9265ba8", | |
"metadata": {}, | |
"source": [ | |
"This section needs to be ran twice for 'one_one' and 'one_n' respectively" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"id": "5a7dfc55-ab57-453e-bea5-f27531103e06", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# icd_category_name = 'one_one'\n", | |
"icd_category_name = 'multi_des'\n", | |
"accept_thrshd = 0.3; extreme_thrshd = 5;" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"id": "fdcfc9b4-0520-48a8-a666-27c14a5a531e", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Sheet multi_des has been read in\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"df_noaa = pd.read_excel(\n", | |
" filename_ip_grpby,\n", | |
" sheet_name = icd_category_name,\n", | |
" parse_dates=['open_date', 'post date'])\n", | |
"print('Sheet',icd_category_name, 'has been read in')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ccdfbee2-245d-483b-a572-527067c5c13b", | |
"metadata": {}, | |
"source": [ | |
"### Compare Equality between RA_des and RA_post (by icds)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"id": "0f1187da-243f-4cc9-a791-7e4b0a701500", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|████████████████████████████████████████████████████████████████████████████| 1651/1651 [00:00<00:00, 5536.14it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa['Equal_ra_des_post'] = False\n", | |
"for i in tqdm(range(df_noaa.shape[0])):\n", | |
" L_Equal_RAs_des_post \\\n", | |
" = RA_Equality_Compare(\\\n", | |
" df_noaa.loc[i,col_RA_icd_Comb_Srch_GPT],\n", | |
" df_noaa.loc[i,col_RA_post_Comb_Srch_GPT],\n", | |
" ignore_NoRls = False,\n", | |
" ignore_Unk = True)\n", | |
" df_noaa.loc[i, 'Equal_RAs_des_post'] = L_Equal_RAs_des_post" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"id": "5441a36d-55fc-4029-a5b4-9b32f2e6a90a", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|████████████████████████████████████████████████████████████████████████████| 1651/1651 [00:00<00:00, 4998.69it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa['Spcf_des_post'] = -1\n", | |
"for i in tqdm(range(df_noaa.shape[0])):\n", | |
" Ratio_2val = Ratio_values(val_nmt = df_noaa.loc[i,col_RA_icd_Comb_Srch_GPT],\\\n", | |
" val_dnmt = df_noaa.loc[i,col_RA_post_Comb_Srch_GPT],\\\n", | |
" Eq_2val = df_noaa.loc[i,'Equal_RAs_des_post'])\n", | |
"\n", | |
" df_noaa.loc[i, 'Spcf_des_post'] = Ratio_2val" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6120bdb1-af1d-46ff-8be4-da4290ec2712", | |
"metadata": {}, | |
"source": [ | |
"### Check if RA differences of posts-combined incidents are accepatable and label them" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 115, | |
"id": "941a4def-46a1-4241-b811-bc67901e2160", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|███████████████████████████████████████████████████████████████████████████████| 668/668 [00:03<00:00, 187.47it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The number of incidents who have extremely difference RAs is 137\n", | |
"The number of incidents whose RAs_Diff are acceptable is 514\n", | |
"The number of incidents whose RAs_Diff are unacceptable (Large_diff + Extrm_diff) is 154\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 3\n", | |
"# simplify combine process, don't use post ranking anymore\n", | |
"df_noaa['Diff_RAs_Accept'] = False;\n", | |
"L_Diff_RAs_accept = (df_noaa['Spcf_des_post'] >= 1-accept_thrshd)&(df_noaa['Spcf_des_post'] <= 1+accept_thrshd);\n", | |
"L_Extrm_diff = (df_noaa['Spcf_des_post'] > extreme_thrshd) |(df_noaa['Spcf_des_post'] < (1/extreme_thrshd));\n", | |
"df_noaa_diff = pd.DataFrame();\n", | |
"n_RAs_Diff_Accept = 0; \n", | |
"n_RAs_Extrm_diff = 0;\n", | |
"list_id_NOAA = df_noaa['id'].unique();\n", | |
"for i in tqdm(range(len(list_id_NOAA))):\n", | |
" id_NOAA_i = list_id_NOAA[i];\n", | |
" L_icd_i = (df_noaa['id'] == id_NOAA_i);\n", | |
" L_RAs_diff_acpt_icd_i = L_icd_i & L_Diff_RAs_accept;\n", | |
" L_RAs_extrm_diff_icd_i = L_icd_i & L_Extrm_diff;\n", | |
" L_acpt_icd_i = False;\n", | |
" df_ips_i = df_noaa.loc[L_icd_i,:].copy()\n", | |
" idx_icd_i = df_ips_i.index\n", | |
" for j in range(df_ips_i.shape[0]):\n", | |
" idx_j = idx_icd_i[j]\n", | |
" if L_RAs_extrm_diff_icd_i.sum() > 0:\n", | |
" df_ips_i.loc[:,'Diff_RAs_Accept'] = 'Extrm_diff_icd_post'\n", | |
" n_RAs_Extrm_diff += 1\n", | |
" break\n", | |
" elif (L_RAs_diff_acpt_icd_i[idx_j] == True):\n", | |
" df_ips_i.loc[idx_j,'Diff_RAs_Accept'] = 'Accept:Diff_icd_post'\n", | |
" if L_acpt_icd_i == False:\n", | |
" L_acpt_icd_i = True\n", | |
" n_RAs_Diff_Accept += 1\n", | |
" else:\n", | |
" df_ips_i.loc[idx_j,'Diff_RAs_Accept'] = 'Large_diff_icd_post'\n", | |
" df_noaa_diff = pd.concat([df_noaa_diff,df_ips_i]);\n", | |
"df_noaa = df_noaa_diff.sort_index();\n", | |
"print('The number of incidents who have extremely difference RAs is',n_RAs_Extrm_diff);\n", | |
"print('The number of incidents whose RAs_Diff are acceptable is',n_RAs_Diff_Accept);\n", | |
"print('The number of incidents whose RAs_Diff are unacceptable (Large_diff + Extrm_diff) is',len(list_id_NOAA)-n_RAs_Diff_Accept);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "da29561a-18a4-412a-86d3-4cea555fc097", | |
"metadata": {}, | |
"source": [ | |
"### Separate incidents and select the RA_Comb_des_post and label their sources" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"id": "d4e42344-c9a4-49e2-94c7-01d7806a0d50", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|████████████████████████████████████████████████████████████████████████████████| 668/668 [00:09<00:00, 71.55it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2 RAs are included in descriptions\n", | |
"61 RAs are included in posts\n", | |
"451 RAs are included in both descriptions and posts\n", | |
"The number of incident whose final release amounts need manual verification is 154\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 3:\n", | |
"df_noaa[col_RA_Comb_des_post] = -1\n", | |
"df_noaa[col_source_RA_comb] = 'Unknown'\n", | |
"df_noaa[col_category_icds] = icd_category_name\n", | |
"\n", | |
"list_id_NOAA = df_noaa['id'].unique()\n", | |
"df_ip_acpt = pd.DataFrame(columns = df_noaa.columns)\n", | |
"df_ips_w_ra = pd.DataFrame(columns = df_noaa.columns)\n", | |
"df_ip_nd_vrfy = pd.DataFrame(columns = df_noaa.columns)\n", | |
"n_RAs_from_icd = 0; n_RAs_from_post = 0; n_RAs_from_dp = 0;\n", | |
"\n", | |
"for i in tqdm(range(len(list_id_NOAA))):\n", | |
" id_NOAA_i = list_id_NOAA[i]\n", | |
" L_icd_i = df_noaa['id'] == id_NOAA_i\n", | |
" df_ips_i = df_noaa.loc[L_icd_i,:].copy().reset_index(drop = True)\n", | |
" df_ips_i.sort_values(by = 'post date',ascending = False,ignore_index=True, inplace = True)\n", | |
" L_contain_RA_accept = df_ips_i['Diff_RAs_Accept'].str.startswith('Accept:')\n", | |
" if L_contain_RA_accept.any():\n", | |
" # set the index of the first \"True\" value in the L_contain_RA_accept\n", | |
" l_1stTrue = L_contain_RA_accept.idxmax()\n", | |
" df_ips_acptd = df_ips_i.loc[l_1stTrue,:].reset_index(drop = True)\n", | |
" dt_icd = df_ips_i.loc[l_1stTrue,'post date']\n", | |
" dt_post = df_ips_i.loc[l_1stTrue,'open_date']\n", | |
" # compare the date of description and post\n", | |
" if dt_icd > dt_post:\n", | |
" # incident ra and its source\n", | |
" ra_icd = df_ips_i.loc[l_1stTrue,col_RA_icd_Comb_Srch_GPT]\n", | |
" sr_ra_icd = df_ips_i.loc[l_1stTrue,col_source_RA_des]\n", | |
" # (latest) post ra and its source\n", | |
" ra_post = df_ips_i.loc[l_1stTrue,col_RA_post_Comb_Srch_GPT]\n", | |
" sr_ra_post = df_ips_i.loc[l_1stTrue,col_source_RA_post]\n", | |
" df_ips_i.loc[:,col_RA_Comb_des_post] = ra_post\n", | |
" if ra_icd == ra_post:\n", | |
" df_ips_i.loc[l_1stTrue,col_source_RA_comb] = 'i&p: ' + sr_ra_icd + '&' + sr_ra_post\n", | |
" n_RAs_from_dp += 1\n", | |
" else:\n", | |
" df_ips_i.loc[l_1stTrue,col_source_RA_comb] = 'p: '+ sr_ra_post\n", | |
" n_RAs_from_post += 1\n", | |
" else:\n", | |
" df_ips_i.loc[l_1stTrue,col_RA_Comb_des_post] = ra_icd\n", | |
" df_ips_i.loc[l_1stTrue,col_source_RA_comb] = 'd: '+ sr_ra_icd\n", | |
" n_RAs_from_icd += 1\n", | |
" df_ips_w_ra = pd.concat([df_ips_w_ra,df_ips_i], ignore_index=True)\n", | |
" df_ip_acpt = df_ip_acpt.append(df_ips_i.loc[l_1stTrue,:], ignore_index=True)\n", | |
" # Take the df of each incidents whose ra difference is acceptable\n", | |
" else:\n", | |
" df_ip_nd_vrfy = pd.concat([df_ip_nd_vrfy,df_ips_i], ignore_index=True)\n", | |
"# df_noaa = df_noaa_RA.sort_index()\n", | |
"print('%d RAs are included in descriptions' % n_RAs_from_icd)\n", | |
"print('%d RAs are included in posts' % n_RAs_from_post)\n", | |
"print('%d RAs are included in both descriptions and posts' % n_RAs_from_dp)\n", | |
"n_nd_vrf = len(list_id_NOAA)-n_RAs_from_icd-n_RAs_from_post-n_RAs_from_dp\n", | |
"print('The number of incident whose final release amounts need manual verification is',n_nd_vrf);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b6af66c9-bb79-4f82-acec-9797fe56a09d", | |
"metadata": {}, | |
"source": [ | |
"### Check if the incidents are manually verified" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"id": "e6ce57dc-fdfd-489b-b061-d5e2d966884e", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 of incidents are accepted.\n", | |
"154 of incidents have been verified.\n", | |
"0 more incidents need to be manual verified\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 3\n", | |
"# S5,6 and all F sheets are considered\n", | |
"verified_ids = extract_verified_ids(folder_path_manual, filename_manual, col_vrf_RAs_des_post)\n", | |
"# Extract unverified incidents by comparing incidents in df_noaa with verified incidents\n", | |
"df_noaa_unvrfy = identify_unvrfd_icds(verified_ids,df_ip_nd_vrfy,col_RA_Comb_des_post)\n", | |
"n_icd_unvrfd += df_noaa_unvrfy.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"id": "45424638-0fa3-4cdb-893b-e9caa6b5f2ae", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"incidents_posts_1_D_01_accept.xlsx\n", | |
"Excel file did't exist and has been created.\n", | |
"DataFrame is written to incidents_posts_1_D_01_accept.xlsx into sheet multi_des.\n", | |
"incidents_posts_1_B_01_NeedVerify.xlsx\n", | |
"Excel file did't exist and has been created.\n", | |
"DataFrame is written to incidents_posts_1_B_01_NeedVerify.xlsx into sheet multi_des.\n", | |
"Incidents are all verified.No need for incidents_posts_1_C_01_Unverify.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"\n", | |
"if if_write_acpt:\n", | |
" print(filename_ip_acpt)\n", | |
" write_df_to_excel(df_to_write=df_ip_acpt,\n", | |
" file_path=filename_ip_acpt,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"\n", | |
"if if_write_nd_vrfy:\n", | |
" print(filename_ip_nd_vrfy)\n", | |
" write_df_to_excel(df_to_write=df_ip_nd_vrfy,\n", | |
" file_path=filename_ip_nd_vrfy,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"\n", | |
"if if_write_unvrfd and df_noaa_unvrfy.shape[0] > 0:\n", | |
" print('Unverified incidents exist.')\n", | |
" print(filename_ip_unvrfy)\n", | |
" write_df_to_excel(df_to_write=df_noaa_unvrfy,\n", | |
" file_path=filename_ip_unvrfy,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"else:\n", | |
" print('Incidents are all verified.No need for',filename_ip_unvrfy)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a7565bb1-05b1-4b5d-b207-f57a43ab35ea", | |
"metadata": {}, | |
"source": [ | |
"## Case 2: Multiple candidates, only from posts " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 149, | |
"id": "cefe836b-c473-49de-b51d-44bb7a7e2d23", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Sheet multi_no_des has been read in.\n" | |
] | |
} | |
], | |
"source": [ | |
"icd_category_name = 'multi_no_des'\n", | |
"\n", | |
"os.chdir(folder_path_ip)\n", | |
"\n", | |
"df_noaa = pd.read_excel(\n", | |
" filename_ip_grpby,\n", | |
" sheet_name = icd_category_name,\n", | |
" parse_dates=['open_date', 'post date'])\n", | |
"print('Sheet',icd_category_name,'has been read in.')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "264986d1-452f-41c8-a36e-8ba46d98fdb4", | |
"metadata": {}, | |
"source": [ | |
"### Check if Multi_RAs_post of incidents have acceptable differences and label it. If yes, take the RA_post with highest process orders as RA of the incident" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 150, | |
"id": "692a4ffa-d1b8-4b0f-aac1-57cc4aac9bf7", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|███████████████████████████████████████████████████████████████████████████████| 141/141 [00:00<00:00, 190.08it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"56 RAs from posts have extremely large differences\n", | |
"16 RAs from posts have large differences\n", | |
"69 RAs from posts are accepted\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"accept_thrshd = 1; extreme_thrshd = 5;\n", | |
"\n", | |
"# df_noaa['Equal_RAs_posts'] = False;\n", | |
"df_noaa[col_RA_Comb_des_post] = -1\n", | |
"df_noaa[col_source_RA_comb] = 'Unknown'\n", | |
"df_noaa['Diff_RAs_Accept'] = False\n", | |
"df_noaa[col_category_icds] = icd_category_name\n", | |
"\n", | |
"id_df = df_noaa['id'].unique();\n", | |
"df_noaa_add_Eq_Diff = pd.DataFrame(columns = df_noaa.columns)\n", | |
"n_extm_large = 0; n_diff_large = 0; n_post_acpt = 0;\n", | |
"\n", | |
"for i in tqdm(range(len(id_df))):\n", | |
" id_i = id_df[i];\n", | |
" df_ip_i = df_noaa.loc[df_noaa['id']==id_i,:].reset_index(drop = True)\n", | |
" df_ip_i.sort_values(by = 'post date',ascending = False,ignore_index=True,inplace = True)\n", | |
" L_RAs_post_Acpt = True; L_RAs_Extrm_Diff = False; #L_RAs_large_Diff_mn = False;\n", | |
" ra_post_max = df_ip_i[col_RA_post_Comb_Srch_GPT].max()\n", | |
" ra_post_min = df_ip_i[col_RA_post_Comb_Srch_GPT].min()\n", | |
" L_Eq_RAs_posts_mn= RA_Equality_Compare(ra_post_max,\n", | |
" ra_post_min,\n", | |
" ignore_NoRls = False,\n", | |
" ignore_Unk = True)\n", | |
"\n", | |
" Ratio_2val = Ratio_values(val_nmt = ra_post_max,\n", | |
" val_dnmt = ra_post_min,\n", | |
" Eq_2val = L_Eq_RAs_posts_mn)\n", | |
" if (Ratio_2val < 1/extreme_thrshd) | (Ratio_2val > extreme_thrshd):\n", | |
" L_RAs_Extrm_Diff = True\n", | |
" elif (Ratio_2val < 1-accept_thrshd) | (Ratio_2val > 1+accept_thrshd):\n", | |
" L_RAs_post_Acpt = False\n", | |
"\n", | |
" if L_RAs_Extrm_Diff == True:\n", | |
" df_ip_i.loc[:,'Diff_RAs_Accept'] = 'Extrm_diff_posts'\n", | |
" # df_ip_nd_vrfy = pd.concat([df_ip_nd_vrfy,df_ip_i], ignore_index=True)\n", | |
" n_extm_large += 1\n", | |
" elif L_RAs_post_Acpt == True:\n", | |
" df_ip_i.loc[:,'Diff_RAs_Accept'] = 'Others_Accepted'\n", | |
" df_ip_i.loc[0,'Diff_RAs_Accept'] = 'Accept:Diff_posts'\n", | |
" df_ip_i.loc[0,col_RA_Comb_des_post] = df_ip_i.loc[0,col_RA_post_Comb_Srch_GPT]\n", | |
" # df_ip_i.loc[0,col_source_RA_comb] = 'p:'+ df_ip_i.loc[0,col_source_RA_post]\n", | |
" n_post_acpt += 1\n", | |
" else:\n", | |
" df_ip_i.loc[:,'Diff_RAs_Accept'] = 'Large_diff_posts'\n", | |
" # df_ip_nd_vrfy = pd.concat([df_ip_nd_vrfy,df_ip_i], ignore_index=True)\n", | |
" n_diff_large += 1\n", | |
"\n", | |
" df_noaa_add_Eq_Diff = pd.concat([df_noaa_add_Eq_Diff,df_ip_i],ignore_index = True);\n", | |
"df_noaa = df_noaa_add_Eq_Diff.copy();\n", | |
"\n", | |
"print('%d RAs from posts have extremely large differences' % n_extm_large);\n", | |
"print('%d RAs from posts have large differences' % n_diff_large);\n", | |
"print('%d RAs from posts are accepted' % n_post_acpt);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e34d245b-cced-46cc-b759-aa6e3161d5bb", | |
"metadata": {}, | |
"source": [ | |
"### Select the final ra of an incident" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 151, | |
"id": "24569470-d566-46c4-b681-df7b02c76eb4", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|███████████████████████████████████████████████████████████████████████████████| 141/141 [00:00<00:00, 213.18it/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The number of incident whose final release amounts are from posts is 69\n", | |
"The number of incident whose final release amounts need manual verification is 72\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# select the final RA (col: RA_Comb_des_post) of an incident based on the criteria in \"Combination_method_icd_and_posts.xlsx\"\n", | |
"\n", | |
"list_id_NOAA = df_noaa['id'].unique();\n", | |
"# df_noaa_RA = pd.DataFrame();\n", | |
"n_RAs_from_post = 0\n", | |
"\n", | |
"df_ip_acpt = pd.DataFrame() \n", | |
"df_ips_w_ra = pd.DataFrame()\n", | |
"df_ip_nd_vrfy = pd.DataFrame()\n", | |
"\n", | |
"for i in tqdm(range(len(list_id_NOAA))):\n", | |
" id_NOAA_i = list_id_NOAA[i];\n", | |
" L_icd_i = (df_noaa['id'] == id_NOAA_i);\n", | |
" df_ips_i = df_noaa.loc[L_icd_i,:].copy().reset_index(drop = True)\n", | |
" df_ips_i.sort_values(by = 'post date',ascending = False,ignore_index=True,inplace = True)\n", | |
" L_contain_RA_accept = df_ips_i['Diff_RAs_Accept'].str.startswith('Accept:')\n", | |
"\n", | |
" if L_contain_RA_accept.any():\n", | |
" l_1stTrue = L_contain_RA_accept.idxmax()\n", | |
" # df_ips_acptd = df_ips_i.loc[L_contain_RA_accept,:].reset_index(drop = True)\n", | |
" ra_post = df_ips_i.loc[l_1stTrue,col_RA_post_Comb_Srch_GPT]\n", | |
" sr_ra_post = df_ips_i.loc[l_1stTrue,col_source_RA_post] \n", | |
" # ra_post = df_ips_acptd.loc[0,col_RA_post_Comb_Srch_GPT]\n", | |
" # sr_ra_post = df_ips_acptd.loc[0,col_source_RA_post]\n", | |
"\n", | |
" df_ips_i.loc[l_1stTrue,col_RA_Comb_des_post] = ra_post\n", | |
" df_ips_i.loc[l_1stTrue,col_source_RA_comb] = 'p: '+ sr_ra_post\n", | |
" n_RAs_from_post += 1\n", | |
" df_ips_w_ra = pd.concat([df_ips_w_ra,df_ips_i], ignore_index=True)\n", | |
" df_ip_acpt = pd.concat([df_ip_acpt,df_ips_i.loc[[l_1stTrue]]], ignore_index=True)\n", | |
" else:\n", | |
" df_ip_nd_vrfy = pd.concat([df_ip_nd_vrfy,df_ips_i], ignore_index=True)\n", | |
" # df_noaa_RA = pd.concat([df_noaa_RA,df_ips_i]);\n", | |
"# df_noaa = df_noaa_RA.sort_index();\n", | |
"print('The number of incident whose final release amounts are from posts is',n_RAs_from_post);\n", | |
"n_nd_vrf = len(list_id_NOAA)-n_RAs_from_post\n", | |
"print('The number of incident whose final release amounts need manual verification is',\\\n", | |
" n_nd_vrf);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 152, | |
"id": "1e3eca7a-1e80-4701-8228-5b4999689586", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>open_date</th>\n", | |
" <th>name</th>\n", | |
" <th>location</th>\n", | |
" <th>lat</th>\n", | |
" <th>lon</th>\n", | |
" <th>threat</th>\n", | |
" <th>tags</th>\n", | |
" <th>commodity</th>\n", | |
" <th>measure_skim</th>\n", | |
" <th>...</th>\n", | |
" <th>post_w_oa</th>\n", | |
" <th>text_gpt35_post</th>\n", | |
" <th>RA_gpt_post_w_oa</th>\n", | |
" <th>RA_merge_posts_rb_gpt</th>\n", | |
" <th>source_RA_merge_posts</th>\n", | |
" <th>RA_post_Manual</th>\n", | |
" <th>ra_final</th>\n", | |
" <th>source_final_ra</th>\n", | |
" <th>Diff_RAs_Accept</th>\n", | |
" <th>category_icds</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>6209</td>\n", | |
" <td>1970-02-10</td>\n", | |
" <td>Chevron Main Pass Block 41; 11 miles E of the ...</td>\n", | |
" <td>11 miles E of the Mississippi River delta, Lou...</td>\n", | |
" <td>29.3833</td>\n", | |
" <td>-88.9833</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>crude oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>Main Pass Block 41, Platform C, 1972\\nGulf of ...</td>\n", | |
" <td>RA_Release = 65,000 bbl\\nRA_total = Unknown</td>\n", | |
" <td>2730000.0</td>\n", | |
" <td>2730000.0</td>\n", | |
" <td>ra_rb_eq_gpt</td>\n", | |
" <td>NaN</td>\n", | |
" <td>-1</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>Extrm_diff_posts</td>\n", | |
" <td>multi_no_des</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>6209</td>\n", | |
" <td>1970-02-10</td>\n", | |
" <td>Chevron Main Pass Block 41; 11 miles E of the ...</td>\n", | |
" <td>11 miles E of the Mississippi River delta, Lou...</td>\n", | |
" <td>29.3833</td>\n", | |
" <td>-88.9833</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>crude oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>On March 4, oil impact estimated at\\n20 barrel...</td>\n", | |
" <td>RA_Release = 20 barrels\\nRA_total = Unknown</td>\n", | |
" <td>840.0</td>\n", | |
" <td>840.0</td>\n", | |
" <td>ra_vrfd_gpt</td>\n", | |
" <td>840.0</td>\n", | |
" <td>-1</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>Extrm_diff_posts</td>\n", | |
" <td>multi_no_des</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>6209</td>\n", | |
" <td>1970-02-10</td>\n", | |
" <td>Chevron Main Pass Block 41; 11 miles E of the ...</td>\n", | |
" <td>11 miles E of the Mississippi River delta, Lou...</td>\n", | |
" <td>29.3833</td>\n", | |
" <td>-88.9833</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>crude oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>About 2,000 barrels of two chemical dispersant...</td>\n", | |
" <td>RA_Release = 11,000 barrels\\nRA_total = Unknown</td>\n", | |
" <td>462000.0</td>\n", | |
" <td>1176000.0</td>\n", | |
" <td>ra_vrfd_other</td>\n", | |
" <td>1176000.0</td>\n", | |
" <td>-1</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>Extrm_diff_posts</td>\n", | |
" <td>multi_no_des</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>3 rows × 54 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id open_date name \\\n", | |
"0 6209 1970-02-10 Chevron Main Pass Block 41; 11 miles E of the ... \n", | |
"1 6209 1970-02-10 Chevron Main Pass Block 41; 11 miles E of the ... \n", | |
"2 6209 1970-02-10 Chevron Main Pass Block 41; 11 miles E of the ... \n", | |
"\n", | |
" location lat lon threat \\\n", | |
"0 11 miles E of the Mississippi River delta, Lou... 29.3833 -88.9833 Oil \n", | |
"1 11 miles E of the Mississippi River delta, Lou... 29.3833 -88.9833 Oil \n", | |
"2 11 miles E of the Mississippi River delta, Lou... 29.3833 -88.9833 Oil \n", | |
"\n", | |
" tags commodity measure_skim ... \\\n", | |
"0 NaN crude oil NaN ... \n", | |
"1 NaN crude oil NaN ... \n", | |
"2 NaN crude oil NaN ... \n", | |
"\n", | |
" post_w_oa \\\n", | |
"0 Main Pass Block 41, Platform C, 1972\\nGulf of ... \n", | |
"1 On March 4, oil impact estimated at\\n20 barrel... \n", | |
"2 About 2,000 barrels of two chemical dispersant... \n", | |
"\n", | |
" text_gpt35_post RA_gpt_post_w_oa \\\n", | |
"0 RA_Release = 65,000 bbl\\nRA_total = Unknown 2730000.0 \n", | |
"1 RA_Release = 20 barrels\\nRA_total = Unknown 840.0 \n", | |
"2 RA_Release = 11,000 barrels\\nRA_total = Unknown 462000.0 \n", | |
"\n", | |
" RA_merge_posts_rb_gpt source_RA_merge_posts RA_post_Manual ra_final \\\n", | |
"0 2730000.0 ra_rb_eq_gpt NaN -1 \n", | |
"1 840.0 ra_vrfd_gpt 840.0 -1 \n", | |
"2 1176000.0 ra_vrfd_other 1176000.0 -1 \n", | |
"\n", | |
" source_final_ra Diff_RAs_Accept category_icds \n", | |
"0 Unknown Extrm_diff_posts multi_no_des \n", | |
"1 Unknown Extrm_diff_posts multi_no_des \n", | |
"2 Unknown Extrm_diff_posts multi_no_des \n", | |
"\n", | |
"[3 rows x 54 columns]" | |
] | |
}, | |
"execution_count": 152, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_ips_i" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "87064ef1-5e9c-4729-8e29-5f95288c7391", | |
"metadata": {}, | |
"source": [ | |
"### Check which incidents need to be manually verified" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 153, | |
"id": "83e56cf9-4ea0-4cf9-acfc-4f6fefd82f55", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 of incidents are accepted.\n", | |
"72 of incidents have been verified.\n", | |
"0 more incidents need to be manual verified\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 3\n", | |
"# S5,6 and all F sheets are considered\n", | |
"verified_ids = extract_verified_ids(folder_path_manual, filename_manual, col_vrf_RAs_des_post)\n", | |
"# Extract unverified incidents by comparing incidents in df_noaa with verified incidents\n", | |
"df_noaa_unvrfy = identify_unvrfd_icds(verified_ids,df_ip_nd_vrfy,col_RA_Comb_des_post)\n", | |
"n_icd_unvrfd += df_noaa_unvrfy.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 154, | |
"id": "1d467a1d-cc21-4fab-ae1b-19095d0f5a50", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Excel file did't exist and has been created.\n", | |
"DataFrame is written to incidents_posts_1_D_01_accept.xlsx into sheet multi_no_des.\n", | |
"Excel file did't exist and has been created.\n", | |
"DataFrame is written to incidents_posts_1_B_01_NeedVerify.xlsx into sheet multi_no_des.\n", | |
"Incidents are all verified.\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"if if_write_acpt:\n", | |
" write_df_to_excel(df_to_write=df_ip_acpt,\n", | |
" file_path=filename_ip_acpt,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"if if_write_nd_vrfy:\n", | |
" write_df_to_excel(df_to_write = df_ip_nd_vrfy,\n", | |
" file_path= filename_ip_nd_vrfy,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"\n", | |
"if if_write_unvrfd and df_noaa_unvrfy.shape[0] > 0:\n", | |
" print('Unverified incidents exist.')\n", | |
" write_df_to_excel(df_to_write=df_noaa_unvrfy,\n", | |
" file_path=filename_ip_unvrfy,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"else:\n", | |
" print('Incidents are all verified.')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "35942a0f-e0f8-4d2e-8f51-4d9a97d18585", | |
"metadata": {}, | |
"source": [ | |
"So far, we've identified RAs for some incident-posts while others need manual verification\n", | |
"\n", | |
"We've also compared the incidents need verification with verified incidents and recognized the unverified incidents \n", | |
"\n", | |
"The following sections shouldn't be ran until no more incidents are unverified" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e14d7caa-17dc-4c9a-815e-fe724f050232", | |
"metadata": {}, | |
"source": [ | |
"## Case 3: Single candidate, from description or post" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "0f7d5dd7-a04c-4566-ba89-06f1b46cdd57", | |
"metadata": {}, | |
"source": [ | |
"This section needs to be ran twice for 'one_zero' and 'zero_one' respectively" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"id": "f79ae417-8fae-4eb2-b0a5-bc20a3ebbb36", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# icd_category_name = 'one_zero'\n", | |
"icd_category_name = 'one'\n", | |
"\n", | |
"os.chdir(folder_path_ip)\n", | |
"df_noaa = pd.read_excel(\n", | |
" filename_ip_grpby,\n", | |
" sheet_name = icd_category_name,\n", | |
" parse_dates=['open_date', 'post date'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3b37039c-3005-4291-9901-b0f7f4fb6a8a", | |
"metadata": {}, | |
"source": [ | |
"### accept the only candidate RA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"id": "f5204072-738c-4f78-95c0-44d992e5e1da", | |
"metadata": { | |
"scrolled": true, | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|████████████████████████████████████████████████████████████████████████████| 1004/1004 [00:00<00:00, 2344.48it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"df_noaa[col_RA_post_Comb_Srch_GPT].fillna(-1,inplace=True)\n", | |
"df_noaa[col_RA_Comb_des_post] = -1\n", | |
"df_noaa[col_source_RA_comb] = 'Unknown'\n", | |
"df_noaa['Diff_RAs_Accept'] = False\n", | |
"df_noaa[col_category_icds] = icd_category_name\n", | |
"n_cfl_ind = 0; n_icd_no_rls = 0; n_icd_acpt = 0;\n", | |
"\n", | |
"df_ip_acpt = pd.DataFrame(columns = df_noaa.columns)\n", | |
"df_ip_nd_vrfy = pd.DataFrame(columns = df_noaa.columns)\n", | |
"\n", | |
"for i in tqdm(range(df_noaa.shape[0])):\n", | |
" ra_des = df_noaa.loc[i,col_RA_icd_Comb_Srch_GPT]\n", | |
" ra_post = df_noaa.loc[i,col_RA_post_Comb_Srch_GPT]\n", | |
" if ra_des > -1 and ra_post <= -1:\n", | |
" df_noaa.loc[i,col_RA_Comb_des_post] = ra_des\n", | |
" df_noaa.loc[i,'Diff_RAs_Accept'] = 'Accept: single description candidate'\n", | |
" df_noaa.loc[i,col_source_RA_comb] = 'd: ' + df_noaa.loc[i,col_source_RA_des]\n", | |
" elif ra_des <= -1 and ra_post > -1:\n", | |
" df_noaa.loc[i,col_RA_Comb_des_post] = ra_post\n", | |
" df_noaa.loc[i,'Diff_RAs_Accept'] = 'Accept: single post candidate'\n", | |
" df_noaa.loc[i,col_source_RA_comb] = 'p: ' + df_noaa.loc[i,col_source_RA_post]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "32cfa75f-6ca3-49a5-bc4a-9db9d397e13b", | |
"metadata": {}, | |
"source": [ | |
"### Check which incidents need to be manually verified" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"id": "82cdf9b2-aae9-4186-bfe0-10de0c3c47fb", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 of incidents are accepted.\n", | |
"0 of incidents have been verified.\n", | |
"0 more incidents need to be manual verified\n" | |
] | |
} | |
], | |
"source": [ | |
"# Version 3\n", | |
"# S5,6 and all F sheets are considered\n", | |
"verified_ids = extract_verified_ids(folder_path_manual, filename_manual, col_vrf_RAs_des_post)\n", | |
"# Extract unverified incidents by comparing incidents in df_noaa with verified incidents\n", | |
"df_noaa_unvrfy = identify_unvrfd_icds(verified_ids,df_ip_nd_vrfy,col_RA_Comb_des_post)\n", | |
"n_icd_unvrfd += df_noaa_unvrfy.shape[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"id": "db977a38-1575-4f70-9c72-6bc90dab573c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Excel file exists.\n", | |
"DataFrame is written to incidents_posts_1_D_01_accept.xlsx into sheet one.\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"if if_write_acpt:\n", | |
" write_df_to_excel(df_to_write=df_noaa, # df_ip_acpt,\n", | |
" file_path=filename_ip_acpt,\n", | |
" sheet_name_to_add=icd_category_name)\n", | |
"# if if_write_nd_vrfy:\n", | |
"# write_df_to_excel(df_to_write=df_ip_nd_vrfy,\n", | |
"# file_path=filename_ip_nd_vrfy,\n", | |
"# sheet_name_to_add=icd_category_name)\n", | |
"\n", | |
"# if if_write_unvrfd and df_noaa_unvrfy.shape[0] > 0:\n", | |
"# print('Unverified incidents exist.')\n", | |
"# write_df_to_excel(df_to_write=df_noaa_unvrfy,\n", | |
"# file_path=filename_ip_unvrfy,\n", | |
"# sheet_name_to_add=icd_category_name)\n", | |
"# else:\n", | |
"# print('Incidents are all verified.')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6395d3ad-e01d-47ae-a645-004a2de5b50a", | |
"metadata": {}, | |
"source": [ | |
"# Add verified RA to incidents that need verification" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"id": "d3700d94-ad62-483a-b7c7-202f0bf400d7", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# S5,6 and all F sheets are considered\n", | |
"verified_ids = extract_verified_ids(folder_path_manual,filename_manual,col_vrf_RAs_des_post)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"id": "ae1a7305-802b-45f1-abe9-00455215aa69", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Sheets being processed: ['S5', 'S6', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18']\n" | |
] | |
} | |
], | |
"source": [ | |
"# merge all manually verified dfs together\n", | |
"# Define the Excel file path\n", | |
"excel_file_path = RootFile_Name + '2Raw_Incident_Data_NOAA\\\\Oil_Extraction\\\\Sample_Verification\\\\'\n", | |
"\n", | |
"# Define the sheet names to concatenate\n", | |
"sheet_names_to_concatenate = sample_sheet_names = select_verified_excel_sheets(excel_file_path, 'NOAA_sampled_4.xlsx', sheet_names=None)\n", | |
"\n", | |
"# Call the function and get the concatenated DataFrame\n", | |
"df_ip_mvrfd = combine_excel_sheets(excel_file_path+'NOAA_sampled_4.xlsx', sheet_names_to_concatenate)\n", | |
"# df_ip_mvrfd = concatenate_sheets(excel_file_path, sheet_names_to_concatenate)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"id": "6a2b48ed-bf6a-4468-a47c-538d42eeca60", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# drop redundant rows without final RA (for group 1+n and 0+n)\n", | |
"df_ip_mvrfd.dropna(subset = [col_vrf_RAs_des_post],inplace = True)\n", | |
"df_ip_mvrfd.reset_index(drop = True,inplace = True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"id": "01d712ae-03fb-473d-be98-5edae74611fc", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"incidents_posts_1_D_01_vrfd.xlsx\n", | |
"182 incidents from group multi_des has been adding verified RAs\n", | |
"86 incidents from group multi_no_des has been adding verified RAs\n" | |
] | |
} | |
], | |
"source": [ | |
"# merge [verified RAs] with [need verified incidents] by group\n", | |
"filename_parts = [tt_ip,version_ip, \"D\",\"01\", 'vrfd',\"xlsx\"]\n", | |
"filename_ip_vrdf = generate_filename(filename_parts)\n", | |
"print(filename_ip_vrdf)\n", | |
"\n", | |
"# List of sheet names to be updated\n", | |
"sheet_names = ['multi_des', 'multi_no_des'] #'one_zero', 'zero_one',\n", | |
"\n", | |
"# Create the ExcelWriter for the output file outside the loop\n", | |
"with pd.ExcelWriter(filename_ip_vrdf, engine='openpyxl') as writer:\n", | |
" for sheet_name in sheet_names:\n", | |
"\n", | |
" # Read the current sheet from the input file\n", | |
" df_sheet = pd.read_excel(filename_ip_nd_vrfy, sheet_name=sheet_name)\n", | |
"\n", | |
" # Determine which column to use for merging ('NPost_id' or 'id')\n", | |
" df_sheet['merge_key'] = df_sheet['NPost id'].fillna(df_sheet['id'])\n", | |
" df_ip_mvrfd['merge_key'] = df_ip_mvrfd['NPost id'].fillna(df_ip_mvrfd['id'])\n", | |
"\n", | |
" # refine manually verified df\n", | |
" df_rfd = df_ip_mvrfd[['merge_key', 'RA_Comb_des_post_Manual', 'Source_RA_Comb_des_post_Manual']]\n", | |
"\n", | |
" # Perform the merge with concatenated_df\n", | |
" merged_df = df_sheet.merge(df_rfd, on='merge_key', how='inner')\n", | |
"\n", | |
" # Update the 'ra_final' and 'source_final_ra' columns\n", | |
" merged_df[col_RA_Comb_des_post] = merged_df['RA_Comb_des_post_Manual']\n", | |
" merged_df[col_source_RA_comb] = merged_df['Source_RA_Comb_des_post_Manual']\n", | |
"\n", | |
" # Drop the temporary 'merge_key' column\n", | |
" merged_df.drop(columns=['merge_key'], inplace=True)\n", | |
"\n", | |
" # Write the updated DataFrame to the new output file, each DataFrame as a separate sheet\n", | |
" merged_df.to_excel(writer, sheet_name=sheet_name, index=False)\n", | |
"\n", | |
" # print\n", | |
" print(merged_df.shape[0],'incidents from group',sheet_name,'has been adding verified RAs')\n", | |
"# Note: Ensure that 'concatenated_df' is already defined and available in your environment before running this code." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6d367e8f-8151-4a7b-aee8-b6d963f985c7", | |
"metadata": {}, | |
"source": [ | |
"# Merge accepted RA with verified RA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"id": "51a37b1b-bed0-4100-9110-4635461c2077", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"incidents_posts_1_D_01_vrfd.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"df_ip_acpt = pd.read_excel(filename_ip_acpt)\n", | |
"df_ip_vrfd = pd.read_excel(filename_ip_vrdf) \n", | |
"print(filename_ip_vrdf)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"id": "91b14e71-bae8-4f09-b443-344e0262fb4b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Columns exclusive to df1: []\n", | |
"Columns exclusive to df2: ['RA_Comb_des_post_Manual', 'Source_RA_Comb_des_post_Manual']\n" | |
] | |
} | |
], | |
"source": [ | |
"exclusive_df1, exclusive_df2 = find_exclusive_columns(df_ip_acpt, df_ip_vrfd)\n", | |
"print(\"Columns exclusive to df1:\", exclusive_df1)\n", | |
"print(\"Columns exclusive to df2:\", exclusive_df2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"id": "1c5e81a3-367a-430f-baad-dd22d212c5da", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Merge all groups of [accepted incidents] into one (\"inner\")\n", | |
"os.chdir(folder_path_ip)\n", | |
"sheet_names_acpt = ['multi_des', 'multi_no_des', 'one']\n", | |
"sheet_names_vrfy = ['multi_des', 'multi_no_des']\n", | |
"df_ip_acpt_merge = combine_excel_sheets(filename_ip_acpt, sheet_names_acpt)\n", | |
"df_ip_vrfd_merge = combine_excel_sheets(filename_ip_vrdf, sheet_names_vrfy)\n", | |
"df_ip_merge = pd.concat([df_ip_acpt_merge,df_ip_vrfd_merge], axis=0, join='inner',ignore_index = True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"id": "4ffe03b8-5451-4fdc-9e77-f729178b6685", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(1813, 54)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>open_date</th>\n", | |
" <th>name</th>\n", | |
" <th>location</th>\n", | |
" <th>lat</th>\n", | |
" <th>lon</th>\n", | |
" <th>threat</th>\n", | |
" <th>tags</th>\n", | |
" <th>commodity</th>\n", | |
" <th>measure_skim</th>\n", | |
" <th>...</th>\n", | |
" <th>post_w_oa</th>\n", | |
" <th>text_gpt35_post</th>\n", | |
" <th>RA_gpt_post_w_oa</th>\n", | |
" <th>RA_merge_posts_rb_gpt</th>\n", | |
" <th>source_RA_merge_posts</th>\n", | |
" <th>RA_post_Manual</th>\n", | |
" <th>Diff_RAs_Accept</th>\n", | |
" <th>ra_final</th>\n", | |
" <th>source_final_ra</th>\n", | |
" <th>category_icds</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10012</td>\n", | |
" <td>2019-11-16</td>\n", | |
" <td>Crowley Barge 650-2 Crude Oil Spill; Anacortes...</td>\n", | |
" <td>Anacortes, WA</td>\n", | |
" <td>48.504536</td>\n", | |
" <td>-122.574863</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>Twenty gallons of oil leaked onto a barge, whi...</td>\n", | |
" <td>RA_Release = 20 gallons\\n\\nRA_total = 20 gallo...</td>\n", | |
" <td>20.0</td>\n", | |
" <td>20.0</td>\n", | |
" <td>ra_rb_eq_gpt</td>\n", | |
" <td>5.0</td>\n", | |
" <td>Accept:Diff_icd_post</td>\n", | |
" <td>20.0</td>\n", | |
" <td>i&p: ra_vrfd_rb&ra_rb_eq_gpt</td>\n", | |
" <td>multi_des</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>9810</td>\n", | |
" <td>2018-10-15</td>\n", | |
" <td>M/V Candy Store; Cameron, LA, USA</td>\n", | |
" <td>Cameron, LA, USA</td>\n", | |
" <td>29.795911</td>\n", | |
" <td>-93.330860</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>It is estimated that 1,000 gallons of diesel f...</td>\n", | |
" <td>RA_Release = 1,000 gallons\\nRA_total = 3,000 g...</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>ra_vrfd_gpt</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>Accept:Diff_icd_post</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>p: ra_vrfd_gpt</td>\n", | |
" <td>multi_des</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2 rows × 54 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id open_date name \\\n", | |
"0 10012 2019-11-16 Crowley Barge 650-2 Crude Oil Spill; Anacortes... \n", | |
"1 9810 2018-10-15 M/V Candy Store; Cameron, LA, USA \n", | |
"\n", | |
" location lat lon threat tags commodity \\\n", | |
"0 Anacortes, WA 48.504536 -122.574863 Oil NaN Unknown \n", | |
"1 Cameron, LA, USA 29.795911 -93.330860 Oil NaN Unknown \n", | |
"\n", | |
" measure_skim ... post_w_oa \\\n", | |
"0 NaN ... Twenty gallons of oil leaked onto a barge, whi... \n", | |
"1 NaN ... It is estimated that 1,000 gallons of diesel f... \n", | |
"\n", | |
" text_gpt35_post RA_gpt_post_w_oa \\\n", | |
"0 RA_Release = 20 gallons\\n\\nRA_total = 20 gallo... 20.0 \n", | |
"1 RA_Release = 1,000 gallons\\nRA_total = 3,000 g... 1000.0 \n", | |
"\n", | |
" RA_merge_posts_rb_gpt source_RA_merge_posts RA_post_Manual \\\n", | |
"0 20.0 ra_rb_eq_gpt 5.0 \n", | |
"1 1000.0 ra_vrfd_gpt 1000.0 \n", | |
"\n", | |
" Diff_RAs_Accept ra_final source_final_ra category_icds \n", | |
"0 Accept:Diff_icd_post 20.0 i&p: ra_vrfd_rb&ra_rb_eq_gpt multi_des \n", | |
"1 Accept:Diff_icd_post 1000.0 p: ra_vrfd_gpt multi_des \n", | |
"\n", | |
"[2 rows x 54 columns]" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"print(df_ip_merge.shape)\n", | |
"df_ip_merge.head(2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a5f598f2-8a1f-4538-86ef-36aa91b2ad7f", | |
"metadata": {}, | |
"source": [ | |
"## Write results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"id": "e43c2fcf-b82f-4b98-95b3-7cb7ebf444a4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"incidents_posts_1_E_01_final_ra_wt_00.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"filename_parts = [tt_ip,version_ip, \"E\",\"01\", 'final_ra_wt_00',\"xlsx\"]\n", | |
"filename_ip_finalra = generate_filename(filename_parts)\n", | |
"print(filename_ip_finalra)\n", | |
"df_ip_merge.to_excel(filename_ip_finalra,engine='openpyxl',index = False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3a9ffcda-18c9-4aa2-9146-8fc76be14f2d", | |
"metadata": {}, | |
"source": [ | |
"# Merge zero_zero with RA available incidents " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"id": "4c8f48c5-4916-4a35-911b-dc8dc86423c4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"filename_parts = [tt_ip,version_ip, \"E\",\"01\", 'final_ra_wt_00',\"xlsx\"]\n", | |
"filename_ip_finalra = generate_filename(filename_parts)\n", | |
"df_ip_merge = pd.read_excel(filename_ip_finalra)\n", | |
"\n", | |
"df_zero_zero = pd.read_excel(filename_ip_grpby,sheet_name = 'zero')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"id": "7a9f4287-6597-4b95-a900-cc4ac365df77", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(1737, 50)\n", | |
"(1813, 54)\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df_zero_zero.shape)\n", | |
"print(df_ip_merge.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 108, | |
"id": "e44c71b9-7ba7-4b7f-8ad4-d95a3a71fb10", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Run it if you want to df_zero_zero get combined as well\n", | |
"df_zero_zero[col_category_icds] = 'zero'\n", | |
"df_zero_zero[col_RA_Comb_des_post] = -1\n", | |
"df_zero_zero[col_source_RA_comb] = 'unknown'\n", | |
"\n", | |
"# Drop columns in df_zero_zero that have all NaN or None values\n", | |
"df_zero_zero.dropna(axis=1, how='all', inplace=True)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 109, | |
"id": "b31aff2e-e9c0-4ee4-acc2-3a1c233e75a3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(3550, 54)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>open_date</th>\n", | |
" <th>name</th>\n", | |
" <th>location</th>\n", | |
" <th>lat</th>\n", | |
" <th>lon</th>\n", | |
" <th>threat</th>\n", | |
" <th>tags</th>\n", | |
" <th>commodity</th>\n", | |
" <th>measure_skim</th>\n", | |
" <th>...</th>\n", | |
" <th>post_w_oa</th>\n", | |
" <th>text_gpt35_post</th>\n", | |
" <th>RA_gpt_post_w_oa</th>\n", | |
" <th>RA_merge_posts_rb_gpt</th>\n", | |
" <th>source_RA_merge_posts</th>\n", | |
" <th>RA_post_Manual</th>\n", | |
" <th>Diff_RAs_Accept</th>\n", | |
" <th>ra_final</th>\n", | |
" <th>source_final_ra</th>\n", | |
" <th>category_icds</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10012</td>\n", | |
" <td>2019-11-16</td>\n", | |
" <td>Crowley Barge 650-2 Crude Oil Spill; Anacortes...</td>\n", | |
" <td>Anacortes, WA</td>\n", | |
" <td>48.504536</td>\n", | |
" <td>-122.574863</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>Twenty gallons of oil leaked onto a barge, whi...</td>\n", | |
" <td>RA_Release = 20 gallons\\n\\nRA_total = 20 gallo...</td>\n", | |
" <td>20.0</td>\n", | |
" <td>20.0</td>\n", | |
" <td>ra_rb_eq_gpt</td>\n", | |
" <td>5.0</td>\n", | |
" <td>Accept:Diff_icd_post</td>\n", | |
" <td>20.0</td>\n", | |
" <td>i&p: ra_vrfd_rb&ra_rb_eq_gpt</td>\n", | |
" <td>multi_des</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>9810</td>\n", | |
" <td>2018-10-15</td>\n", | |
" <td>M/V Candy Store; Cameron, LA, USA</td>\n", | |
" <td>Cameron, LA, USA</td>\n", | |
" <td>29.795911</td>\n", | |
" <td>-93.330860</td>\n", | |
" <td>Oil</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Unknown</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>It is estimated that 1,000 gallons of diesel f...</td>\n", | |
" <td>RA_Release = 1,000 gallons\\nRA_total = 3,000 g...</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>ra_vrfd_gpt</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>Accept:Diff_icd_post</td>\n", | |
" <td>1000.0</td>\n", | |
" <td>p: ra_vrfd_gpt</td>\n", | |
" <td>multi_des</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2 rows × 54 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id open_date name \\\n", | |
"0 10012 2019-11-16 Crowley Barge 650-2 Crude Oil Spill; Anacortes... \n", | |
"1 9810 2018-10-15 M/V Candy Store; Cameron, LA, USA \n", | |
"\n", | |
" location lat lon threat tags commodity \\\n", | |
"0 Anacortes, WA 48.504536 -122.574863 Oil NaN Unknown \n", | |
"1 Cameron, LA, USA 29.795911 -93.330860 Oil NaN Unknown \n", | |
"\n", | |
" measure_skim ... post_w_oa \\\n", | |
"0 NaN ... Twenty gallons of oil leaked onto a barge, whi... \n", | |
"1 NaN ... It is estimated that 1,000 gallons of diesel f... \n", | |
"\n", | |
" text_gpt35_post RA_gpt_post_w_oa \\\n", | |
"0 RA_Release = 20 gallons\\n\\nRA_total = 20 gallo... 20.0 \n", | |
"1 RA_Release = 1,000 gallons\\nRA_total = 3,000 g... 1000.0 \n", | |
"\n", | |
" RA_merge_posts_rb_gpt source_RA_merge_posts RA_post_Manual \\\n", | |
"0 20.0 ra_rb_eq_gpt 5.0 \n", | |
"1 1000.0 ra_vrfd_gpt 1000.0 \n", | |
"\n", | |
" Diff_RAs_Accept ra_final source_final_ra category_icds \n", | |
"0 Accept:Diff_icd_post 20.0 i&p: ra_vrfd_rb&ra_rb_eq_gpt multi_des \n", | |
"1 Accept:Diff_icd_post 1000.0 p: ra_vrfd_gpt multi_des \n", | |
"\n", | |
"[2 rows x 54 columns]" | |
] | |
}, | |
"execution_count": 109, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Align df_zero_zero to have the same columns as df_ip_merge\n", | |
"# df_zero_zero = df_zero_zero[df_ip_merge.columns]\n", | |
"\n", | |
"# Merge df_zero_zero with df_ip_merge along the vertical axis\n", | |
"df_ip_merge_w_00 = pd.concat([df_ip_merge, df_zero_zero], axis=0, ignore_index=True)\n", | |
"print(df_ip_merge_w_00.shape)\n", | |
"df_ip_merge_w_00.head(2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"id": "61b90d40-c4f7-485f-a4a6-9bd226fa832d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"incidents_posts_1_E_02_final_ra_w_00.xlsx\n" | |
] | |
} | |
], | |
"source": [ | |
"os.chdir(folder_path_ip)\n", | |
"filename_parts = [tt_ip,version_ip, \"E\",\"01\", 'final_ra_w_00',\"xlsx\"]\n", | |
"filename_ip_finalra = generate_filename(filename_parts)\n", | |
"print(filename_ip_finalra)\n", | |
"df_ip_merge_w_00.to_excel(filename_ip_finalra,engine='openpyxl',index = False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": { | |
"height": "calc(100% - 180px)", | |
"left": "10px", | |
"top": "150px", | |
"width": "272.6px" | |
}, | |
"toc_section_display": true, | |
"toc_window_display": true | |
}, | |
"toc-autonumbering": true, | |
"toc-showcode": false, | |
"toc-showmarkdowntxt": false, | |
"toc-showtags": false | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |