1_RA_extraction.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0b86191a-960b-48cb-bef0-6ac54eb72e42",
   "metadata": {},
   "source": [
    "# Initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "934e7ce5-0de1-486f-89c5-70e27212b339",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1b9f02b-8071-4f3b-955a-9360d200155b",
   "metadata": {},
   "source": [
    "Replace the folder path with the absolute address of the folder on your local disk. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a39b698c-9212-41dc-ae4f-e47e6be5f6a7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "RootFile_Name = 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\\\'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a180235f-948a-4880-a695-6627f9e2bcc7",
   "metadata": {},
   "source": [
    "Change folder, run custom functions, define folder paths and filenames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4efab6c8-fecb-4aa1-a1f8-91020cd3147a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-07-06 19:56:41 INFO: Loading these models for language: en (English):\n",
      "========================\n",
      "| Processor | Package  |\n",
      "------------------------\n",
      "| tokenize  | combined |\n",
      "| pos       | combined |\n",
      "| lemma     | combined |\n",
      "========================\n",
      "\n",
      "2024-07-06 19:56:41 INFO: Use device: cpu\n",
      "2024-07-06 19:56:41 INFO: Loading: tokenize\n",
      "2024-07-06 19:56:41 INFO: Loading: pos\n",
      "2024-07-06 19:56:41 INFO: Loading: lemma\n",
      "2024-07-06 19:56:41 INFO: Done loading processors!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Directory 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\description\\' already exists.\n",
      "Directory 'F:/OneDrive_Files/OneDrive - purdue.edu/Documents/1_Research_Code/Release_Info_Extraction/Scientific_data/simplified\\posts\\' already exists.\n"
     ]
    }
   ],
   "source": [
    "os.chdir(RootFile_Name)\n",
    "\n",
    "# Read custom functions used in the RA candidate extraction\n",
    "%run func_set_general.ipynb\n",
    "%run func_set_ra_eval.ipynb\n",
    "\n",
    "# Better to separately create folders for both descriptions and posts.\n",
    "# use the created folder path to replace the following one\n",
    "folder_path_description = RootFile_Name +'description\\\\'\n",
    "folder_path_posts = RootFile_Name +'posts\\\\'\n",
    "check_and_create_directory(folder_path_description)\n",
    "check_and_create_directory(folder_path_posts)\n",
    "folder_path_manual = RootFile_Name+'manual\\\\'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b1711f9-4941-4b90-a1bd-9634bfad968a",
   "metadata": {},
   "source": [
    "# Run the following code twice for description and posts separately\n",
    "the first time is for processing description (by setting type_ra_ext = 'description')\n",
    "\n",
    "the second time is for posts (by setting type_ra_ext = 'post')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "dc0fc3cc-3b0e-4439-8fcb-d70c43c2070f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Select if you want to process descriptions or posts. Note that you must process descriptions before posts\n",
    "type_ra_ext = 'description'\n",
    "# type_ra_ext = 'post'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "62f516c5-7099-4aa5-8cad-4c42753f2830",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'filename_prefix': 'incident',\n",
       " 'col_text': 'description',\n",
       " 'col_date': 'open_date',\n",
       " 'col_ra_gpt_from_sample': 'RA_icd_GPT_OA_des',\n",
       " 'col_ra_gpt': 'RA_icd_gpt',\n",
       " 'col_ra_merge': 'RA_merge_icd_rb_gpt',\n",
       " 'col_source_ra_merge': 'source_RA_merge_icd',\n",
       " 'sheetnames_mvrfd_ext': ['S9', 'S10', 'S11', 'S12', 'S13', 'S14'],\n",
       " 'col_refer': 'id',\n",
       " 'col_ra_gpt_oa_text': 'RA_gpt_des_w_oa',\n",
       " 'col_threat': 'threat',\n",
       " 'col_sub': 'commodity',\n",
       " 'col_text_wt_traj': 'des_wt_traj',\n",
       " 'if_text_has_hypothesis': 'trj_prd',\n",
       " 'col_oa_text_from_sample': 'OA_des',\n",
       " 'col_oa_text': 'des_w_oa',\n",
       " 'col_ra_rb': 'RA_rb_des_wt_traj',\n",
       " 'col_oatt_rb': 'CA_rb_des_wt_traj',\n",
       " 'col_if_tl_plt': 'Times_larger_plt',\n",
       " 'col_if_rls': 'Rls_doc',\n",
       " 'col_if_no_rls': 'No_release',\n",
       " 'if_do_no_rls_check': True,\n",
       " 'col_oatt_gpt_ext': 'OAtt_GPT_Ext',\n",
       " 'col_oarv_gpt_ext': 'OArv_GPT_Ext',\n",
       " 'col_if_oa_irlvt_gpt': 'OA_irlvt_GPT',\n",
       " 'col_if_oa_unit_lost': 'OA_unit_lost',\n",
       " 'col_id_of_sample': 'id',\n",
       " 'col_text_gpt35': 'text_gpt35_description',\n",
       " 'col_ra_gpt_ext': 'RA_gpt_des_w_oa',\n",
       " 'col_vrf_ra': 'RA_des_Manual',\n",
       " 'col_id_of_vrfd_ra': 'id',\n",
       " 'col_ra_source_of_vrfd_ra': 'Source'}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# loading variables\n",
    "if type_ra_ext == 'description':\n",
    "    folder_path = folder_path_description\n",
    "else:\n",
    "    folder_path = folder_path_posts\n",
    "\n",
    "vars = define_variables(type_ra_ext)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "779eeea2-9d86-4bc5-9ad0-13c2fa360b23",
   "metadata": {},
   "source": [
    "# Identify oil spill-related incidents/posts"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24b254ba-0318-41e0-b6c1-f85dbfcd5061",
   "metadata": {},
   "source": [
    "## Description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "95258389-1847-48e8-9d6e-728250c3b900",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The size of original dataset is 4473\n",
      "The columns of original dataset include Index(['id', 'open_date', 'name', 'location', 'lat', 'lon', 'threat', 'tags',\n",
      "       'commodity', 'measure_skim', 'measure_shore', 'measure_bio',\n",
      "       'measure_disperse', 'measure_burn', 'max_ptl_release_gallons', 'posts',\n",
      "       'description'],\n",
      "      dtype='object')\n",
      "The number of incidents in refined dataset is 3550\n",
      "The refined incident were written into 1_incident_level_oil_spill_related_1.xlsx\n"
     ]
    }
   ],
   "source": [
    "# read original dataset\n",
    "os.chdir(folder_path_description)\n",
    "df_noaa = pd.read_csv('incidents_20231207.csv',parse_dates = ['open_date'], encoding='latin1')\n",
    "# replace 'incidents_20231207.csv' with new raw incident data downloaded from NOAA ORR (https://incidentnews.noaa.gov/raw/index) if you want to update the dataset\n",
    "\n",
    "print('The size of original dataset is',df_noaa.shape[0])\n",
    "print('The columns of original dataset include',df_noaa.columns)\n",
    "\n",
    "# identify oil spill relevance by column \"threat\" and \"commodity\"\n",
    "\n",
    "# fillna nan values in the columns to be filtered in advance\n",
    "df_noaa.loc[:,[vars['col_threat'],vars['col_sub']]] = df_noaa.loc[:,[vars['col_threat'],vars['col_sub']]].fillna('Unknown')\n",
    "\n",
    "# filter out the incidents related to keywords-topic\n",
    "# Search in \"threat\" column\n",
    "L_threat = keywords_searching(df_noaa[vars['col_threat']], kws_oil, case_sentivity = False)\n",
    "\n",
    "# Search in \"commodity\" column\n",
    "L_risk_sub = keywords_searching(df_noaa[vars['col_sub']], kws_oil, case_sentivity = False)\n",
    "L_Condi = L_threat | L_risk_sub\n",
    "\n",
    "df_noaa = df_noaa[L_Condi].reset_index(drop = True);\n",
    "print(\"The number of incidents in refined dataset is\", df_noaa.shape[0]);\n",
    "\n",
    "if type_ra_ext == 'description':\n",
    "    # write in\n",
    "    os.chdir(folder_path)\n",
    "    filename_oil_related = '1_' + vars['filename_prefix'] + '_level_oil_spill_related_1.xlsx'\n",
    "    df_noaa.to_excel(filename_oil_related, index=False)\n",
    "    print('The refined ' + vars['filename_prefix'] + ' were written into',filename_oil_related)\n",
    "else:\n",
    "    df_incidents = df_noaa.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d93e47d-cdeb-4d09-8496-73ef51473391",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Posts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "54316e2f-fe60-40ea-962f-440a52194f2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "if type_ra_ext == 'post':\n",
    "\n",
    "    # read the original post\n",
    "    os.chdir(folder_path_posts)\n",
    "    df_noaa = pd.read_excel('posts_raw_all.xlsx',parse_dates = ['post date'])\n",
    "    # replace 'posts_raw_all.xlsx' with new raw post data if you want to update the dataset\n",
    "    print('The size of original dataset is',df_noaa.shape[0])\n",
    "    print('The columns of original dataset include',df_noaa.columns)\n",
    "\n",
    "    # Oil spill incidents need to be finished before conducting this filter\n",
    "    IF_post_is_oil = pd.Series([False] * df_noaa.shape[0])\n",
    "    for k in tqdm(range(df_noaa.shape[0])):\n",
    "        id_noaa_i = df_noaa.loc[k,'noaa id']\n",
    "        if id_noaa_i in df_incidents['id'].values:\n",
    "            IF_post_is_oil[k] = True;\n",
    "    df_noaa = df_noaa.loc[IF_post_is_oil,:]\n",
    "    print(\"The number of posts in refined dataset is\", df_noaa.shape[0])\n",
    "    \n",
    "    # write in\n",
    "    os.chdir(folder_path_posts)\n",
    "    # Assuming df is your DataFrame and 'url_column' is the name of the column with long URLs\n",
    "    # df_noaa[vars['col_text']] = \"'\" + df_noaa[vars['col_text']].astype(str)\n",
    "\n",
    "    filename_oil_related = '1_' + vars['filename_prefix'] + '_level_oil_spill_related_1.xlsx'\n",
    "    # Now write to Excel\n",
    "    df_noaa.to_excel(filename_oil_related, engine='openpyxl', index=False)\n",
    "    \n",
    "    # df_noaa.to_excel(filename_oil_related, index=False)\n",
    "    print('The refined ' + vars['filename_prefix'] + ' were written into:',filename_oil_related)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9087429e-5b2f-4633-a0a4-5d2aeddeff2d",
   "metadata": {},
   "source": [
    "# Rule-based RA extraction"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af01a8f2-c8c2-4c25-8277-3dc1d048d270",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Pretreatment: remove hypothetical sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3f5b4ec3-5d82-49a4-8778-837fdbafaa5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start rule-based RA candidates for posts\n",
      "Size of df: 21934 9\n",
      "Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts']\n",
      "Memory Usage (Total in MB): 34.96\n"
     ]
    }
   ],
   "source": [
    "# read the post data\n",
    "print('Start rule-based RA candidates for ' + vars['filename_prefix'])\n",
    "os.chdir(folder_path)\n",
    "df_noaa = pd.read_excel(filename_oil_related,parse_dates = [vars['col_date']])\n",
    "\n",
    "# Generate the summary\n",
    "df_summary = summarize_large_dataframe(df_noaa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8759762c-302c-44a8-9247-477b271686d3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The column 'post content' exists in the DataFrame.\n",
      "The column 'trj_prd' does not exist in the DataFrame. Creating and filling it with the specified default value.\n",
      "The column 'post_wt_traj' does not exist in the DataFrame. Creating and filling it with the specified default value.\n"
     ]
    }
   ],
   "source": [
    "df_noaa = ensure_column_exists(df_noaa, vars['col_text'], False)\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['if_text_has_hypothesis'], False)\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['col_text_wt_traj'], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "35c8b19f-eb60-426d-b9c3-52792e8b1939",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Exluding the hypothetical sentences...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████| 21934/21934 [2:01:59<00:00,  3.00it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# exclude hypothetical sentences\n",
    "print('Exluding the hypothetical sentences...')\n",
    "for k in tqdm(range(df_noaa.shape[0])):\n",
    "    if pd.isna(df_noaa.loc[k,vars['col_text_wt_traj']]):\n",
    "        str_real_sents = 'Unknown'\n",
    "        des = adjust_format(df_noaa.loc[k, vars['col_text']])\n",
    "        doc = nlp(des)\n",
    "        for sent in doc.sentences:\n",
    "            df_sent = sent2df(sent)\n",
    "            # skip the sentence if it is regarded as traj prediction\n",
    "            L_predict_sent = trajectory_prediction(df_sent)\n",
    "            if L_predict_sent:\n",
    "                df_noaa.loc[k, vars['if_text_has_hypothesis']] = True\n",
    "                continue\n",
    "            # Take the sentences that don't contain Traj prediction\n",
    "            str_real_sent_i = sent.text\n",
    "            str_real_sents = str_real_sent_i if str_real_sents == 'Unknown' else str_real_sents + ' ' + str_real_sent_i;\n",
    "        df_noaa.loc[k, vars['col_text_wt_traj']] = str_real_sents\n",
    "print(\"Done.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "99ccf4d4-7b16-4564-bce4-93dc230f14aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of df: 21934 11\n",
      "Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj']\n",
      "Memory Usage (Total in MB): 63.53\n",
      "The refined posts were written into: 2_posts_level_exclude_hypo_sents_1.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Write in\n",
    "os.chdir(folder_path)\n",
    "filename = '2_' + vars['filename_prefix'] + '_level_exclude_hypo_sents_1.xlsx'\n",
    "\n",
    "# Now write to Excel\n",
    "df_noaa.to_excel(filename, engine='openpyxl', index=False)\n",
    "\n",
    "# df_noaa.to_excel(filename, index=False, urlbox=False)\n",
    "df_summary = summarize_large_dataframe(df_noaa)\n",
    "print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5bf21d6-54d3-4795-8afd-9cc0eec835fd",
   "metadata": {},
   "source": [
    "## Utilize rule-based algorithm to extract oil spill amount"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1add3b76-2365-438b-98e9-b1dfb5f41114",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The column 'Times_larger_plt' does not exist in the DataFrame. Creating and filling it with the specified default value.\n",
      "The column 'Rls_doc' does not exist in the DataFrame. Creating and filling it with the specified default value.\n",
      "The column 'No_release' does not exist in the DataFrame. Creating and filling it with the specified default value.\n",
      "The column 'CA_rb_post_wt_traj' does not exist in the DataFrame. Creating and filling it with the specified default value.\n",
      "The column 'RA_rb_post_wt_traj' does not exist in the DataFrame. Creating and filling it with the specified default value.\n"
     ]
    }
   ],
   "source": [
    "# Check existence of required columns and create them if needed\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['col_if_tl_plt'], False)\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['col_if_rls'], False)\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['col_if_no_rls'], False)\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['col_oatt_rb'], np.nan)\n",
    "df_noaa = ensure_column_exists(df_noaa, vars['col_ra_rb'], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "3565c74a-fc8b-4cb0-ab63-043b8e1f574c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ptl_max_RA was added.\n"
     ]
    }
   ],
   "source": [
    "# remove column \"max_ptl_release_gallons\" in df_noaa\n",
    "if 'ptl_max_RA' in df_noaa.columns:\n",
    "    df_noaa.drop(columns = ['ptl_max_RA'],inplace = True)\n",
    "\n",
    "# migrate potential maximum release amount\n",
    "# For posts: migrate potential maximum release amount\n",
    "if 'max_ptl_release_gallons' not in df_noaa.columns:\n",
    "    os.chdir(folder_path_description);\n",
    "    try:\n",
    "        df_icd = pd.read_csv('incidents_20231207.csv',parse_dates = ['open_date'], encoding='latin1')\n",
    "    except FileNotFoundError:\n",
    "        df_icd = pd.read_excel('incidents_20231207.xlsx',parse_dates = ['open_date'])\n",
    "    df_icd['max_ptl_release_gallons'].fillna(-1,inplace = True)\n",
    "    df_ptl_temp = df_icd[['id','max_ptl_release_gallons']]\n",
    "    df_noaa = pd.merge(df_noaa,\n",
    "                       df_ptl_temp,\n",
    "                      left_on = 'noaa id',\n",
    "                       right_on = 'id',\n",
    "                      how = 'left')\n",
    "    df_noaa.drop(columns = ['id'],inplace=True)\n",
    "    print('The column of potential maximum RA was added.')\n",
    "else:\n",
    "    print('The column of potential maximum RA exists. No need for adding.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "454c6831-7f5f-4b2a-af73-feadb5e12fe0",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████| 21934/21934 [2:06:38<00:00,  2.89it/s]\n"
     ]
    }
   ],
   "source": [
    "# Call rule-based function \n",
    "for k in tqdm(range(df_noaa.shape[0])): #\n",
    "    if pd.isna(df_noaa.loc[k,vars['col_ra_rb']]):\n",
    "        [RA_rb, CA_rb, L_times_larger_plt,if_rls,if_no_rls] = RA_rb_ext(\n",
    "            df_noaa = df_noaa,\n",
    "            k = k,\n",
    "            col_text = vars['col_text_wt_traj'],\n",
    "            no_rls_check = vars['if_do_no_rls_check'],\n",
    "            ptl_large_check = True,\n",
    "            ratio_oa_ptl = 1,\n",
    "            label_UCRA = False);\n",
    "    \n",
    "        df_noaa.loc[k,vars['col_ra_rb']] = RA_rb\n",
    "        df_noaa.loc[k,vars['col_oatt_rb']] = CA_rb\n",
    "        df_noaa.loc[k,vars['col_if_tl_plt']] = L_times_larger_plt\n",
    "        df_noaa.loc[k,vars['col_if_rls']] = if_rls\n",
    "        df_noaa.loc[k,vars['col_if_no_rls']] = if_no_rls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c03ee5fe-dd35-4bc8-91f3-f03a1bcbd252",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of df: 21934 17\n",
      "Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA']\n",
      "Memory Usage (Total in MB): 64.77\n",
      "The refined posts were written into: 3_posts_level_rule_based_candidate_1.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Write in\n",
    "os.chdir(folder_path)\n",
    "filename = '3_' + vars['filename_prefix'] + '_level_rule_based_candidate_1.xlsx'\n",
    "df_noaa.to_excel(filename, engine='openpyxl', index=False)\n",
    "# Generate the summary\n",
    "df_summary = summarize_large_dataframe(df_noaa)\n",
    "print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6386db8-a33d-4583-8bdf-f7317bb49ebe",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# LLM-based RA extraction"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae457c3f-d1e0-4927-9419-f01566e605f6",
   "metadata": {},
   "source": [
    "## Pretreatment: remove OA-lacking sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8e250662-4b61-492a-a87c-4ebad3ce4121",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = '3_' + vars['filename_prefix'] + '_level_rule_based_candidate_1.xlsx'\n",
    "os.chdir(folder_path)\n",
    "df_noaa = pd.read_excel(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "4b9613bc-4788-4dc6-bfab-a79e60e8003d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The column 'post_w_oa' does not exist in the DataFrame. Creating and filling it with the specified default value.\n"
     ]
    }
   ],
   "source": [
    "df_noaa = ensure_column_exists(df_noaa, vars['col_oa_text'], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "64b89a11-1fb8-4ac2-a5c6-772b31350014",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████| 21934/21934 [2:00:07<00:00,  3.04it/s]\n"
     ]
    }
   ],
   "source": [
    "# a) Extract sentences contains oil amount(OA) from description(or post text) \n",
    "for k in tqdm(range(df_noaa.shape[0])):\n",
    "    if pd.isna(df_noaa.loc[k,vars['col_oa_text']]):\n",
    "        str_OA_sents = 'Unknown';\n",
    "        text = adjust_format(df_noaa.loc[k, vars['col_text_wt_traj']])\n",
    "        try:\n",
    "            doc = nlp(text)  # run annotation over a sentence\n",
    "            for sent in doc.sentences:\n",
    "                df_sent = sent2df(sent)\n",
    "                L_text_contain_oa = if_text_contain_oa(df_sent)\n",
    "                if L_text_contain_oa == True:\n",
    "                    str_OA_sent_i = sent.text;\n",
    "                    str_OA_sents = str_OA_sent_i if str_OA_sents == 'Unknown' else str_OA_sents + ' ' + str_OA_sent_i;\n",
    "        except ValueError:\n",
    "            print('ValueError happened! id: ','')\n",
    "        df_noaa.loc[k,vars['col_oa_text']] = str_OA_sents\n",
    "# df_noaa[vars['col_oa_text']].fillna('Unknown',inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e1b5e463-4e34-4057-90b1-db5f407bb251",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of df: 21934 18\n",
      "Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa']\n",
      "Memory Usage (Total in MB): 66.39\n",
      "The refined posts were written into: 4_posts_level_oa_sents_1.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Write in\n",
    "filename = '4_' + vars['filename_prefix'] + '_level_oa_sents_1.xlsx'\n",
    "df_noaa.to_excel(filename, engine='openpyxl', index=False)\n",
    "# Generate the summary\n",
    "df_summary = summarize_large_dataframe(df_noaa)\n",
    "print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f870dbc1-3c94-4391-a824-ace9b3a1d9fc",
   "metadata": {},
   "source": [
    "## Utilize LLM to extract oil spill amount\n",
    "\n",
    "In this section, incident descriptions/posts are processed by a Large Language Model (LLM) to generate LLM-based RA candidates. However, due to the inherent randomness of LLMs, results may vary across incidents. Users can either load our LLM-based RA candidates to ensure repeatability or run the LLM model themselves. We use the GPT-3.5-turbo API for this purpose. To run it, please refer to OpenAI's API instructions (https://platform.openai.com/docs/models) and update the API key (variable name \"API_KEY\") in the \"Call GPT\" section of \"func_set_ra_eval.ipynb\"."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b8fc095-dfea-4945-90c5-a25ab3364103",
   "metadata": {},
   "source": [
    "### Option 1: load the prepared LLM-based RA candidates"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa34d56d-78ab-406b-8c5f-62dc38da9137",
   "metadata": {},
   "source": [
    "Read in RA_GPT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3c265f33-2489-4cd2-8e95-53686e823943",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The RA_GPT data of description has been read\n"
     ]
    }
   ],
   "source": [
    "# read gpt data (reference)\n",
    "reference_path_icd = RootFile_Name +'gpted\\\\'\n",
    "os.chdir(reference_path_icd)\n",
    "df_gpted = pd.read_excel(type_ra_ext+'_gpted.xlsx')\n",
    "print('The RA_GPT data of ' + type_ra_ext + ' has been read')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9c7f4100-45da-4667-8560-4f7a76fe3d50",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of df: 21934 4\n",
      "Column Names: ['NPost id', 'old_npost_id_2parts', 'text_gpt35_post', 'RA_gpt_post_w_oa']\n",
      "Memory Usage (Total in MB): 4.54\n"
     ]
    }
   ],
   "source": [
    "# Generate the summary\n",
    "df_summary = summarize_large_dataframe(df_gpted)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34164c19-3f8d-4070-92df-20e6df317b47",
   "metadata": {},
   "source": [
    "Merge RA_GPT to incident-level data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "be904bca-1282-43a3-a064-18a2cdfa48da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read incidents/posts\n",
    "os.chdir(folder_path)\n",
    "df_noaa = pd.read_excel(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "95b8c983-e260-46be-9789-41c35d8f3753",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_noaa = ensure_column_exists(df_noaa, vars['col_id_of_sample'], np.nan)\n",
    "# df_noaa = ensure_column_exists(df_noaa, vars['col_text_gpt35'], np.nan)\n",
    "# df_noaa = ensure_column_exists(df_noaa, vars['col_ra_gpt_ext'], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "087ba90c-8651-459a-b2ae-fd4b4a8a3de8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['NPost id', 'text_gpt35_post', 'RA_gpt_post_w_oa']\n"
     ]
    }
   ],
   "source": [
    "col_merge_gpt = [vars['col_id_of_sample'], vars['col_text_gpt35'],vars['col_ra_gpt_ext']]\n",
    "print(col_merge_gpt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b742cafd-a86f-41a9-9634-ce0ce9696437",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21934"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# migrate OA_des and RA_GPT\n",
    "df_gpted_for_merge = df_gpted[col_merge_gpt]\n",
    "df_noaa_w_gpt = pd.merge(left = df_noaa,\n",
    "                        right = df_gpted_for_merge,\n",
    "                        how = 'left',\n",
    "                        on = vars['col_id_of_sample']\n",
    "                        )\n",
    "df_noaa_w_gpt.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c3375965-9225-47d0-9236-0003f79214f4",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 post need GPT extraction.\n"
     ]
    }
   ],
   "source": [
    "print(df_noaa_w_gpt[vars['col_ra_gpt_ext']].isna().sum(),  type_ra_ext + ' need GPT extraction.')\n",
    "df_noaa = df_noaa_w_gpt.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74433654-2d43-4b29-ba72-c847c4449e44",
   "metadata": {},
   "source": [
    "### Option 2: Call GPT through API to process the text "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24ac1695-1391-451c-9b24-a3bcdb054ccd",
   "metadata": {},
   "source": [
    "Call GPT API to analyze text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ec56b53a-e7ae-4292-b4aa-7264c86966f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(folder_path)\n",
    "df_noaa = df_noaa_w_gpt.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "3a1c7de1-1cd3-43f6-ae38-e6adb9a6fa8f",
   "metadata": {
    "code_folding": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Solely based on the following information, how much oil has been actually released or spilled? Return \"RA_Release = \".Return \"RA_Release = Unknown\" if the spill amount is not able to be found. How much product has been carried or involved? Return \"RA_total = \". Return \"RA_total = Unknown\" if the carry amount is not able to be found.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# short prompt\n",
    "Qn_short = \\\n",
    "'Solely based on the following information, how much oil has been actually released or spilled? \\\n",
    "Return \\\"RA_Release = \\\".Return \\\"RA_Release = Unknown\\\" if the spill amount is not able to be found.\\\n",
    " How much product has been carried or involved? Return \\\"RA_total = \\\". \\\n",
    "Return \\\"RA_total = Unknown\\\" if the carry amount is not able to be found.\\n'\n",
    "print(Qn_short)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "534157d4-e1ca-47c3-ae17-902866986211",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpt model selection\n",
    "def gpt_model_select(gpt_model_shortname):\n",
    "    if gpt_model_shortname == \"gpt4\":\n",
    "        gpt_model = \"gpt-4-1106-preview\"\n",
    "        gpt_output = 'text_gpt4'\n",
    "    elif gpt_model_shortname == \"gpt3\":\n",
    "        gpt_model = \"gpt-3.5-turbo\"\n",
    "        gpt_output = 'text_gpt35'\n",
    "    return gpt_model,gpt_output\n",
    "\n",
    "gpt_model_shortname = \"gpt3\" #\"gpt4\"\n",
    "gpt_model,gpt_output = gpt_model_select(gpt_model_shortname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "3f9e43c7-615c-454c-976a-685428ea7348",
   "metadata": {},
   "outputs": [],
   "source": [
    "def gpt_cost_estimate(gpt_model,n_prompt_tokens,n_completion_tokens,n_total_tokens):\n",
    "    if gpt_model == \"gpt-4-1106-preview\":\n",
    "        costs = (n_prompt_tokens*0.01 + n_completion_tokens*0.03)/1000\n",
    "    elif gpt_model == \"gpt-3.5-turbo\":\n",
    "        costs = (n_total_tokens*0.00200)/1000\n",
    "    return round(costs,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "9af51a34-9f7b-4b8d-85c6-d5518c7f85ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column text_gpt35_post exists.\n"
     ]
    }
   ],
   "source": [
    "# Version 2P for post GPT extraction\n",
    "n_prompt_tokens = 0\n",
    "n_completion_tokens = 0\n",
    "n_total_tokens = 0\n",
    "n_posts_pcsed = 0\n",
    "col_gpt_output = gpt_output + '_'+ type_ra_ext\n",
    "\n",
    "if col_gpt_output not in df_noaa.columns:\n",
    "    df_noaa[col_gpt_output] = np.nan\n",
    "    print('column',col_gpt_output,\"didn't exist and has been created\")\n",
    "else:\n",
    "    # df_noaa[col_gpt_output].fillna('Unprocessed',inplace = True)\n",
    "    print('column',col_gpt_output,\"exists.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "a3efd1f2-1e79-4cab-a20f-b7d01d3963ee",
   "metadata": {
    "code_folding": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 73037.60it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of posts that have been processed is 0\n",
      "The number of all tokens is 0 The estimated price is $ 0.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Version 4.1 for GPT extraction\n",
    "# adapted from Version 3.1\n",
    "\n",
    "for i in tqdm(range(df_noaa.shape[0])): #\n",
    "    text_input = df_noaa.loc[i, vars['col_oa_text']]\n",
    "    text_output = df_noaa.loc[i, col_gpt_output]\n",
    "    if text_input!='Unknown' and pd.isna(text_output):\n",
    "        des = text_preprocess(df_noaa.loc[i, vars['col_oa_text']]) # col_oa_input, vars['col_text_wt_traj']\n",
    "        QnA = Qn_short + des\n",
    "        chat_completion = query_gpt(QnA, model=gpt_model) # \"gpt-3.5-turbo\"\n",
    "        df_noaa.loc[i, col_gpt_output] = chat_completion.choices[0].message.content.lstrip('\\n')\n",
    "        n_prompt_tokens += chat_completion.usage.prompt_tokens\n",
    "        n_completion_tokens += chat_completion.usage.completion_tokens\n",
    "        n_total_tokens += chat_completion.usage.total_tokens\n",
    "        n_posts_pcsed += 1\n",
    "gpt_api_cost = gpt_cost_estimate(gpt_model,n_prompt_tokens,n_completion_tokens,n_total_tokens)\n",
    "print('The number of posts that have been processed is',n_posts_pcsed);\n",
    "print('The number of all tokens is',n_total_tokens,'The estimated price is $',gpt_api_cost)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "3d6c5c07-6152-4630-9068-e88d16a12c27",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 incidents(posts) need convert text_GPT into RA.\n"
     ]
    }
   ],
   "source": [
    "print(df_noaa[vars['col_ra_gpt_oa_text']].isna().sum(), 'incidents(posts) need convert text_GPT into RA.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3166ac4-67c2-4c8b-af99-9bbbb112a93c",
   "metadata": {},
   "source": [
    "Extract LLM-based candidate RA from gpt-generated text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "40081d43-efbb-4f5e-9f8d-89c1714bd7cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The column 'RA_gpt_post_w_oa' exists in the DataFrame.\n"
     ]
    }
   ],
   "source": [
    "df_noaa = ensure_column_exists(df_noaa, vars['col_ra_gpt_oa_text'], np.nan)\n",
    "# df_noaa = ensure_column_exists(df_noaa, vars['col_oatt_gpt_ext'], np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "4136cfc8-a7cd-4210-932c-33088ed7a1a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The RA_GPT column RA_gpt_post_w_oa exists.\n"
     ]
    }
   ],
   "source": [
    "# if migrated RA_gpt, the column vars['col_ra_gpt'] should exist\n",
    "if vars['col_ra_gpt_oa_text'] not in df_noaa.columns:\n",
    "    df_noaa[vars['col_ra_gpt_oa_text']] = np.nan\n",
    "    print(\"The RA_GPT column %d didn't exist and has been created with NaN.\" %vars['col_ra_gpt_oa_text'])\n",
    "else:\n",
    "    # df_noaa[vars['col_ra_gpt']].fillna(-1,inplace = True)\n",
    "    print(\"The RA_GPT column %s exists.\" %vars['col_ra_gpt_oa_text'])\n",
    "# df_noaa['OAtt_GPT_Ext'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "e68a26d1-e60b-4260-9075-fd902b97550d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 71940.27it/s]\n"
     ]
    }
   ],
   "source": [
    "for i in tqdm(range(df_noaa.shape[0])): #df_noaa.shape[0]\n",
    "    text_gpt_return = df_noaa.loc[i, col_gpt_output]\n",
    "    ra_gpt_return = df_noaa.loc[i, vars['col_ra_gpt_oa_text']]\n",
    "    if not pd.isna(text_gpt_return) and pd.isna(ra_gpt_return):\n",
    "        text_gpt_output = df_noaa.loc[i, col_gpt_output]\n",
    "        # Extract OAs right after getting GPT results\n",
    "        [RA_GPT_i,OAtt_GPT_i] = get_oil_amount_from_short_prompt(\n",
    "            text_gpt_output, \n",
    "            ra_signal_word = 'RA_Release', \n",
    "            oatt_signal_word = 'RA_total'\n",
    "        )\n",
    "        df_noaa.loc[i, vars['col_ra_gpt_oa_text']] = RA_GPT_i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "1a19e12b-e502-4f31-9063-b69196fcadb0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of df: 21934 20\n",
      "Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa', 'text_gpt35_post', 'RA_gpt_post_w_oa']\n",
      "Memory Usage (Total in MB): 68.81\n",
      "The refined posts were written into: 5_posts_level_add_llm_ra_1.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Generate file name and write in\n",
    "os.chdir(folder_path)\n",
    "filename = '5_' + vars['filename_prefix'] + '_level_add_llm_ra_1.xlsx'\n",
    "df_noaa.to_excel(filename, engine='openpyxl', index=False)\n",
    "# Generate the summary\n",
    "df_summary = summarize_large_dataframe(df_noaa)\n",
    "print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1bc9c54-a9e0-4b80-af66-02363652a528",
   "metadata": {},
   "source": [
    "# Check the concensus of rule-based and LLM-based RA candidates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "303cc61a-1252-4238-bb02-53d1e6c327f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'5_posts_level_add_llm_ra_1.xlsx'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = '5_' + vars['filename_prefix'] + '_level_add_llm_ra_1.xlsx'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "3fa259ff-5950-42de-be7c-1cf29c83f011",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(folder_path)\n",
    "df_noaa = pd.read_excel(filename)\n",
    "df_noaa.rename(columns = {'RA_GPT_Ext': vars['col_ra_gpt']},inplace = True)\n",
    "df_noaa[vars['col_ra_gpt_oa_text']].fillna(-1,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "afd16490-7b54-4b52-a272-dae3811da8c3",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 52423.36it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1335 incidents(posts) have different RA from rule-based and gpt tools.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "df_noaa[[vars['col_ra_merge'],vars['col_source_ra_merge']]] = [np.nan,'Unknown']\n",
    "IF_ra_rb_eq_gpt = pd.Series([False] * df_noaa.shape[0]);\n",
    "for k in tqdm(range(df_noaa.shape[0])):\n",
    "    RA_rb = df_noaa.loc[k, vars['col_ra_rb']]\n",
    "    RA_gpt = df_noaa.loc[k, vars['col_ra_gpt_oa_text']]\n",
    "    if not pd.isna(RA_gpt):\n",
    "        if_ra_rb_eq_gpt = RA_Equality_Compare(RA_rb,RA_gpt,ignore_NoRls = False,ignore_Unk = True)\n",
    "        IF_ra_rb_eq_gpt[k] = if_ra_rb_eq_gpt;\n",
    "df_noaa.loc[IF_ra_rb_eq_gpt, vars['col_ra_merge']] = df_noaa.loc[IF_ra_rb_eq_gpt, vars['col_ra_rb']]\n",
    "df_noaa.loc[IF_ra_rb_eq_gpt, vars['col_source_ra_merge']] = 'ra_rb_eq_gpt'\n",
    "print((~IF_ra_rb_eq_gpt).sum(), 'incidents(posts) have different RA from rule-based and gpt tools.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef1c4d4a-37e2-48e1-b8fd-1138aab39838",
   "metadata": {},
   "source": [
    "# Manually verify the inconsistent results to get the document-level candidate RA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "b5e9e97f-fa03-4227-b57b-cd8845eb2da7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "column RA_post_Manual doesn't exist\n",
      "column Source doesn't exist\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['NPost id', 'post title', 'post date', 'post tags', 'post content',\n",
       "       'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts',\n",
       "       'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release',\n",
       "       'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa',\n",
       "       'text_gpt35_post', 'RA_gpt_post_w_oa', 'RA_merge_posts_rb_gpt',\n",
       "       'source_RA_merge_posts'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.chdir(folder_path)\n",
    "\n",
    "# Drop the specified columns from inconsistent_df_noaa because they will be re-created and re-assigned later on\n",
    "columns_to_drop = [vars['col_vrf_ra'], vars['col_ra_source_of_vrfd_ra']]\n",
    "for col in columns_to_drop:\n",
    "    try:\n",
    "        df_noaa = df_noaa.drop(columns=col, axis=1)\n",
    "        print('column', col,'has been dropped.')\n",
    "    except KeyError:\n",
    "        print(\"column\",col,\"doesn't exist\")\n",
    "\n",
    "# Add id column if needed\n",
    "if vars['col_id_of_vrfd_ra'] not in df_noaa.columns:\n",
    "    # Step 2: Drop the last two substrings from the unique ID column and store the result in a new column\n",
    "    df_noaa[vars['col_id_of_vrfd_ra']] = df_noaa[vars['col_old_id']].apply(lambda x: \"_\".join(x.split(\"_\")[:2]))\n",
    "    print(\"column of id (%d) doesn't exist and has been created\" %vars['col_id_of_vrfd_ra'])\n",
    "df_noaa.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6378b24b-44e7-4fe1-a51b-48d808889b7a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['RA_des_Manual', 'Source']\n"
     ]
    }
   ],
   "source": [
    "# create columns for manual verified candidate RA\n",
    "cols_to_add = [vars['col_vrf_ra'], vars['col_ra_source_of_vrfd_ra']]; print(cols_to_add)\n",
    "df_noaa[cols_to_add] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "dbe6bdf1-473d-4888-bd46-71c8d98f407a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "verified records are from post where sheets involved are: ['Sheet1']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of modified rows in df_noaa: 3331\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# add manually verified candidate RAs to df_noaa\n",
    "filename_manual = 'RA_candidate_' + type_ra_ext + '_manual.xlsx'\n",
    "os.chdir(folder_path_manual)\n",
    "df_manual = pd.ExcelFile(filename_manual)\n",
    "\n",
    "# List all sheet names and filter those that start with 'S'\n",
    "sheets_to_read = [sheet_name for sheet_name in df_manual.sheet_names if sheet_name.startswith('S')]\n",
    "\n",
    "# sheetnames_mvrfd_ext = df_manual.sheet_names\n",
    "print(\"verified records are from\",type_ra_ext, \"where sheets involved are:\", sheets_to_read)\n",
    "\n",
    "# Initialize a counter for modified rows\n",
    "modified_rows_count = 0\n",
    "\n",
    "for sheet_name in tqdm(sheets_to_read):\n",
    "    df_vrfd_i = df_manual.parse(sheet_name)\n",
    "\n",
    "    # Remove duplicates in the 'id' column, keeping only the last occurrence\n",
    "    df_vrfd_i = df_vrfd_i.drop_duplicates(subset=vars['col_id_of_vrfd_ra'], keep='last')\n",
    "\n",
    "    df_vrfd_i.dropna(subset=[vars['col_vrf_ra']], inplace=True)\n",
    "    df_vrfd_i.reset_index(drop=True, inplace=True)\n",
    "    df_vrfd_i_merge = df_vrfd_i[[vars['col_id_of_vrfd_ra']] + cols_to_add]\n",
    "\n",
    "    # Merge and update df_noaa\n",
    "    before_merge = df_noaa.copy()\n",
    "    df_noaa = pd.merge(df_noaa,\n",
    "                       df_vrfd_i_merge,\n",
    "                       on=vars['col_id_of_vrfd_ra'],\n",
    "                       how='left',\n",
    "                       suffixes=('', '_updated')\n",
    "                      )\n",
    "\n",
    "    # Update the values and count modified rows\n",
    "    for col in cols_to_add:\n",
    "        df_noaa[col] = df_noaa[col].combine_first(df_noaa[col + '_updated'])\n",
    "        modified_rows_count += df_noaa[col].notna().sum() - before_merge[col].notna().sum()\n",
    "\n",
    "    # Drop the temporary '_updated' columns\n",
    "    df_noaa.drop([col + '_updated' for col in cols_to_add], axis=1, inplace=True)\n",
    "\n",
    "df_manual.close()\n",
    "\n",
    "print(f\"Number of modified rows in df_noaa: {modified_rows_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "30a752e5-5bb6-4a83-b685-c9aa8f4fbc0c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████| 21934/21934 [00:00<00:00, 22647.11it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All done.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# merge verified candidate RA with extracted candidate RA\n",
    "# also note the sources of merged candidate RA\n",
    "for k in tqdm(range(df_noaa.shape[0])):\n",
    "    source_ra_merge = df_noaa.loc[k, vars['col_source_ra_merge']]\n",
    "    ra_vrfd = df_noaa.loc[k, vars['col_vrf_ra']]\n",
    "    if (source_ra_merge == 'Unknown') and (pd.notna(ra_vrfd)):\n",
    "        df_noaa.loc[k, vars['col_ra_merge']] = ra_vrfd\n",
    "        ra_rb = df_noaa.loc[k, vars['col_ra_rb']]\n",
    "        ra_gpt = df_noaa.loc[k, vars['col_ra_gpt_oa_text']]\n",
    "        # add merged RA source\n",
    "        if (ra_vrfd == ra_rb) or (ra_vrfd<0 and ra_rb<0):\n",
    "            df_noaa.loc[k, vars['col_source_ra_merge']] = 'ra_vrfd_rb'\n",
    "        elif (ra_vrfd == ra_gpt) or (ra_vrfd<0 and ra_gpt<0):\n",
    "            df_noaa.loc[k, vars['col_source_ra_merge']] = 'ra_vrfd_gpt'\n",
    "        else:\n",
    "            df_noaa.loc[k, vars['col_source_ra_merge']] = 'ra_vrfd_other'\n",
    "\n",
    "IF_source_is_unk = df_noaa[vars['col_source_ra_merge']] == 'Unknown'\n",
    "if IF_source_is_unk.sum() ==0:\n",
    "    print('All done.')\n",
    "else:\n",
    "    print(IF_source_is_unk.sum(), 'incidents or posts still need to be verified')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "6b7a0a5c-8481-4478-9fb5-bfa3cb48bb7c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All records has met the criteria or been verified. No verification needed.\n"
     ]
    }
   ],
   "source": [
    "# check whether all incidents have extracted/verified candidate RAs\n",
    "# The incidents that need verification (df_unverified) will be copied to clipboard\n",
    "if IF_source_is_unk.sum() != 0:\n",
    "    df_unverified = df_noaa.loc[IF_source_is_unk,:]\n",
    "    df_unverified.to_clipboard(index = False)\n",
    "    print('The unverified incidents (posts) are written to clipboard.')\n",
    "else:\n",
    "    print('All records has met the criteria or been verified. No verification needed.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "7cecb7cc-9667-41b0-bb38-72103896549b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of df: 21934 24\n",
      "Column Names: ['NPost id', 'post title', 'post date', 'post tags', 'post content', 'attachment availability', 'noaa id', 'post id', 'old_npost_id_2parts', 'trj_prd', 'post_wt_traj', 'Times_larger_plt', 'Rls_doc', 'No_release', 'CA_rb_post_wt_traj', 'RA_rb_post_wt_traj', 'ptl_max_RA', 'post_w_oa', 'text_gpt35_post', 'RA_gpt_post_w_oa', 'RA_merge_posts_rb_gpt', 'source_RA_merge_posts', 'RA_post_Manual', 'Source']\n",
      "Memory Usage (Total in MB): 71.3\n",
      "The refined posts were written into: 6_posts_level_ra_candidate_1.xlsx\n"
     ]
    }
   ],
   "source": [
    "# Generate file name and write in\n",
    "os.chdir(folder_path)\n",
    "filename = '6_' + vars['filename_prefix'] + '_level_ra_candidate_1.xlsx'\n",
    "df_noaa.to_excel(filename, engine='openpyxl', index=False)\n",
    "# Generate the summary\n",
    "df_summary = summarize_large_dataframe(df_noaa)\n",
    "print('The refined ' + vars['filename_prefix'] + ' were written into:',filename)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}