{
"cells": [
{
"cell_type": "markdown",
"id": "ae8311ba-bb71-4ae9-bca0-7094eaea4a46",
"metadata": {},
"source": [
"# Define input"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7c44fccf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c4aefe3c-9cca-4dab-a8de-9aefc4e1e989",
"metadata": {},
"outputs": [],
"source": [
"def define_variables(type_ra_ext):\n",
"    \"\"\"\n",
"    Define variables based on the type of incident description or post.\n",
"\n",
"    Parameters:\n",
"    type_ra_ext (str): The type of data extraction, either 'description' or 'post'.\n",
"\n",
"    Returns:\n",
"    dict: A dictionary containing the relevant variables.\n",
"    \"\"\"\n",
"\n",
"    variables = {}\n",
"\n",
"    if type_ra_ext == 'description':\n",
"        variables = {\n",
"            'filename_prefix': 'incident',\n",
"            'col_text': 'description',\n",
"            'col_date': 'open_date',\n",
"            # 'data_source': 'RAEV',\n",
"            'col_ra_gpt_from_sample': 'RA_icd_GPT_OA_des',\n",
"            'col_ra_gpt': 'RA_icd_gpt',\n",
"            'col_ra_merge': 'RA_merge_icd_rb_gpt',\n",
"            'col_source_ra_merge': 'source_RA_merge_icd',\n",
"            'sheetnames_mvrfd_ext': ['S9', 'S10', 'S11', 'S12', 'S13', 'S14'],\n",
"            'col_refer': 'id',\n",
"            'col_ra_gpt_oa_text': 'RA_gpt_des_w_oa'\n",
"        }\n",
"\n",
"    elif type_ra_ext == 'post':\n",
"        variables = {\n",
"            'filename_prefix': 'posts',\n",
"            'col_old_id': 'NPost_id',\n",
"            'col_text': 'post content',\n",
"            'col_date': 'post date',\n",
"            # 'data_source': 'Posts',\n",
"            'col_ra_gpt_from_sample': 'RA_post_GPT_OA_des',\n",
"            'col_ra_gpt': 'RA_post_gpt',\n",
"            'col_ra_merge': 'RA_merge_posts_rb_gpt',\n",
"            'col_source_ra_merge': 'source_RA_merge_posts',\n",
"            'sheetnames_mvrfd_ext': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11'],\n",
"            'col_refer': 'NPost id',\n",
"            'col_ra_gpt_oa_text': 'RA_gpt_post_w_oa'\n",
"        }\n",
"\n",
"    variables['col_threat'] = 'threat'\n",
"    variables['col_sub'] = 'commodity'  # alternative: 'risk_sub'\n",
"    # variables['col_des'] = 'description'\n",
"\n",
"    # column names for text without trajectory predictions etc.\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_text_wt_traj'] = 'des_wt_traj'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_text_wt_traj'] = 'post_wt_traj'\n",
"    variables['if_text_has_hypothesis'] = 'trj_prd'\n",
"\n",
"    # column names for OA-related text\n",
"    variables['col_oa_text_from_sample'] = 'OA_des'\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_oa_text'] = 'des_w_oa'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_oa_text'] = 'post_w_oa'\n",
"\n",
"    # column names for rule-based RA extraction\n",
"    # variables['col_real_input'] = variables['col_text_wt_traj']\n",
"    variables['col_ra_rb'] = 'RA_rb_' + variables['col_text_wt_traj']\n",
"    variables['col_oatt_rb'] = 'CA_rb_' + variables['col_text_wt_traj']\n",
"    variables['col_if_tl_plt'] = 'Times_larger_plt'\n",
"    variables['col_if_rls'] = 'Rls_doc'\n",
"    variables['col_if_no_rls'] = 'No_release'\n",
"    if type_ra_ext == 'description':\n",
"        variables['if_do_no_rls_check'] = True\n",
"    elif type_ra_ext == 'post':\n",
"        variables['if_do_no_rls_check'] = False\n",
"\n",
"    # column names for GPT extraction results\n",
"    variables['col_oatt_gpt_ext'] = 'OAtt_GPT_Ext'\n",
"    variables['col_oarv_gpt_ext'] = 'OArv_GPT_Ext'\n",
"    variables['col_if_oa_irlvt_gpt'] = 'OA_irlvt_GPT'\n",
"    variables['col_if_oa_unit_lost'] = 'OA_unit_lost'\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_id_of_sample'] = 'id'\n",
"        variables['col_text_gpt35'] = 'text_gpt35_description'\n",
"        variables['col_ra_gpt_ext'] = 'RA_gpt_des_w_oa'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_id_of_sample'] = 'NPost id'\n",
"        variables['col_text_gpt35'] = 'text_gpt35_post'\n",
"        variables['col_ra_gpt_ext'] = 'RA_gpt_post_w_oa'\n",
"\n",
"    # columns for merging rule-based and GPT results\n",
"    if type_ra_ext == 'description':\n",
"        variables['col_vrf_ra'] = 'RA_des_Manual'\n",
"        variables['col_id_of_vrfd_ra'] = 'id'\n",
"    elif type_ra_ext == 'post':\n",
"        variables['col_vrf_ra'] = 'RA_post_Manual'\n",
"        variables['col_id_of_vrfd_ra'] = 'old_npost_id_2parts'\n",
"    variables['col_ra_source_of_vrfd_ra'] = 'Source'\n",
"\n",
"    return variables"
]
},
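{
"cell_type": "markdown",
"id": "1f2e3d4c-0001-4a00-8a00-aa0000000001",
"metadata": {},
"source": [
"A quick usage sketch (illustrative, not part of the original pipeline): `define_variables` returns a plain dict, so downstream cells unpack only the keys they need. The values printed below follow directly from the function above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f2e3d4c-0002-4a00-8a00-aa0000000002",
"metadata": {},
"outputs": [],
"source": [
"# Compare the two configurations returned by define_variables\n",
"vars_des = define_variables('description')\n",
"vars_post = define_variables('post')\n",
"\n",
"print(vars_des['col_text'], '->', vars_des['col_ra_rb'])    # description -> RA_rb_des_wt_traj\n",
"print(vars_post['col_text'], '->', vars_post['col_ra_rb'])  # post content -> RA_rb_post_wt_traj\n",
"# the no-release check is only enabled for incident descriptions\n",
"assert vars_des['if_do_no_rls_check'] and not vars_post['if_do_no_rls_check']"
]
},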
{
"cell_type": "markdown",
"id": "dde3c61d-296b-460b-bc02-657bccbc9a53",
"metadata": {},
"source": [
"## Format adjustment"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b5b445f-d9e1-4caf-bc05-d7f0a4542f76",
"metadata": {},
"outputs": [],
"source": [
"# Adjust the formatting without changing the text content.\n",
"# Aim: formatting-only changes, so ideally they need not be mentioned in the paper.\n",
"def adjust_format(text):\n",
"    try:\n",
"        text = str(text)\n",
"        text = text.replace('_x000B_', ' ')  # Excel vertical-tab artifact\n",
"        text = text.replace('•', '')\n",
"        text = text.replace(',OOO', '000')   # OCR-style 'letter O' thousands\n",
"        # ensure spaces around hyphens are consistent\n",
"        if text.count('-') != text.count(' - '):\n",
"            text = text.replace('-', ' - ')\n",
"        # collapse repeated spaces\n",
"        while '  ' in text:\n",
"            text = text.replace('  ', ' ')\n",
"    except Exception:\n",
"        pass\n",
"    return text"
]
},
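{
"cell_type": "markdown",
"id": "2a3b4c5d-0001-4b00-8b00-bb0000000001",
"metadata": {},
"source": [
"A quick before/after on a made-up string (illustrative only): the `,OOO` artifact is restored to `000`, and the lone hyphen gains consistent surrounding spaces, exactly as the replacements above dictate."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a3b4c5d-0002-4b00-8b00-bb0000000002",
"metadata": {},
"outputs": [],
"source": [
"raw = 'Spilled 5,OOO gallons of oil-based mud'\n",
"print(adjust_format(raw))\n",
"assert adjust_format(raw) == 'Spilled 5000 gallons of oil - based mud'"
]
},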
{
"cell_type": "markdown",
"id": "6afc649f-113e-475f-9e3f-b9fc6cfce901",
"metadata": {},
"source": [
"# Extract risk substance name"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "38a62e39-1725-44ba-92dd-f0be142b1ff6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"kws_oil = ['condensate', 'oil', 'diesel', 'heavy fuel', 'lube', 'lubrica',\n",
"           'bunker', 'hydraulic', 'petro', 'gasoline', 'JP',\n",
"           'Jet', 'crude', 'petroleum', 'shale', '#2', 'No. 2', '#4',\n",
"           'number 2', '#6', 'No. 6', '# 6', 'number 6', 'IFO', 'HFO']"
]
},
{
"cell_type": "markdown",
"id": "4198b8ba",
"metadata": {},
"source": [
"# Rule-based RA extraction"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "edb3beea-7cbc-4404-9f84-1f62ce1e6805",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-03-24 19:21:15 INFO: Loading these models for language: en (English):\n",
"========================\n",
"| Processor | Package |\n",
"------------------------\n",
"| tokenize | combined |\n",
"| pos | combined |\n",
"| lemma | combined |\n",
"========================\n",
"\n",
"2024-03-24 19:21:15 INFO: Use device: cpu\n",
"2024-03-24 19:21:15 INFO: Loading: tokenize\n",
"2024-03-24 19:21:15 INFO: Loading: pos\n",
"2024-03-24 19:21:16 INFO: Loading: lemma\n",
"2024-03-24 19:21:16 INFO: Done loading processors!\n"
]
}
],
"source": [
"import stanza\n",
"nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "7fde465a",
"metadata": {},
"outputs": [],
"source": [
"release_words = ['spill', 'release', 'pollution', 'leak', 'damage', 'discharge',\n",
"                 'sheen', 'lose', 'loss', 'spillage', 'pollute', 'seep', 'overflow', 'dump']\n",
"release_phrase = ['in water', 'in the water']\n",
"neg_words = ['no', 'not', 'nothing']\n",
"acccause_words = ['aground', 'ground', 'adrift', 'drift', 'collision', 'allision', 'collide', 'hurricane', 'sink']\n",
"# 'grounded', 'ran aground'\n",
"unit_words = ['gallon', 'barrel', 'barrels', 'bbl', 'bbls', 'gal', 'liter', 'cubicfeet', 'cubicyard']  # 'pound',\n",
"\n",
"amount_words = ['million', 'k']\n",
"avoid_words = ['onboard', 'carry', 'load', 'capacity', 'contain', 'abroad', 'aboard', 'potential']\n",
"fuzzy_carry_words = ['have', 'hold']\n",
"exclude_fzy_carry_words = ['have been', 'has been']\n",
"accsit_words = ['sink', 'fire', 'explode', 'burn', 'explosion', 'capsize']\n",
"accmin_words = ['adrift']  # 're-floated', 'refloat', 'floated', 'afloat', 'hard aground'\n",
"accmin_phrase = ['lost propulsion', 'without propulsion', 'lost power']\n",
"separator_word = [',', '\\\(', '\\\)']\n",
"# recovery keywords\n",
"recover_word = ['recovered']\n",
"transfer_word = ['transfer', 'collect']  # 'remove'"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "a1c50394-18fd-45f0-92ea-428328e5cd75",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Check whether each element of a string Series contains any of the given keywords.\n",
"# Context: county names in NRC are inconsistent, so county and city names from NOAA\n",
"# are used as keywords and matched against the NRC location names.\n",
"# locationName_NRC must be a pandas Series; the return value is a boolean Series.\n",
"def keywords_searching(locationName_NRC, keywords_set, case_sensitivity=False):\n",
"    keywords_pattern = '|'.join(keywords_set)\n",
"    L_kw = locationName_NRC.str.contains(keywords_pattern, case=case_sensitivity, regex=True)\n",
"    return L_kw"
]
},
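{
"cell_type": "markdown",
"id": "3c4d5e6f-0001-4c00-8c00-cc0000000001",
"metadata": {},
"source": [
"A minimal demonstration of `keywords_searching` on a toy Series (made-up location strings, not NRC data); the boolean mask it returns can filter a DataFrame directly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c4d5e6f-0002-4c00-8c00-cc0000000002",
"metadata": {},
"outputs": [],
"source": [
"locations = pd.Series(['Harris County TX', 'off GALVESTON', 'unknown'])\n",
"mask = keywords_searching(locations, ['Harris', 'Galveston'])\n",
"print(mask.tolist())  # [True, True, False] -- matching is case-insensitive by default"
]
},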
{
"cell_type": "markdown",
"id": "04892220-962f-4215-815d-3ee38045576b",
"metadata": {},
"source": [
"## Text pretreatment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a0e3c5ed-fd43-424b-a0d9-3386e3e03256",
"metadata": {},
"outputs": [],
"source": [
"def text_preprocess(text):\n",
"    \"\"\"\n",
"    Preprocess the input text by replacing specific substrings.\n",
"\n",
"    This function takes a text input and performs a series of string replacements\n",
"    to standardize certain terms and correct common formatting issues.\n",
"\n",
"    Parameters:\n",
"    text (str): The text to be processed.\n",
"\n",
"    Returns:\n",
"    str: The processed text with replacements applied.\n",
"    \"\"\"\n",
"\n",
"    try:\n",
"        # Ensure the input is in string format\n",
"        text = str(text)\n",
"\n",
"        # Replacements for various terms and formatting issues\n",
"        text = text.replace('_x000B_', ' ')\n",
"        text = text.replace('cubic feet', 'cubicfeet')\n",
"        text = text.replace('•', '')\n",
"        text = text.replace('approx.', 'approximately')\n",
"        text = text.replace('APPROX.', 'approximately')\n",
"        text = text.replace('gal. of', 'gal of')\n",
"        text = text.replace('bbl. of', 'bbl of')\n",
"        text = text.replace('bbls. of', 'bbls of')\n",
"        text = text.replace('est.', 'estimate')\n",
"        text = text.replace('gals.', 'gallons')\n",
"        text = text.replace(',OOO', '000')\n",
"\n",
"        # Ensure spaces around hyphens are consistent\n",
"        if text.count('-') != text.count(' - '):\n",
"            text = text.replace('-', ' - ')\n",
"\n",
"        # Replace double spaces with single spaces\n",
"        while '  ' in text:\n",
"            text = text.replace('  ', ' ')\n",
"    except Exception:\n",
"        # If an error occurs, return the input unchanged\n",
"        pass\n",
"\n",
"    return text"
]
},
{
"cell_type": "markdown",
"id": "eaa69ab6-30d9-4c65-b6d6-3595688283a2",
"metadata": {},
"source": [
"## Exclude hypothetical sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f23736a",
"metadata": {},
"outputs": [],
"source": [
"pdc_object = ['NOAA', 'SSC', 'USCG']\n",
"pdc_behavior = ['request', 'ask']\n",
"pdc_task = ['trajectory', 'fate']\n",
"\n",
"def trajectory_prediction(df_sent):\n",
"    L_obj = df_sent['lemma'].str.contains('|'.join(pdc_object), case=True).any()\n",
"    L_bhv = df_sent['lemma'].str.contains('|'.join(pdc_behavior), case=False).any()\n",
"    L_task = df_sent['lemma'].str.contains('|'.join(pdc_task), case=False).any()\n",
"\n",
"    # a trajectory-prediction sentence mentions the task plus either the agency or the request verb\n",
"    L_pdc_sent = bool(L_task and (L_obj or L_bhv))\n",
"    return L_pdc_sent"
]
},
{
"cell_type": "markdown",
"id": "cbaf3fb8-fb0f-49fc-a095-9bc95ba40b67",
"metadata": {},
"source": [
"## Remove OA-lacking sentences"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "35cd41fb-f9a9-4d7b-a709-51638185a4ed",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def if_text_contain_oa(df_sent):\n",
"    # RA search: remove commas from lemmas so numbers like 100,000 parse\n",
"    df_sent['lemma'].replace(',', '', regex=True, inplace=True)\n",
"    # Check if there are any numeric tokens in this sentence\n",
"    L_num_sent = (df_sent['upos'] == 'NUM').any()\n",
"    if L_num_sent:\n",
"        list_OA = unit_forward_search(df_sent, unit_words)\n",
"        # Keep the sentence only if it contains an oil amount\n",
"        return len(list_OA) > 0\n",
"    # no numbers at all, so the sentence cannot contain an oil amount\n",
"    return False"
]
},
{
"cell_type": "markdown",
"id": "184d8917-49cf-440d-925e-ebee6f53cc22",
"metadata": {},
"source": [
"## Rule-based NLP"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "96924d08",
"metadata": {},
"outputs": [],
"source": [
"# 20230210 Update: add sequence constraint\n",
"def shortest_distance(L1, L2, sequence='two-way'):\n",
"    L1 = [L1] if isinstance(L1, int) else L1\n",
"    L2 = [L2] if isinstance(L2, int) else L2\n",
"    L1.sort()\n",
"    L2.sort()\n",
"    dist = float(\"inf\")\n",
"    i = j = 0\n",
"    while i < len(L1) and j < len(L2):\n",
"        dis_ij = L2[j] - L1[i]\n",
"        if sequence == 'two-way':\n",
"            dist = min(dist, abs(dis_ij))\n",
"        elif sequence == 'forward':\n",
"            # Here element i from L1 must come before element j from L2.\n",
"            # Useful when the word order is fixed, e.g. amount + unit.\n",
"            if dis_ij > 0:\n",
"                dist = min(dist, dis_ij)\n",
"        if L1[i] < L2[j]:\n",
"            i += 1\n",
"        else:\n",
"            j += 1\n",
"    return dist"
]
},
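{
"cell_type": "markdown",
"id": "4d5e6f7a-0001-4d00-8d00-dd0000000001",
"metadata": {},
"source": [
"A concrete check of the `sequence` argument on toy word indexes: in `'forward'` mode only pairs where the first-list index precedes the second-list index count, so a reversed pair yields `inf`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d5e6f7a-0002-4d00-8d00-dd0000000002",
"metadata": {},
"outputs": [],
"source": [
"print(shortest_distance([2, 9], [5, 14]))            # 3: |5 - 2|\n",
"print(shortest_distance(10, 7))                      # 3 in two-way mode\n",
"print(shortest_distance(10, 7, sequence='forward'))  # inf: index 10 is not before 7"
]
},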
{
"cell_type": "code",
"execution_count": 22,
"id": "bac661ce",
"metadata": {},
"outputs": [],
"source": [
"# Find the nearest pair between two groups of word indexes and check separators.\n",
"# Inputs: (1) df_sent, (2-3) the index lists of the two word groups,\n",
"# (4) separator words (default ','), (5) the effective distance between two words.\n",
"# The function checks:\n",
"# a. whether the distance between the two words is within the effective distance,\n",
"# b. whether any defined separator lies between the two words.\n",
"# If both a and b hold for some pair, True is returned.\n",
"def Separator_check(df_sent, release_ind, neg_ind, separator_word=',', effective_dist=10):\n",
"    dis_ij = 999\n",
"    l_wsparator = []\n",
"    l_wdistance = []\n",
"    L_fseparate = False\n",
"    # Default: the distance between rls and carry is smaller than the effective distance\n",
"    if (release_ind != []) & (neg_ind != []):\n",
"        for i in range(len(release_ind)):\n",
"            for j in range(len(neg_ind)):\n",
"                release_i = release_ind[i]\n",
"                neg_i = neg_ind[j]\n",
"                if dis_ij > abs(release_i - neg_i):\n",
"                    dis_ij = abs(release_i - neg_i)\n",
"                    release_min = release_i\n",
"                    neg_min = neg_i\n",
"\n",
"                if abs(release_i - neg_i) <= effective_dist:\n",
"                    l_wdistance.append(abs(release_i - neg_i) <= effective_dist)\n",
"                    # check whether a separator sits between the nearest neg and release words\n",
"                    df_mid_neg_rls = df_sent.loc[min(neg_i, release_i)+1:max(neg_i, release_i)-1]\n",
"                    separator_word_pattern = '|'.join(separator_word)\n",
"                    l_wsparator.append((df_mid_neg_rls['lemma'].str.contains(separator_word_pattern, case=False)).any())\n",
"\n",
"        L_distance = any(l_wdistance)\n",
"        L_fseparate = any(l_wsparator)\n",
"    # Temporarily ignore the distance restriction (l_wdistance); update 09/24, main program not rerun\n",
"    return L_fseparate  # [L_distance, L_fseparate]"
]
},
{
"cell_type": "markdown",
"id": "02e08f0f-6d6e-44a9-9fe2-c0ded662340f",
"metadata": {},
"source": [
"### Release check"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d5a58c44",
"metadata": {},
"outputs": [],
"source": [
"def release_check(df_sent):\n",
"    pattern_release = '|'.join(release_words)\n",
"    ll_release = df_sent['lemma'].str.fullmatch(pattern_release, case=False)\n",
"    L_rls_word = ll_release.any()\n",
"    return L_rls_word"
]
},
{
"cell_type": "markdown",
"id": "ea2fff75-f0c8-43b2-ab22-12a3add59e12",
"metadata": {},
"source": [
"### Unrelease check"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c3567249",
"metadata": {},
"outputs": [],
"source": [
"# 20230309 Update: use \"shortest_distance\" to replace the previous code;\n",
"# the new logic requires the neg word to come before the rls word.\n",
"\n",
"# Update: detect separator characters between the release word and the negative word\n",
"def Unrelease_check(df_sent, sent, separator_Rstrct=True):\n",
"    L_unrelease = False\n",
"\n",
"    pattern_neg = '|'.join(neg_words)\n",
"    # use text rather than lemma, to avoid noise such as 'No.' lemmatizing to 'no'\n",
"    df_sent['neg_words'] = df_sent['text'].str.fullmatch(pattern_neg, case=False)\n",
"    list_neg_idx = df_sent.index[df_sent['neg_words'] == True].tolist()\n",
"\n",
"    pattern_release = '|'.join(release_words)\n",
"    df_sent['release_words'] = df_sent['lemma'].str.fullmatch(pattern_release, case=False)\n",
"    list_rls_idx = df_sent.index[df_sent['release_words'] == True].tolist()\n",
"\n",
"    # calculate the shortest distance between neg words and release words\n",
"    if (list_rls_idx != []) & (list_neg_idx != []):\n",
"        dist_2min = shortest_distance(list_neg_idx, list_rls_idx, sequence='forward')\n",
"        if dist_2min <= 5:\n",
"            L_unrelease = True\n",
"\n",
"    if L_unrelease == False:\n",
"        ll_release = False\n",
"        for i in release_phrase:  # 'in water', 'in the water'\n",
"            li_release = i in sent.text\n",
"            if li_release == True:\n",
"                ll_release = True\n",
"        ll_neg = df_sent['text'].str.fullmatch(pattern_neg, case=False).any()\n",
"        if (ll_release == True) & (ll_neg == True):\n",
"            L_unrelease = True\n",
"\n",
"    if separator_Rstrct == True:\n",
"        L_fseparate = Separator_check(df_sent, list_rls_idx, list_neg_idx)\n",
"    else:\n",
"        L_fseparate = False\n",
"\n",
"    return [L_unrelease, L_fseparate]"
]
},
{
"cell_type": "markdown",
"id": "487184e5-dd95-4c9c-90de-3a2bd997d7cd",
"metadata": {},
"source": [
"### Carry"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "7227c5ae",
"metadata": {},
"outputs": [],
"source": [
"# Check whether a sentence contains a carry word and does not contain a release word.\n",
"# After testing, this version removed the release checks here, as they were redundant.\n",
"\n",
"def Carry_sub_V2(df_sent, sent):\n",
"    L_carry = False\n",
"    L_exclude_Fzy_carry = False\n",
"    pattern_carry = '|'.join(avoid_words)\n",
"    ll_carry_1 = df_sent['lemma'].str.contains(pattern_carry, case=False)\n",
"    ll_carry = (ll_carry_1.any()) | ('on board' in sent.text)\n",
"    pattern_release = '|'.join(release_words)\n",
"    ll_release = df_sent['lemma'].str.contains(pattern_release, case=False).any()\n",
"    if (ll_carry == True) & (ll_release == False):\n",
"        L_carry = True\n",
"\n",
"    pattern_fcarry = '|'.join(fuzzy_carry_words)\n",
"    ll_fcarry = df_sent['lemma'].str.fullmatch(pattern_fcarry, case=False)\n",
"    L_fuzzy_carry = ll_fcarry.any()\n",
"\n",
"    if any(word in sent.text for word in exclude_fzy_carry_words):\n",
"        L_exclude_Fzy_carry = True\n",
"\n",
"    # For fuzzy carry words like 'have', extra constraints are needed;\n",
"    # currently the release amount and units serve as the constraints.\n",
"    elif (L_fuzzy_carry == True) & (L_exclude_Fzy_carry == False):\n",
"        ll_num = (df_sent['upos'] == 'NUM')\n",
"\n",
"        pattern_units = '|'.join(unit_words)\n",
"        ll_units = df_sent['lemma'].str.fullmatch(pattern_units, case=False)\n",
"        if (ll_num.any() == True) & (ll_units.any() == True):\n",
"            unit_list = ll_units.index[ll_units == True].tolist()\n",
"            carry_list = ll_fcarry.index[ll_fcarry == True].tolist()\n",
"            shortest_dis_2list = shortest_distance(unit_list, carry_list)\n",
"            if shortest_dis_2list <= 3:\n",
"                L_carry = True\n",
"    return L_carry"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "411d4a78",
"metadata": {},
"outputs": [],
"source": [
"def sent2df(sent):\n",
"    # Convert a stanza Sentence into a DataFrame with one row per word\n",
"    df_sent = pd.DataFrame()\n",
"    for word in sent.words:\n",
"        temp = pd.DataFrame.from_dict(word.to_dict(), orient='index')\n",
"        if 'head' in temp.index:\n",
"            temp.rename(index={\"head\": \"head_id\"}, inplace=True)\n",
"            temp.loc['head'] = sent.words[word.head-1].text\n",
"        df_sent = pd.concat([df_sent, temp], axis=1, ignore_index=False)\n",
"    df_sent = df_sent.T.reset_index(drop=True)\n",
"    return df_sent"
]
},
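{
"cell_type": "markdown",
"id": "5e6f7a8b-0001-4e00-8e00-ee0000000001",
"metadata": {},
"source": [
"A small illustration of `sent2df` (runs only after the stanza pipeline cell above): each token becomes a row whose `text`, `lemma`, and `upos` fields the rule-based checks query."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e6f7a8b-0002-4e00-8e00-ee0000000002",
"metadata": {},
"outputs": [],
"source": [
"doc = nlp('About 500 gallons of diesel spilled.')\n",
"df_demo = sent2df(doc.sentences[0])\n",
"print(df_demo[['text', 'lemma', 'upos']])  # '500' is tagged NUM; 'spilled' lemmatizes to 'spill'"
]
},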
{
"cell_type": "code",
"execution_count": 26,
"id": "cf928f21",
"metadata": {},
"outputs": [],
"source": [
"def Isnumber(x):\n",
"    # True if x can be parsed as a float (covers ints and decimals)\n",
"    try:\n",
"        float(x)\n",
"        return True\n",
"    except (TypeError, ValueError):\n",
"        return False"
]
},
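{
"cell_type": "markdown",
"id": "6f7a8b9c-0001-4f00-8f00-ff0000000001",
"metadata": {},
"source": [
"`Isnumber` complements `str.isnumeric`, which rejects decimals; note that comma-separated figures still need the comma stripped first:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f7a8b9c-0002-4f00-8f00-ff0000000002",
"metadata": {},
"outputs": [],
"source": [
"print(Isnumber('42'), Isnumber('3.5'), Isnumber('1,000'), Isnumber('bbl'))\n",
"# True True False False"
]
},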
{
"cell_type": "markdown",
"id": "c557e384-902f-4f94-b113-0813dcc24486",
"metadata": {},
"source": [
"### Oil amount extraction"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "90c47eef",
"metadata": {},
"outputs": [],
"source": [
"# Function: extract volume amounts from one sentence (version 3) 20230309\n",
"# (1) Fix: numbers separated by commas (such as 100,000) could not be lemmatized and treated as numbers\n",
"# Version 2 updates: see unit_forward_search_V2\n",
"\n",
"# load the unit-conversion tables\n",
"# NOTE: RootFile_Name (the folder holding Unit_Converter_NOAA.xlsx) is assumed to be defined upstream\n",
"os.chdir(RootFile_Name)\n",
"unit_converter = pd.read_excel('Unit_Converter_NOAA.xlsx', sheet_name='Convertable', index_col=[0])\n",
"quantifier_converter = pd.read_excel('Unit_Converter_NOAA.xlsx', sheet_name='quantifier', index_col=[0])\n",
"\n",
"from word2number import w2n\n",
"\n",
"def unit_forward_search(df_sent, unit_kw=unit_words):\n",
"    pattern_amount = '|'.join(unit_kw)\n",
"    L_unit = df_sent['lemma'].str.fullmatch(pattern_amount, case=False)\n",
"    df_unit = df_sent.loc[L_unit, ['lemma']].reset_index(drop=True)\n",
"    amount_sent = []\n",
"    for i in range(L_unit.sum()):\n",
"        coeff2gal = unit_converter.loc[df_unit.loc[i, 'lemma'].upper(), 'To_gallon']\n",
"        a2num_tt = 1\n",
"        num_tt = 1\n",
"        unit_index = df_sent[L_unit].index[i]\n",
"        # look at the two tokens immediately before the unit word\n",
"        df_unitforward = df_sent.loc[unit_index-2:unit_index-1].reset_index(drop=True)\n",
"        # create a copy for df.str.contains/fullmatch etc.\n",
"        df_unitforward_c = df_unitforward.copy()\n",
"\n",
"        # stanza sometimes cannot convert numbers with commas into \"numbers\",\n",
"        # and str.isnumeric does not treat numbers with commas as \"numbers\",\n",
"        # so strip the commas and convert to numbers manually\n",
"        L_comma_num = (df_unitforward['upos'] == 'NUM') & (df_unitforward['lemma'].str.contains(','))\n",
"        df_unitforward.loc[L_comma_num, 'lemma'] = df_unitforward.loc[L_comma_num, 'lemma'].str.replace(',', '').astype(float)\n",
"\n",
"        # recognize and mark all types of numbers in the 'lemma' column\n",
"        L_num = (df_unitforward['upos'] == 'NUM') & (df_unitforward['lemma'].str.isnumeric())\n",
"        # str.isnumeric cannot detect decimals;\n",
"        # convert the strings in the 'lemma' column to floats (if applicable)\n",
"        for j in range(df_unitforward.shape[0]):\n",
"            L_dec = Isnumber(df_unitforward.loc[j, 'lemma'])\n",
"            if L_dec == True:\n",
"                df_unitforward.loc[j, 'lemma'] = float(df_unitforward.loc[j, 'lemma'])\n",
"                L_num[j] = True\n",
"\n",
"        # here the alphabetic number must contain \"K\" (for 1000)\n",
"        L_anum = (df_unitforward['upos'] == 'NUM') & (df_unitforward_c['lemma'].str.isalpha())\n",
"\n",
"        if L_num.any() == True:\n",
"            df_num = df_unitforward[L_num].reset_index(drop=True)\n",
"            df_num['lemma'].replace(',', '', regex=True, inplace=True)  # remove commas from words\n",
"            for j in range(df_num.shape[0]):\n",
"                num_i = df_num.loc[j, 'lemma']\n",
"                num_tt = num_tt * float(num_i)\n",
"\n",
"        if L_anum.any() == True:\n",
"            df_anum = df_unitforward[L_anum].reset_index(drop=True)\n",
"            for j in range(df_anum.shape[0]):\n",
"                anum_i = df_anum.loc[j, 'lemma']\n",
"                try:\n",
"                    a2num_i = w2n.word_to_num(anum_i)\n",
"                except ValueError:\n",
"                    # fall back to the project's quantifier table (e.g. 'k' -> 1000)\n",
"                    anum_i = df_anum.loc[j, 'lemma'].lower()\n",
"                    try:\n",
"                        a2num_i = quantifier_converter.loc[anum_i, 'To_number']\n",
"                    except KeyError:\n",
"                        a2num_i = -99999999  # abnormal value to flag an error\n",
"                except Exception:\n",
"                    a2num_i = -99999999  # abnormal value to flag an error\n",
"                a2num_tt = a2num_tt * a2num_i\n",
"\n",
"        amount_sent.append(float(num_tt * a2num_tt * coeff2gal))\n",
"    return amount_sent"
]
},
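{
"cell_type": "markdown",
"id": "7a8b9c0d-0001-4a10-8a10-aa1000000001",
"metadata": {},
"source": [
"An end-to-end sketch of the amount extraction on one made-up sentence. It assumes the stanza pipeline `nlp` and the unit tables above are loaded, and that `Unit_Converter_NOAA.xlsx` maps BBL to 42 gallons (the standard barrel); the exact output depends on that spreadsheet."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a8b9c0d-0002-4a10-8a10-aa1000000002",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: '25 bbl' should come out as 25 * 42 = 1050 gallons\n",
"doc = nlp('The vessel discharged approximately 25 bbl of diesel.')\n",
"for sent in doc.sentences:\n",
"    df_sent = sent2df(sent)\n",
"    print(unit_forward_search(df_sent, unit_words))  # expected: [1050.0]"
]
},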
{
"cell_type": "code",
"execution_count": 31,
"id": "a0b0e83f",
"metadata": {},
"outputs": [],
"source": [
"def carry_index(df_sent, sent, avoid_words):\n",
"    # work on a copy so the caller's keyword list is not mutated\n",
"    avoid_words = list(avoid_words)\n",
"    if 'on board' in sent.text:\n",
"        avoid_words.append('board')\n",
"    pattern_carry = '|'.join(avoid_words)\n",
"    ll_carry = df_sent['lemma'].str.contains(pattern_carry, case=False)\n",
"    carry_ind = ll_carry.index[ll_carry == True].tolist()\n",
"    return carry_ind"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "16199127",
"metadata": {},
"outputs": [],
"source": [
"def kw_plt(df_sent):\n",
"    # flag sentences that mention a 'potential' (maximum possible) amount\n",
"    L_potential = df_sent['lemma'].str.contains('potential').any()\n",
"    return L_potential"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "b4092ab1",
"metadata": {},
"outputs": [],
"source": [
"def NER2df(doc):\n",
"    # Collect the named entities of a stanza Document into a DataFrame\n",
"    df_sent = pd.DataFrame()\n",
"    for ent in doc.ents:\n",
"        temp = pd.DataFrame.from_dict(ent.to_dict(), orient='index')\n",
"        df_sent = pd.concat([df_sent, temp], axis=1, ignore_index=False)\n",
"    df_sent = df_sent.T.reset_index(drop=True)\n",
"    return df_sent"
]
},
{
"cell_type": "markdown",
"id": "3e5c4e73-e89f-42c7-b431-7cfe3bdce7dd",
"metadata": {},
"source": [
"### Rule-based NLP"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b943ce83-8ff7-461b-ae6d-b4c2f2b4ae8c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Main run 9.1\n",
"# a) modify OAtt extraction\n",
"# b) modify the coefficient for identifying RA and OAtt from 0.95 to 1\n",
"# c) modify variable names\n",
"\n",
"# Main run 8.1\n",
"# a) following PFaL, search RA from the paragraph only\n",
"\n",
"def RA_rb_ext(df_noaa, k, col_text,\n",
"              no_rls_check=True,\n",
"              ptl_large_check=True,\n",
"              ratio_oa_ptl=1,\n",
"              label_UCRA=False):\n",
"\n",
"    L_times_larger_plt = False; if_rls = False; if_no_rls = False\n",
"    RA_rb = -1; CA_rb = -1\n",
"    list_RA = []; list_OA_carry = []\n",
"    sents_unrelease = []; list_sents_rls = []\n",
"\n",
"    text = df_noaa.loc[k, col_text]\n",
"    des = text_preprocess(text)\n",
"\n",
"    if des != 'Unknown':\n",
"        plt_max_RA = df_noaa.loc[k, 'max_ptl_release_gallons']\n",
"        try:\n",
"            doc = nlp(des)  # run annotation over the description\n",
"            for sent in doc.sentences:\n",
"                df_sent = sent2df(sent)\n",
"                # This must stay inside the loop over sentences!\n",
"                Ls_unrelease = Unrelease_check(df_sent, sent, separator_Rstrct=False)[0]\n",
"                sents_unrelease.append(Ls_unrelease)\n",
"\n",
"                # check if this sentence has a release word\n",
"                L_rls_word = release_check(df_sent)\n",
"                L_potential = kw_plt(df_sent) if ptl_large_check == True else False\n",
"\n",
"                # label sentences with release behavior, taking No_Rls into account\n",
"                L_rls_sent = (L_rls_word == True) and (Ls_unrelease == False) and (L_potential == False)\n",
"                list_sents_rls.append(L_rls_sent)\n",
"\n",
"                # Start RA extraction:\n",
"                # give priority to RA searching, do the No_Rls assignment later\n",
"                df_sent['lemma'].replace(',', '', regex=True, inplace=True)  # remove commas from words\n",
"\n",
"                # Check if there are any numeric tokens in this sentence\n",
"                L_num_intext = (df_sent['upos'] == 'NUM').any()\n",
"\n",
"                # Check if this sentence has a carry word\n",
"                L_cry_sent = Carry_sub_V2(df_sent, sent)\n",
"\n",
"                if L_num_intext == True:\n",
"                    list_OA = unit_forward_search(df_sent, unit_words)\n",
"                    list_OA.sort(reverse=True)  # descending\n",
"\n",
"                    if len(list_OA) > 0:\n",
"                        if (L_cry_sent == False) & (L_rls_sent == True):\n",
"                            list_RA.append(max(list_OA))\n",
"                        elif (L_cry_sent == True) & (L_rls_sent == True):\n",
"                            for oil_amount_i in list_OA:\n",
"                                if (oil_amount_i/plt_max_RA <= ratio_oa_ptl):\n",
"                                    list_RA.append(oil_amount_i)\n",
"                                else:\n",
"                                    list_OA_carry.append(oil_amount_i)\n",
"                        elif (L_cry_sent == True):\n",
"                            list_OA_carry.append(max(list_OA))\n",
"\n",
"            # Release identification\n",
"            if_rls = any(list_sents_rls)\n",
"\n",
"            # No-release identification\n",
"            if_no_rls = any(sents_unrelease)\n",
"\n",
"            # Carry amount identification\n",
"            if len(list_OA_carry) > 0:\n",
"                CA_rb = max(list_OA_carry)\n",
"\n",
"            # No-release check\n",
"            if (no_rls_check == True) & (if_no_rls == True):\n",
"                RA_rb = 0\n",
"\n",
"            # RA identification\n",
"            if len(list_RA) > 0:\n",
"                RA_rb = max(list_RA)\n",
"\n",
"            # label Times_larger_plt\n",
"            if (plt_max_RA != -1) and (RA_rb > (3*plt_max_RA)):\n",
"                L_times_larger_plt = True\n",
"\n",
"            # label UCRA (unclear release amount)\n",
"            if (label_UCRA == True) and RA_rb <= 0 and any(list_sents_rls):\n",
"                RA_rb = -2\n",
"\n",
"        except ValueError:\n",
"            RA_rb = 'ValueError'\n",
"        except KeyError:\n",
"            RA_rb = 'KeyError'\n",
"    return [RA_rb, CA_rb, L_times_larger_plt, if_rls, if_no_rls]"
]
},
{
"cell_type": "markdown",
"id": "c2ae063d-f8e1-49ff-acc6-ab77d22a9623",
"metadata": {},
"source": [
"## GPT RA Extraction"
]
},
{
"cell_type": "markdown",
"id": "15a29b2a-1390-45c1-9472-37e297426662",
"metadata": {},
"source": [
"### Call GPT"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2cd2b9cd-c584-4aa2-8dc3-4df4407674c4",
"metadata": {},
"outputs": [],
"source": [
"# GPT model selection\n",
"def gpt_model_select(gpt_model_shortname):\n",
"    if gpt_model_shortname == \"gpt4\":\n",
"        gpt_model = \"gpt-4-1106-preview\"\n",
"        col_gpt_output = 'text_gpt4'\n",
"    elif gpt_model_shortname == \"gpt4-0125\":\n",
"        gpt_model = \"gpt-4-0125-preview\"\n",
"        col_gpt_output = 'text_gpt4_0125'\n",
"    elif gpt_model_shortname == \"gpt35\":\n",
"        gpt_model = \"gpt-3.5-turbo\"\n",
"        col_gpt_output = 'text_gpt35'\n",
"    else:\n",
"        raise ValueError(f\"unknown model shortname: {gpt_model_shortname}\")\n",
"    return gpt_model, col_gpt_output\n",
"\n",
"gpt_model_shortname = 'gpt4-0125'\n",
"gpt_model, col_gpt_output = gpt_model_select(gpt_model_shortname)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c11f6dc9-c85f-4b5a-a2af-14b4dc6234e4",
"metadata": {},
"outputs": [],
"source": [
"def gpt_cost_estimate(gpt_model, n_prompt_tokens, n_completion_tokens, n_total_tokens):\n",
"    # rates in USD per 1K tokens\n",
"    if gpt_model == \"gpt-4-1106-preview\" or gpt_model == \"gpt-4-0125-preview\":\n",
"        costs = (n_prompt_tokens*0.01 + n_completion_tokens*0.03)/1000\n",
"    elif gpt_model == \"gpt-3.5-turbo\":\n",
"        costs = (n_total_tokens*0.00200)/1000\n",
"    return round(costs, 2)"
]
},
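{
"cell_type": "markdown",
"id": "8b9c0d1e-0001-4b10-8b10-bb1000000001",
"metadata": {},
"source": [
"A quick sanity check of the pricing arithmetic (the per-1K-token rates hard-coded above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9c0d1e-0002-4b10-8b10-bb1000000002",
"metadata": {},
"outputs": [],
"source": [
"# GPT-4 preview: (2000*0.01 + 1000*0.03)/1000 = 0.05 USD\n",
"print(gpt_cost_estimate('gpt-4-0125-preview', 2000, 1000, 3000))  # 0.05\n",
"# GPT-3.5: 3000*0.002/1000 = 0.006, rounded to 0.01 USD\n",
"print(gpt_cost_estimate('gpt-3.5-turbo', 2000, 1000, 3000))       # 0.01"
]
},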
{
"cell_type": "code",
"execution_count": 1,
"id": "fb918c65-3342-41e7-9242-a78807ea8b71",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from openai import OpenAI\n",
"\n",
"API_KEY = 'xxxxxx'  # replace 'xxxxxx' with the user's API key\n",
"\n",
"def query_gpt(prompt, model=\"gpt-3.5-turbo\"):\n",
"    \"\"\"\n",
"    Query a chat-completion model.\n",
"\n",
"    :param prompt: The prompt to send to the model.\n",
"    :param model: The model to use; the default is 'gpt-3.5-turbo'.\n",
"    :return: The chat-completion response from the model.\n",
"    \"\"\"\n",
"\n",
"    client = OpenAI(api_key=API_KEY)\n",
"\n",
"    chat_completion = client.chat.completions.create(\n",
"        messages=[\n",
"            {\n",
"                \"role\": \"user\",\n",
"                \"content\": prompt,\n",
"            }\n",
"        ],\n",
"        model=model,\n",
"    )\n",
"    return chat_completion"
]
},
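{
"cell_type": "markdown",
"id": "9c0d1e2f-0001-4c10-8c10-cc1000000001",
"metadata": {},
"source": [
"Typical usage, left commented out to avoid spending tokens: the reply text lives in `choices[0].message.content`, and the token counts in `usage` feed `gpt_cost_estimate` above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c0d1e2f-0002-4c10-8c10-cc1000000002",
"metadata": {},
"outputs": [],
"source": [
"# response = query_gpt('Extract the released amount: 500 gallons of diesel spilled.', model=gpt_model)\n",
"# answer_text = response.choices[0].message.content\n",
"# cost = gpt_cost_estimate(gpt_model, response.usage.prompt_tokens,\n",
"#                          response.usage.completion_tokens, response.usage.total_tokens)"
]
},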
{
"cell_type": "markdown",
"id": "e2a83bbf-25ee-4157-80dd-b276284e553b",
"metadata": {},
"source": [
"### Extract OA from GPT eval results"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "5284d4c5-ae7b-411f-949d-3488b36f1070",
"metadata": {},
"outputs": [],
"source": [
"# Version 3: extract the OA amount from GPT results 20231218\n",
"# removed unit adding\n",
"# (requires unit_words and unit_forward_search from the rule-based cells above)\n",
"\n",
"def OA_Ext_From_GPT_V3(df_sent, unit_kw=unit_words):\n",
"    pattern_unit = '|'.join(unit_kw)\n",
"    # True when a known unit token is present in the sentence\n",
"    L_unit_lost = df_sent['lemma'].str.fullmatch(pattern_unit, case=False).any()\n",
"    try:\n",
"        list_OA_GPT = unit_forward_search(df_sent, unit_kw)\n",
"    except Exception:\n",
"        list_OA_GPT = []\n",
"    if len(list_OA_GPT) >= 1:  # for most results with units, the existing function is usable\n",
"        list_OA_GPT.sort(reverse=True)\n",
"    return [list_OA_GPT, L_unit_lost]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "f04de39c-231b-4de8-824e-76e331de712a",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt, OArcv 20240324\n",
"# Version 3.1, adapted from Version 2.1\n",
"# removed unit-lost detection and recovered-amount extraction\n",
"# use the last RA that appears instead of the maximum RA\n",
"\n",
"def get_RA_from_gpt_0324(text, RA_field_name='Final RA', prompt_type=\"short\", separator='\\n'):\n",
"    # Initialization\n",
"    signal_words = {\n",
"        \"short\": ('ra_release', 'ra_total'),\n",
"        \"long\": (RA_field_name.lower(), 'OA_total'.lower())\n",
"    }\n",
"    ra_signal_word, oatt_signal_word = signal_words[prompt_type]\n",
"\n",
"    RA_gpt, OAtt_gpt = -1, -1\n",
"    # Preprocess the text and split it into segments\n",
"    pretreat_text = pretreat_for_nlp(text)\n",
"    segments = pretreat_text.split(separator)\n",
"\n",
"    # go through the segments backward (use the latest RA instead of the largest, because of the new prompts)\n",
"    for segment in reversed(segments):\n",
"        segment = segment.strip(' ')\n",
"        # recognize RA = -1 first\n",
"        if segment.startswith('- ' + RA_field_name + ': Unknown') & (RA_gpt == -1):\n",
"            break\n",
"        else:\n",
"            doc = nlp(segment)\n",
"            for sentence in doc.sentences:\n",
"                sentence_df = sent2df(sentence)\n",
"                contains_RA = ra_signal_word in sentence.text.lower()\n",
"                contains_OAtt = oatt_signal_word in sentence.text.lower()\n",
"\n",
"                if (sentence_df['upos'] == 'NUM').any():\n",
"                    OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                    if len(OA_temp) > 0:\n",
"                        if contains_RA and RA_gpt == -1:\n",
"                            RA_gpt = max(OA_temp)\n",
"                        elif contains_OAtt and OAtt_gpt == -1:\n",
"                            OAtt_gpt = max(OA_temp)\n",
"                if (RA_gpt == -1) & (RA_field_name + ': 0' in sentence.text):  # may be removed in the future\n",
"                    RA_gpt = 0\n",
"    return [RA_gpt, OAtt_gpt]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d5afa350-4dc0-478c-8028-4a3848685024",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt, OArcv 20240324\n",
"# Version 3.1, adapted from Version 2.1\n",
"# removed unit-lost detection and recovered-amount extraction\n",
"# use the last RA that appears instead of the maximum RA\n",
"\n",
"def get_RA_from_long_prompt_0324(text, RA_field_name='Actual RA', prompt_type=\"short\", separator='\\n'):\n",
"    # Initialization\n",
"    signal_words = {\n",
"        \"short\": ('ra_release', 'ra_total'),\n",
"        \"long\": (RA_field_name.lower(), 'OA_total'.lower())\n",
"    }\n",
"    ra_signal_word, oatt_signal_word = signal_words[prompt_type]\n",
"\n",
"    RA_gpt = -1\n",
"    No_spill = False\n",
"    # Preprocess the text and split it into segments\n",
"    pretreat_text = pretreat_for_nlp(text)\n",
"    segments = pretreat_text.split(separator)\n",
"\n",
"    # go through the segments in order\n",
"    for segment in segments:\n",
"        segment = segment.strip(' ')\n",
"        # recognize the explicit no-spill flag first\n",
"        if segment.startswith('- No spill: Yes') & (RA_gpt == -1):  # may be removed in the future\n",
"            RA_gpt = 0\n",
"            No_spill = True\n",
"        else:\n",
"            doc = nlp(segment)\n",
"            for sentence in doc.sentences:\n",
"                sentence_df = sent2df(sentence)\n",
"                contains_RA = ra_signal_word in sentence.text.lower()\n",
"                if (sentence_df['upos'] == 'NUM').any():\n",
"                    OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                    if len(OA_temp) > 0:\n",
"                        if contains_RA and RA_gpt == -1:\n",
"                            RA_gpt = max([RA_gpt] + OA_temp)\n",
"    return [RA_gpt, No_spill]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "e28026ce-a391-4b48-ab46-25a0cb50c4d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sanity check: the generalized signal string equals the original hard-coded literal\n",
"RA_field_name = 'Final RA'\n",
"'- ' + RA_field_name + ': Unknown' == '- Final RA: Unknown'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f3152833-ea20-434a-a2c9-b68fb38143de",
"metadata": {},
"outputs": [],
"source": [
"# Pretreatment of GPT results (text) 20240324\n",
"# Version 1, adapted from process_text_from_gpt Version 2\n",
"def pretreat_for_nlp(text):\n",
"    text = text.replace('\\n\\n', '\\n')\n",
"    text = text.replace('cubic yard', 'cubicyard')\n",
"    text = text.replace('• ', '- ')\n",
"    text = text.strip(' ')\n",
"\n",
"    # add a white space before \"bbl\"\n",
"    if 'bbl' in text.lower() and ' bbl' not in text.lower():\n",
"        text = text.replace('bbl', ' bbl')\n",
"        text = text.replace('BBL', ' bbl')\n",
"\n",
"    while '..' in text:\n",
"        text = text.replace('..', '.')\n",
"\n",
"    # Ensure spaces around hyphens are consistent\n",
"    if text.count('-') != text.count(' - '):\n",
"        text = text.replace('-', ' - ')\n",
"\n",
"    while '  ' in text:\n",
"        text = text.replace('  ', ' ')\n",
"\n",
"    return text"
]
},
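{
"cell_type": "markdown",
"id": "0d1e2f3a-0001-4d10-8d10-dd1000000001",
"metadata": {},
"source": [
"What the pretreatment does to a made-up GPT answer: blank lines collapse, bullet characters become hyphens, and `bbl` gains a leading space so stanza tokenizes it as a separate unit word."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d1e2f3a-0002-4d10-8d10-dd1000000002",
"metadata": {},
"outputs": [],
"source": [
"gpt_answer = '• Final RA: 2,000bbl\\n\\n- OA_total: Unknown'\n",
"print(pretreat_for_nlp(gpt_answer))\n",
"# -> ' - Final RA: 2,000 bbl\\n - OA_total: Unknown'"
]
},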
{
"cell_type": "code",
"execution_count": 2,
"id": "4056cc84-e0b4-446a-9d06-68f94f28a234",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt 20240329\n",
"# Version 2.2, adapted from Version 2.1\n",
"# removed unit-lost detection\n",
"# removed recovered-amount extraction\n",
"\n",
"def get_oil_amount_from_short_prompt(text, ra_signal_word='RA_Release', oatt_signal_word='RA_total'):\n",
"    # Initialization\n",
"    max_RA, max_OAtt = -1, -1\n",
"\n",
"    # Text preprocessing\n",
"    processed_text = process_text_from_gpt(text)\n",
"    segments = processed_text.split('FENDUAN')\n",
"\n",
"    for segment in segments:\n",
"        doc = nlp(segment)\n",
"        for sentence in doc.sentences:\n",
"            sentence_df = sent2df(sentence)\n",
"            has_number = (sentence_df['upos'] == 'NUM').any()\n",
"            contains_RA = ra_signal_word in sentence.text\n",
"            contains_OAtt = oatt_signal_word in sentence.text\n",
"\n",
"            if has_number:\n",
"                OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                if contains_RA:\n",
"                    max_RA = max([max_RA] + OA_temp)\n",
"                elif contains_OAtt:\n",
"                    max_OAtt = max([max_OAtt] + OA_temp)\n",
"\n",
"    return [max_RA, max_OAtt]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31820246-58de-46b3-aa63-ed131591ef27",
"metadata": {},
"outputs": [],
"source": [
"# Extract RA, OAtt, OArcv 20231218\n",
"# Version 2.1, adapted from Version 2, revised by ChatGPT\n",
"# To distinguish it from the long prompt, 'short prompt' was added to the related function names\n",
"# Removed the unit-adding function in OA_Ext_From_GPT_V3 as it is no longer used\n",
"\n",
"def extract_RA_from_gpt(text, prompt_type=\"short\", extract_recovery=False):\n",
"    # Initialization\n",
"    signal_words = {\n",
"        \"short\": ('ra_release', 'ra_total', 'ra_recover'),\n",
"        \"long\": ('Final RA'.lower(), 'OA_total'.lower(), 'OA_recovered'.lower())\n",
"    }\n",
"    ra_signal_word, oatt_signal_word, oarcv_signal_word = signal_words[prompt_type]\n",
"\n",
"    max_RA, max_OAtt, max_OArv = -1, -1, -1\n",
"\n",
"    # Text preprocessing\n",
"    processed_text = process_text_from_gpt(text)\n",
"    segments = processed_text.split('FENDUAN')\n",
"\n",
"    for segment in segments:\n",
"        doc = nlp(segment)\n",
"        for sentence in doc.sentences:\n",
"            sentence_df = sent2df(sentence)\n",
"            has_number = (sentence_df['upos'] == 'NUM').any()\n",
"            contains_RA = ra_signal_word in sentence.text.lower()\n",
"            contains_OAtt = oatt_signal_word in sentence.text.lower()\n",
"            contains_OArv = oarcv_signal_word in sentence.text.lower() if extract_recovery else False\n",
"\n",
"            if has_number:\n",
"                OA_temp, OA_unit_lost = OA_Ext_From_GPT_V3(sentence_df, unit_kw=unit_words)\n",
"                if contains_RA:\n",
"                    RA_unit_lost = OA_unit_lost\n",
"                    max_RA = max([max_RA] + OA_temp)\n",
"                    if (max_RA == -1) & ('Final RA: 0' in sentence.text):\n",
"                        max_RA = 0\n",
"                elif contains_OAtt:\n",
"                    OAtt_unit_lost = OA_unit_lost\n",
"                    max_OAtt = max([max_OAtt] + OA_temp)\n",
"                elif contains_OArv:\n",
"                    OArv_unit_lost = OA_unit_lost\n",
"                    max_OArv = max([max_OArv] + OA_temp)\n",
"\n",
"    return [max_RA, max_OAtt, max_OArv]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d992f239-b2d1-44af-9936-00be2425b8c5",
"metadata": {},
"outputs": [],
"source": [
"# Pretreatment of GPT results (text) 20230309\n",
"# Version 2\n",
"def process_text_from_gpt(text):\n",
"\n",
"    # text pretreatment: 'FENDUAN' is a sentinel token marking segment boundaries\n",
"    text = text.replace('\\n', 'FENDUAN')\n",
"    text = text.replace('cubic yard', 'cubicyard')\n",
"    text = text.strip(' ')\n",
"\n",
"    # add a white space before \"bbl\"\n",
"    if 'bbl' in text.lower() and ' bbl' not in text.lower():\n",
"        text = text.replace('bbl', ' bbl')\n",
"        text = text.replace('BBL', ' bbl')\n",
"\n",
"    while '..' in text:\n",
"        text = text.replace('..', '.')\n",
"    while '  ' in text:\n",
"        text = text.replace('  ', ' ')\n",
"    while 'FENDUANFENDUAN' in text:\n",
"        text = text.replace('FENDUANFENDUAN', 'FENDUAN')\n",
"\n",
"    # Ensure spaces around hyphens are consistent\n",
"    if text.count('-') != text.count(' - '):\n",
"        text = text.replace('-', ' - ')\n",
"\n",
"    return text"
]
},
{
"cell_type": "markdown",
"id": "3d813b08-d691-46ca-85fa-ec6a0729021e",
"metadata": {},
"source": [
"## Merge rule-based RA and GPT RA"
]
},
{
"cell_type": "markdown",
"id": "efe08337-7f8b-4075-81c6-886e45c9f0df",
"metadata": {},
"source": [
"# RA identification"
]
},
{
"cell_type": "markdown",
"id": "2ce4f722-41a9-4485-bfcc-2b4463595d0a",
"metadata": {},
"source": [
"## Group incidents by their ra_icd and ra_posts"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "017f8c2c-6f55-4011-9845-de502f353ae8",
"metadata": {},
"outputs": [],
"source": [
"def select_verified_excel_sheets(excel_file_path, excel_file_name, sheet_names):\n",
"\n",
"    verified_data_excel_file = pd.ExcelFile(excel_file_path + excel_file_name)\n",
"\n",
"    # Determine the sheets to process: use the provided list, or fall back to the default rule\n",
"    if sheet_names is None:\n",
"        # default rule for determining the sheet names\n",
"        sample_sheet_names = verified_data_excel_file.sheet_names\n",
"        sample_sheet_names = ['S5', 'S6'] + [sheet for sheet in sample_sheet_names if sheet.startswith('F')]\n",
"    else:\n",
"        # use the user-provided list of sheets\n",
"        sample_sheet_names = sheet_names\n",
"    print(\"Sheets being processed:\", sample_sheet_names)\n",
"\n",
"    verified_data_excel_file.close()\n",
"    return sample_sheet_names"
]
},
{
"cell_type": "markdown",
"id": "d794a3f9-9bc7-41a1-a9ac-d6a1293f3445",
"metadata": {
"tags": []
},
"source": [
"## Determine final RA by group"
]
},
{
"cell_type": "markdown",
"id": "0f80d2b3-4cb4-4f9d-b68a-1abc6775e57f",
"metadata": {
"tags": []
},
"source": [
"Define initial conditions"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "29efd429-ee52-4c20-862c-72ff20653a35",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"col_RA_icd_Comb_Srch_GPT = 'RA_merge_icd_rb_gpt'\n",
"col_RA_post_Comb_Srch_GPT = 'RA_merge_posts_rb_gpt'\n",
"\n",
"col_RA_Comb_des_post = 'ra_final'\n",
"col_source_RA_comb = 'source_final_ra'\n",
"col_category_icds = 'category_icds'\n",
"col_source_RA_des = 'source_RA_merge_icd'\n",
"col_source_RA_post = 'source_RA_merge_posts'\n",
"col_vrf_RAs_des_post = 'RA_Comb_des_post_Manual'\n",
"n_icd_unvrfd = 0"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "306px"
},
"toc_section_display": true,
"toc_window_display": true
},
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}