# doi_summarizer/flask_app.py
"""================================================================================================= | |
Flask application that will return a html page with input for a list of DOIs, email address, and table name. | |
Given the list of DOIs, it will use Entrez to search for the matching PMID, return the article. It will then parse the | |
XML to find the abstract and the title. Using the abstract, it will use two methods: nltk and networkx to return two | |
summaries of the paper to give the user a better understanding of their paper based on the two results. It will then use | |
scikit-learn and NLTK to return the top 5 keywords associated with the paper. | |
This will then write their results to a database and return the output in a new html file formatted as a table. | |
Then it will recommend papers based on both sets of keywords and MeSH terms and return them below the summarizations. | |
Rose Wilfong & Wenxuan Dong 05/05/2023 | |
=================================================================================================""" | |
# import libraries
from flask import Flask, render_template, request, redirect, url_for, session
from datetime import datetime
from collections import defaultdict
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from Bio import Entrez
import networkx as nx
import numpy as np
import xml.etree.ElementTree as ET
import sqlite3

app = Flask(__name__)
app.secret_key = 'pickles_rules'  # set key for Flask session
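

# The routes and functions below read and write a SQLite table named "papers",
# but no schema is defined in this file. The helper below is a minimal sketch
# of an assumed schema, reconstructed from the INSERT and SELECT statements
# used later; the column types are a guess, so adjust them to match the
# actual database.
def init_db():
    """Create the assumed papers table if it does not already exist."""
    conn = sqlite3.connect('database.db')
    conn.execute('''CREATE TABLE IF NOT EXISTS papers (
                        email TEXT, table_name TEXT, doi TEXT, pmid TEXT,
                        title TEXT, abstract TEXT, summary TEXT, mesh_terms TEXT,
                        scikit_keywords TEXT, nltk_keywords TEXT, time TEXT)''')
    conn.commit()
    conn.close()
# init_db()  # uncomment to create the table once at startup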


@app.route('/')
def index():
    return render_template('starting_page.html')  # the HTML file to return when Flask is first deployed


@app.route('/summarizer', methods=['POST', 'GET'])  # summarization portion of Flask
def summarizer():
    if request.method == 'POST':
        # when the form is submitted, analyze the input
        # retrieve entries from the HTML form
        email = request.form['email']
        table_name = request.form['table_name']
        api_key = request.form.get('api_key')
        if not api_key:
            api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'  # if the user leaves the API key empty, use this
        # clean the input DOIs to remove any whitespace, new lines, etc.
        doi_list = [doi.strip() for doi in request.form['links'].split(',')]
        # parse DOIs for PMIDs, MeSH terms, abstracts, summaries, and keywords
        cleaned_abstracts = abstract_analysis(doi_list, email, table_name, api_key)
        # get recommendations based on keywords (scikit-learn, NLTK, and MeSH terms)
        recommended = recommend_similar_articles(cleaned_abstracts, email, api_key)
        # save the session
        session['cleaned_abstracts'] = cleaned_abstracts
        session['recommended'] = recommended
        # render the output when submitted;
        # results and recs are used to build the table in output.html
        return render_template('output.html', results=cleaned_abstracts, recs=recommended)
    else:
        # if the form hasn't been submitted yet, just return the form HTML file
        return render_template('doi_summarizer.html')


@app.route('/return_data', methods=['GET', 'POST'])
# route for the "return your data" portion (not the one where you're looking up DOIs and summarizing them)
def return_data():
    if request.method == 'POST':
        # retrieve the form data
        email = request.form['email']
        table_name = request.form['table_name']
        date = request.form['date']
        # connect to the database
        conn = sqlite3.connect('database.db')
        # create a cursor
        cursor = conn.cursor()
        # build the SQL query to return the DOI, title, summary, keywords, and MeSH terms
        # based on the email and table name
        query = ("SELECT doi, title, summary, scikit_keywords, nltk_keywords, mesh_terms "
                 "FROM papers WHERE email = ? AND table_name = ?")
        params = [email, table_name]
        if date:
            # if a date was supplied, also restrict to rows saved on or after that date
            # (the timestamp column is stored as "time" by abstract_analysis)
            query += " AND time >= ?"
            params.append(date)
        cursor.execute(query, params)
        # fetch the results
        results = cursor.fetchall()
        # close the database connection
        cursor.close()
        conn.close()
        # return the data in a table format, where results are those collected
        return render_template('return_data_table.html', results=results)
    else:
        # if the form hasn't been submitted, just render the input form
        return render_template('return_data.html')


@app.route('/delete_entries_database', methods=['POST'])  # used to delete entries from the return_data_table.html file
def delete_entries_database():
    # users can use this to delete any entries in the database
    # retrieve the form data:
    # find the entries where the delete_row variable is selected using the checkbox in the HTML
    checked_entries = request.form.getlist('delete_row')
    # connect to the database
    conn = sqlite3.connect('database.db')
    cursor = conn.cursor()
    # delete the checked entries from the database based on the DOI
    for doi in checked_entries:
        cursor.execute("DELETE FROM papers WHERE doi = ?", (doi,))
    conn.commit()  # commit changes
    # close the database connection
    cursor.close()
    conn.close()
    # redirect back to the return_data route
    # tried routing back to the table, but saving a session returned the same table, so the user will have to look it up again
    return redirect(url_for('return_data'))


@app.route('/main')
# clicking "DOI Summarizer and Recommender" in the top left returns users to the starting page so they can navigate back
def main():
    return render_template('starting_page.html')  # return the original page


###### Functions ######
# below are the functions used in the Flask application


def extract_doi(url):
    """
    Removes the doi.org prefix if it exists in the list of DOIs from the user.
    :param url: each string in the doi list
    :return: the cleaned string
    """
    prefix = "https://doi.org/"
    if url.startswith(prefix):
        return url[len(prefix):]
    else:
        return url
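# Example usage (the DOI below is hypothetical, for illustration only):
#   extract_doi("https://doi.org/10.1000/182") -> "10.1000/182"
#   extract_doi("10.1000/182")                 -> "10.1000/182" (returned unchanged)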


def abstract_analysis(doi_list, email, table_name, api_key):
    """
    This is the main function of the script. It finds the PMIDs associated with the DOIs, extracts the titles and
    abstracts, generates the two sets of summaries and the keywords, and returns everything in a list of dictionaries
    called cleaned_abstracts, which is written to the table seen in the output.html file.
    :param doi_list: list of comma-separated DOI values.
    :param email: the user's email address, str.
    :param table_name: the name the user wants for this search; can be used more than once, str.
    :param api_key: the API key from the if/else statement, str.
    :return: list of dictionaries called cleaned_abstracts.
    """
    Entrez.email = email  # email for Entrez
    Entrez.api_key = api_key  # api key for Entrez
    # Given a list of DOIs, find the PMIDs and article titles
    # initialize empty lists and dictionaries
    id_list = []
    titles = {}
    abstracts = {}
    mesh_terms = {}
    doi_by_pmid = {}  # maps each PMID back to the DOI it was found with
    # clean dois before iterating
    dois = [extract_doi(doi) for doi in doi_list if extract_doi(doi)]
    for doi in dois:
        # for each entry:
        handle = Entrez.esearch(db='pubmed', term=doi)  # search pubmed with the doi
        record = Entrez.read(handle)
        handle.close()
        if len(record['IdList']) > 0:  # if the IdList has a record, continue
            pmid = record['IdList'][0]
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')  # return the xml
            xml = handle.read()
            handle.close()  # close the efetch handle
            root = ET.fromstring(xml)  # parse the xml
            article = root.find('.//PubmedArticle/MedlineCitation/Article')  # find the article element
            title = article.find('ArticleTitle').text  # extract the title
            abstract_elem = root.find('.//AbstractText')  # extract the abstract
            if abstract_elem is None or not abstract_elem.text:
                continue  # skip records without an abstract; there is nothing to summarize
            abstract = abstract_elem.text.strip()  # remove any new lines, white spaces, etc.
            id_list.append(pmid)  # add the PMID to the list
            titles[pmid] = title  # add to dictionary
            abstracts[pmid] = abstract  # add to dictionary
            doi_by_pmid[pmid] = doi  # remember which DOI produced this PMID
            mesh_heading_list = root.findall('.//PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading')
            # extract MeSH terms and add them to the dictionary
            mesh_terms[pmid] = [mesh.find('DescriptorName').text for mesh in mesh_heading_list]
    # parse the abstracts
    # initialize an empty list to write the dictionaries to
    cleaned_abstracts = []
    for pmid, abstract in abstracts.items():
        # abstracts contains the PMID and abstract
        # make a set of summaries based on the abstract using two methods
        # first up, NetworkX
        sentences = sent_tokenize(abstract)  # get every sentence in the abstract
        count_vectorizer = CountVectorizer()  # count vectorizer, i.e. tally the number of words in each sentence
        X = count_vectorizer.fit_transform(sentences)  # apply to the sentences
        # create a graph of sentence similarity
        graph = nx.Graph()
        # construct the graph
        for i, sentence_i in enumerate(sentences):
            for j, sentence_j in enumerate(sentences):
                if i == j:
                    continue
                similarity = cosine_similarity(X[i], X[j])[0][0]  # calculate the cosine similarity between the nodes
                graph.add_edge(i, j, weight=similarity)  # weight the edges with the similarity calculation
        # compute PageRank scores for each sentence
        scores = nx.pagerank(graph)
        # get the indices of the top two sentences
        top_indices = np.array(sorted(scores, key=scores.get, reverse=True)[:2])
        # construct the summary by joining the top two sentences in document order
        nx_summary = ' '.join([sentences[i] for i in sorted(top_indices)])
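        # Note: the nested loop above computes each pairwise similarity
        # separately; a sketch of an equivalent vectorized alternative (an
        # assumption, not what this app uses) would build the whole graph at once:
        #   sim_matrix = cosine_similarity(X)        # n_sentences x n_sentences
        #   graph = nx.from_numpy_array(sim_matrix)  # weighted edges in one step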
        # next summary
        # second set of summaries using NLTK; this uses the same sentences variable from above
        stop_words = set(stopwords.words('english') + list(punctuation))
        # calculate the frequency of each word in the abstract
        # generate the word frequencies with a count
        word_frequencies = {}
        for word in abstract.split():
            word = word.lower()  # count case-insensitively so "Protein" and "protein" share a tally
            if word not in stop_words:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1
        # calculate the score of each sentence based on the frequency of its words using word_frequencies
        sentence_scores = {}
        for sentence in sentences:
            for word in sentence.split():
                word = word.lower()
                if word in word_frequencies:
                    if len(sentence.split()) < 30:
                        if sentence not in sentence_scores:
                            sentence_scores[sentence] = word_frequencies[word]
                        else:
                            sentence_scores[sentence] += word_frequencies[word]
        # sort the sentences by their score and keep the top 2
        summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:2]
        # join the top 2 sentences to create the summary
        nltk_summary = ' '.join(summary_sentences)
        # combine the summaries
        total_summary = f"Summary 1: {nx_summary} Summary 2: {nltk_summary}"
        # maybe insert a new line in the output table
        # add the dictionary to the final list
        cleaned_abstracts.append(
            # look the DOI up by PMID so the mapping survives any input DOIs that failed to resolve
            {'doi': doi_by_pmid[pmid], 'pmid': pmid, 'title': titles[pmid], 'abstract': abstract,
             'summary': total_summary, 'mesh_terms': mesh_terms[pmid]})
    # keyword generating
    # now from the abstracts, return the keywords using scikit-learn
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')  # create a TF-IDF vectorizer
    corpus = [a['abstract'] for a in cleaned_abstracts]  # extract each abstract in the list of dictionaries
    # iterate through the corpus
    for i, abstract in enumerate(corpus):
        tfidf_x = tfidf_vectorizer.fit_transform([abstract])  # apply the vectorizer to the abstract
        feature_names = tfidf_vectorizer.get_feature_names_out()  # return the feature names
        idf_scores = tfidf_vectorizer.idf_  # the IDF scores
        keyword_scores = defaultdict(float)  # dictionary for the scores as floats
        for j in range(tfidf_x.shape[1]):
            keyword_scores[feature_names[j]] += tfidf_x[0, j] * idf_scores[j]  # save the scores
        top_keywords = sorted(keyword_scores, key=keyword_scores.get, reverse=True)[:5]  # top 5
        # to return the scores alongside the keywords:
        # top_keywords = sorted(keyword_scores.items(), key=itemgetter(1), reverse=True)[:5]
        cleaned_abstracts[i]['scikit_keywords'] = top_keywords  # save the scikit-learn keywords
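        # Note: when the vectorizer is fitted on a single abstract, every term's
        # IDF is identical, so the ranking above reduces to term frequency. A
        # sketch of an alternative (an assumption, not the method used here)
        # would fit once on the whole corpus and rank each row's TF-IDF weights:
        #   tfidf_all = tfidf_vectorizer.fit_transform(corpus)
        #   row = tfidf_all[i].toarray().ravel()
        #   top_5 = [feature_names[k] for k in row.argsort()[::-1][:5]]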
        # now create keywords that are pairs using NLTK, since the scikit-learn keywords are generally single words
        # initialize the BigramAssocMeasures, which scores the association between pairs of words
        bigram_measures = BigramAssocMeasures()
        words = word_tokenize(abstract.lower())  # tokenize the lowercased abstract
        stop_words = stopwords.words("english")
        # add some custom stop words that are common in research papers
        custom_stop_words = ['instead', 'study', 'results', 'analysis', 'method', 'data', 'experiment', 'figure',
                             'table', 'author', 'et al.', 'conclusion', 'discussion', 'findings', 'significant',
                             'difference', 'effect', 'increase', 'decrease', 'reduction', 'however', 'moreover',
                             'thus', 'therefore', 'also', 'similarly', 'hence', 'namely', 'cm', 'mm', 'mL', 'kg',
                             'accounts', 'approximation', 'across', 'research', 'approach', 'approaches']
        # extend the stop word list with the custom stop words from above
        stop_words.extend(custom_stop_words)
        # filter the words in the abstract and keep only those that are not stop words
        filtered_tokens = [token for token in words if token not in stop_words]
        # apply the BigramCollocationFinder to find co-occurring word pairs in the filtered words/tokens
        finder = BigramCollocationFinder.from_words(filtered_tokens)
        # score the pairs and return the best 5 by PMI
        keywords = finder.nbest(bigram_measures.pmi, 5)
        # join each pair into a single string
        keywords = [' '.join(keyword) for keyword in keywords]
        # save the NLTK keywords in the cleaned_abstracts dictionary as a key, value pair
        cleaned_abstracts[i]['nltk_keywords'] = keywords
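        # Note: raw PMI favors pairs that occur only once, so the bigrams can
        # look obscure; NLTK's finder supports a minimum-frequency cutoff, e.g.:
        #   finder.apply_freq_filter(2)  # keep bigrams seen at least twice
        # (a tuning suggestion, not part of the original pipeline)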
    # add the date and time to each dictionary, then write the entries to the database
    time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # insert into the database
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
    # add the list of dictionaries to the database as individual rows
    for entry in cleaned_abstracts:
        entry['time'] = time
        c.execute('''INSERT INTO papers (email, table_name, doi, pmid, title, abstract, summary, mesh_terms,
                                         scikit_keywords, nltk_keywords, time)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                  (email, table_name, entry['doi'], entry['pmid'], entry['title'], entry['abstract'],
                   entry['summary'], ', '.join(entry['mesh_terms']), ', '.join(entry['scikit_keywords']),
                   ', '.join(entry['nltk_keywords']), entry['time']))
    conn.commit()
    conn.close()
    return cleaned_abstracts
# end abstract_analysis


def recommend_similar_articles(articles, email, api_key):
    """
    Content-based recommender for each research paper.
    :param articles: input list of dictionaries; should have the appropriate key, value pairs from abstract_analysis.
    :param email: the user's email to find an NCBI account, str.
    :param api_key: the user's API key for NCBI from the if/else statement, str.
    :return: the list of dictionaries with three new key, value pairs for recommendations based on keywords.
    """
    recommended_articles = []
    for article in articles:
        # iterate through the articles in the list
        try:
            # create a query from the list of keywords and MeSH terms
            # get the scikit and NLTK keywords in each dictionary as well as the MeSH terms
            scikit_keywords = article.get("scikit_keywords", [])
            nltk_keywords = article.get('nltk_keywords', [])
            mesh_terms = article.get("mesh_terms", [])
            # combine the keywords together to generate a query
            sk_keyword_query = " ".join(scikit_keywords)
            nltk_keyword_query = " ".join(nltk_keywords)
            mesh_query = " ".join(mesh_terms)
            # collect the data for keywords
            Entrez.email = email  # email for Entrez
            Entrez.api_key = api_key  # api key for Entrez
            # scikit keywords:
            # search Entrez with the scikit keywords as the criteria, return 5 entries
            keyword_handle = Entrez.esearch(db="pubmed", term=sk_keyword_query, retmax=5)
            keyword_record = Entrez.read(keyword_handle)  # create a record
            keyword_ids = keyword_record["IdList"]  # get the ID list
            keyword_handle = Entrez.efetch(db="pubmed", id=keyword_ids, retmode="xml")  # fetch the XML for the PMIDs
            keyword_records = Entrez.read(keyword_handle)
            # return the titles from the XML
            keyword_titles = [record["MedlineCitation"]["Article"]["ArticleTitle"] for record in
                              keyword_records["PubmedArticle"]]
            # NLTK keywords:
            # repeat the same process as above but with the NLTK keywords as input (keyword pairs)
            nltk_keyword_handle = Entrez.esearch(db="pubmed", term=nltk_keyword_query, retmax=5)
            nltk_keyword_record = Entrez.read(nltk_keyword_handle)
            nltk_keyword_ids = nltk_keyword_record["IdList"]
            nltk_keyword_handle = Entrez.efetch(db="pubmed", id=nltk_keyword_ids, retmode="xml")
            nltk_keyword_records = Entrez.read(nltk_keyword_handle)
            nltk_keyword_titles = [record["MedlineCitation"]["Article"]["ArticleTitle"] for record in
                                   nltk_keyword_records["PubmedArticle"]]
            # MeSH terms:
            # repeat the same process but with the MeSH terms as input
            mesh_handle = Entrez.esearch(db="pubmed", term=mesh_query, retmax=5)
            mesh_record = Entrez.read(mesh_handle)
            mesh_ids = mesh_record["IdList"]
            mesh_handle = Entrez.efetch(db="pubmed", id=mesh_ids, retmode="xml")
            mesh_records = Entrez.read(mesh_handle)
            mesh_titles = [record["MedlineCitation"]["Article"]["ArticleTitle"] for record in
                           mesh_records["PubmedArticle"]]
            # add the recommended articles to the dictionary
            article["scikit_keywords_recs"] = keyword_titles
            article['nltk_keywords_recs'] = nltk_keyword_titles
            article["mesh_recs"] = mesh_titles
            recommended_articles.append(article)
        except Exception:
            # if any lookup fails, fall back to empty recommendation lists instead of raising
            article["scikit_keywords_recs"] = []
            article["nltk_keywords_recs"] = []
            article["mesh_recs"] = []
            recommended_articles.append(article)  # append the results to the final list
    return recommended_articles
# end recommend_similar_articles
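

# A minimal smoke-test sketch for exercising the pipeline outside Flask. It
# assumes the NLTK data ("punkt", "stopwords") has been downloaded and that
# database.db contains the papers table; the DOI, email, and key below are
# hypothetical placeholders, not values from this app:
#   results = abstract_analysis(["10.1000/182"], "user@example.com", "demo",
#                               api_key="your-ncbi-api-key")
#   recs = recommend_similar_articles(results, "user@example.com",
#                                     api_key="your-ncbi-api-key")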


if __name__ == '__main__':
    # main entry point: deploy the application above
    app.run(debug=True)