"""=================================================================================================
Flask application that will return a html page with input for a list of DOIs, email address, and table name.
Given the list of DOIs, it will use Entrez to search for the matching PMID, return the article. It will then parse the
XML to find the abstract and the title. Using the abstract, it will use two methods: nltk and networkx to return two
summaries of the paper to give the user a better understanding of their paper based on the two results. It will then use
scikit-learn and NLTK to return the top 5 keywords associated with the paper.
This will then write their results to a database and return the output in a new html file formatted as a table.
Then it will recommend papers based on both sets of keywords and MeSH terms and return them below the summarizations.
Rose Wilfong & Wenxuan Dong 05/05/2023
================================================================================================="""
# import libraries
from flask import Flask, render_template, request, redirect, url_for, session
from datetime import datetime
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import networkx as nx
import numpy as np
import xml.etree.ElementTree as ET
import sqlite3
from Bio import Entrez
from string import punctuation
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import urllib.parse
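# NLTK's tokenizers and stopword list rely on data packages that are not bundled with the library.
# Illustrative setup, an assumption about the deployment environment rather than part of the original
# app: download the packages once at start-up (a no-op if they are already installed).
import nltk
nltk.download('punkt', quiet=True)      # used by sent_tokenize / word_tokenize
nltk.download('stopwords', quiet=True)  # used by stopwords.words('english')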
app = Flask(__name__)
app.secret_key = 'pickles_rules' # set key for Flask session
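# The routes below read from and write to a `papers` table in database.db, but the schema is not created
# anywhere in this file. The helper below is an illustrative sketch only and is never called here: the
# column list is inferred from the INSERT/SELECT statements further down, the TEXT types are assumptions,
# and the optional date filter in return_data() would additionally need a matching column.
def _create_papers_table_sketch():
    conn = sqlite3.connect('database.db')
    conn.execute('''CREATE TABLE IF NOT EXISTS papers (
                        email TEXT, table_name TEXT, doi TEXT, pmid TEXT, title TEXT, abstract TEXT,
                        summary TEXT, mesh_terms TEXT, scikit_keywords TEXT, nltk_keywords TEXT, time TEXT)''')
    conn.commit()
    conn.close()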
@app.route('/')
def index():
    return render_template('starting_page.html')  # the HTML file to return when Flask is first deployed
@app.route('/summarizer', methods=['POST', 'GET']) # summarization portion of Flask
def summarizer():
    if request.method == 'POST':
        # when the form is submitted, analyze the input
        # retrieve entries from the HTML form
        email = request.form['email']
        table_name = request.form['table_name']
        api_key = request.form.get('api_key')
        if not api_key:
            api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'  # if the user leaves the API key empty, use this
        # clean the input DOIs to remove any whitespace, new lines, etc.
        doi_list = [doi.strip() for doi in request.form['links'].split(',')]
        # parse DOIs for PMIDs, MeSH terms, abstracts, summaries, and keywords
        cleaned_abstracts = abstract_analysis(doi_list, email, table_name, api_key)
        # get recommendations based on keywords (scikit-learn, NLTK, and MeSH terms)
        recommended = recommend_similar_articles(cleaned_abstracts, email, api_key)
        # save session
        session['cleaned_abstracts'] = cleaned_abstracts
        session['recommended'] = recommended
        # render output when submitted
        # results and recs will be used to build the table in output.html
        return render_template('output.html', results=cleaned_abstracts, recs=recommended)
    else:
        # else, if the form hasn't been submitted, just return the form HTML file
        return render_template('doi_summarizer.html')
@app.route('/return_data', methods=['GET', 'POST'])
# route for the "return your data" page (separate from the DOI lookup and summarization page)
def return_data():
    if request.method == 'POST':
        # retrieve the form data
        email = request.form['email']
        table_name = request.form['table_name']
        date = request.form['date']
        # connect to the database
        conn = sqlite3.connect('database.db')
        # create a cursor
        cursor = conn.cursor()
        # build the SQL query to return the DOI, title, summary, keywords, and MeSH terms for the email and table name
        query = "SELECT doi, title, summary, scikit_keywords, nltk_keywords, mesh_terms FROM papers WHERE email = ? AND table_name = ?"
        if date:
            query += " AND date = ?"
        # execute the query with or without the date filter
        if date:
            cursor.execute(query, (email, table_name, date))
            # if a date is supplied, only rows matching that date are returned
        else:
            # else, just use the email and table name from user input
            cursor.execute(query, (email, table_name))
        # fetch the results
        results = cursor.fetchall()
        # close the database connection
        cursor.close()
        conn.close()
        # return the data in a table format, where results are the fetched rows
        return render_template('return_data_table.html', results=results)
    else:
        # else, if not submitted, just render the input form to return data
        return render_template('return_data.html')
@app.route('/delete_entries_database', methods=['POST']) # used to delete entries from the return_data_table.html file
def delete_entries_database():
    # users can use this to delete any entries in the database
    # retrieve the form data
    # find the entries where the delete_row variable is selected using the checkbox in HTML
    checked_entries = request.form.getlist('delete_row')
    # connect to the database
    conn = sqlite3.connect('database.db')
    cursor = conn.cursor()
    # delete the checked entries from the database based on the DOI
    for doi in checked_entries:
        cursor.execute("DELETE FROM papers WHERE doi = ?", (doi,))
    conn.commit()  # commit changes
    # close the database connection
    cursor.close()
    conn.close()
    # redirect back to the return_data route
    # tried routing back to the table, but saving a session returned the same table, so the user will have to look it up again
    return redirect(url_for('return_data'))
@app.route('/main')
# when the "DOI Summarizer and Recommender" in the top left will return to the starting page so users can nav back
def main():
return render_template('starting_page.html') # return the original page
###### Functions ######
# below are the functions used in the flask application
def extract_doi(url):
"""
This removes the doi.org prefix if it exists in the list of DOIs from the user
:param url: each string in the doi list
:return: the cleaned string
"""
prefix = "https://doi.org/"
if url.startswith(prefix):
return url[len(prefix):]
else:
return url
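# Illustrative sanity check for extract_doi(); the DOI below is a placeholder, not a real paper.
assert extract_doi("https://doi.org/10.1234/example") == "10.1234/example"
assert extract_doi("10.1234/example") == "10.1234/example"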
def abstract_analysis(doi_list, email, table_name, api_key):
"""
This is the main function of the script. It will find the PMIDs associated with the DOIs, extract the title and
abstracts. Generate the two sets of summaries and the keywords. It will return it in a list of dictionaries called
cleaned_abstracts and this will be written to the table seen in the output.html file.
:param doi_list: list of comma-separated values of dois.
:param email: the user's email address, str.
:param table_name: the name the user wants for this search. can be used more than once, str.
:param api_key: the API key from the if/else statement, str.
:return: list of dictionaries called cleaned_abstracts.
"""
Entrez.email = email # email for Entrez
Entrez.api_key = api_key # api key for Entrez
# Given a list of DOIs, find the PMIDs and article titles
# initialize empty lists and dictionaries
id_list = []
titles = {}
abstracts = {}
mesh_terms = {}
# clean dois before iterating
dois = [extract_doi(doi) for doi in doi_list if extract_doi(doi)]
    for doi in dois:
        # for each entry:
        handle = Entrez.esearch(db='pubmed', term=doi)  # search pubmed with the doi
        record = Entrez.read(handle)
        handle.close()
        if len(record['IdList']) > 0:  # if the IdList has a record, continue
            pmid = record['IdList'][0]
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')  # return the xml
            xml = handle.read()
            handle.close()  # close the efetch handle
            root = ET.fromstring(xml)  # parse the xml
            article = root.find('.//PubmedArticle/MedlineCitation/Article')  # find the article element
            title = article.find('ArticleTitle').text  # extract the title
            id_list.append(pmid)  # add the PMID to the list
            titles[pmid] = title  # add to dictionary
            abstract_elem = root.find('.//AbstractText')  # extract the abstract
            abstract = abstract_elem.text.strip()  # remove any new lines, white space, etc.
            abstracts[pmid] = abstract  # add to dictionary
            mesh_heading_list = root.findall('.//PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading')
            # extract MeSH terms
            mesh_terms[pmid] = [mesh.find('DescriptorName').text for mesh in mesh_heading_list]
            # add MeSH to dictionary
    # parse the abstracts
    # initialize empty list to write the dictionaries to
    cleaned_abstracts = []
    for pmid, abstract in abstracts.items():
        # abstracts contains the PMID and abstract
        # make set of summaries based on the abstract using two methods
        # first up, networkx
        sentences = sent_tokenize(abstract)  # get every sentence in the abstract
        count_vectorizer = CountVectorizer()  # apply count vectorize, or tally the number of words in each sentence
        X = count_vectorizer.fit_transform(sentences)  # apply to the sentences
        # create a graph of sentence similarity
        graph = nx.Graph()
        # construct the graph
        for i, sentence_i in enumerate(sentences):
            for j, sentence_j in enumerate(sentences):
                if i == j:
                    continue
                similarity = cosine_similarity(X[i], X[j])[0][0]  # calculate the cosine similarity between the nodes
                graph.add_edge(i, j, weight=similarity)  # make the edges the similarity calculation
        # compute PageRank scores for each sentence
        scores = nx.pagerank(graph)
        # get the indices of the top two sentences
        top_indices = np.array(sorted(scores, key=scores.get, reverse=True)[:2])
        # construct the summary by joining the top two sentences
        nx_summary = ' '.join([sentences[i] for i in sorted(top_indices)])
        # next summary
        # second set of summaries using NLTK; this uses the same sentences variable from above
        stop_words = set(stopwords.words('english') + list(punctuation))
        # calculate the frequency of each word in the abstract
        # generate the word frequencies with a count
        word_frequencies = {}
        for word in abstract.split():
            if word.lower() not in stop_words:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1
        # calculate the score of each sentence based on the frequency of its words using word_frequencies
        sentence_scores = {}
        for sentence in sentences:
            for word in sentence.split():
                if word.lower() in word_frequencies:
                    if len(sentence.split()) < 30:
                        if sentence not in sentence_scores:
                            sentence_scores[sentence] = word_frequencies[word]
                        else:
                            sentence_scores[sentence] += word_frequencies[word]
        # sort the sentences by their score and return the top 2
        summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:2]
        # join the top 2 sentences to create the summary
        nltk_summary = ' '.join(summary_sentences)
        # combine the summaries
        total_summary = f"Summary 1: {nx_summary} Summary 2: {nltk_summary}"
        # maybe insert a new line in the output table
        # add dictionary to final list
        cleaned_abstracts.append(
            {'doi': doi_list[id_list.index(pmid)], 'pmid': pmid, 'title': titles[pmid], 'abstract': abstract,
             'summary': total_summary, 'mesh_terms': mesh_terms[pmid]})
    # keyword generating
    # now, from the abstracts, return the keywords using scikit-learn
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')  # create a TF-IDF vectorizer
    corpus = [a['abstract'] for a in cleaned_abstracts]  # extract each abstract in the list of dictionaries
    # iterate through the corpus
    for i, abstract in enumerate(corpus):
        tfidf_x = tfidf_vectorizer.fit_transform([abstract])  # apply the vectorizer to the abstract
        feature_names = tfidf_vectorizer.get_feature_names_out()  # return feature names
        idf_scores = tfidf_vectorizer.idf_  # the IDF scores
        keyword_scores = defaultdict(float)  # dictionary for the scores as floats
        for j in range(tfidf_x.shape[1]):
            keyword_scores[feature_names[j]] += tfidf_x[0, j] * idf_scores[j]  # save the scores
        top_keywords = sorted(keyword_scores, key=keyword_scores.get, reverse=True)[:5]  # top 5
        # to return the keywords with their scores instead:
        # top_keywords = sorted(keyword_scores.items(), key=itemgetter(1), reverse=True)[:5]
        cleaned_abstracts[i]['scikit_keywords'] = top_keywords  # store the scikit-learn keywords
        # now create keyword pairs using NLTK, since the scikit-learn keywords are generally single words
        # initialize the BigramAssocMeasures, which scores the relationship between pairs of words
        bigram_measures = BigramAssocMeasures()
        words = word_tokenize(abstract.lower())  # tokenize the abstract
        stop_words = stopwords.words("english")
        # add some custom stop words that are common in research papers
        custom_stop_words = ['instead', 'study', 'results', 'analysis', 'method', 'data', 'experiment', 'figure',
                             'table', 'author', 'et al.', 'conclusion', 'discussion', 'findings', 'significant',
                             'difference', 'effect', 'increase', 'decrease', 'reduction', 'however', 'moreover',
                             'thus', 'therefore', 'also', 'similarly', 'hence', 'namely', 'cm', 'mm', 'mL', 'kg',
                             'accounts', 'approximation', 'across', 'research', 'approach', 'approaches']
        # extend the stop words with the custom list above
        stop_words.extend(custom_stop_words)
        # filter the words in the abstract and only keep those that are not stop words
        filtered_tokens = [token for token in words if token not in stop_words]
        # apply the BigramCollocationFinder to find word pairs in the filtered words/tokens
        finder = BigramCollocationFinder.from_words(filtered_tokens)
        # score the pairs and return the best 5
        keywords = finder.nbest(bigram_measures.pmi, 5)
        # join each pair into a single string
        keywords = [' '.join(keyword) for keyword in keywords]
        # store the NLTK keywords in the cleaned_abstracts dictionary as a key, value pair
        cleaned_abstracts[i]['nltk_keywords'] = keywords
    # add the date and time to each dictionary that will be written to the database
    time = datetime.now()
    for entry in cleaned_abstracts:
        entry['time'] = time.strftime("%Y-%m-%d %H:%M:%S")
    # insert into the database
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
    # add the list of dictionaries to the database as individual rows
    for entry in cleaned_abstracts:
        c.execute('''INSERT INTO papers (email, table_name, doi, pmid, title, abstract, summary, mesh_terms,
                     scikit_keywords, nltk_keywords, time)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                  (email, table_name, entry['doi'], entry['pmid'], entry['title'], entry['abstract'], entry['summary'],
                   ', '.join(entry['mesh_terms']), ', '.join(entry['scikit_keywords']),
                   ', '.join(entry['nltk_keywords']), entry['time']))
    conn.commit()
    conn.close()
    return cleaned_abstracts
# end abstract_analysis
def recommend_similar_articles(articles, email, api_key):
"""
content-based recommender for each research paper.
:param articles: input dictionary to the code. Should have the appropriate key, value pairs from abstract_analysis
:param email: users email to find a NCBI account, str.
:param api_key: users api key for NCBI from the if/else statement, str.
:return: returns the dictionary with three new key, value pairs for recommendations based on keywords.
"""
    recommended_articles = []
    # abstracts = [article['abstract'] for article in articles]
    for i, article in enumerate(articles):
        # iterate through the articles in the list
        try:
            # create a query from the list of keywords and mesh terms
            # get the scikit and NLTK keywords in each dictionary as well as the MeSH terms
            scikit_keywords = article.get("scikit_keywords", [])
            nltk_keywords = article.get('nltk_keywords', [])
            mesh_terms = article.get("mesh_terms", [])
            # combine the keywords together to generate a query
            sk_keyword_query = " ".join(scikit_keywords)
            nltk_keyword_query = " ".join(nltk_keywords)
            mesh_query = " ".join(mesh_terms)
            # collect the data for keywords
            Entrez.email = email  # email for Entrez
            Entrez.api_key = api_key  # api key for Entrez
            # scikit keywords
            # search Entrez with the scikit keywords as the criteria, return 5 entries
            keyword_handle = Entrez.esearch(db="pubmed", term=sk_keyword_query, retmax=5)
            keyword_record = Entrez.read(keyword_handle)  # create a record
            keyword_ids = keyword_record["IdList"]  # get the ID list
            keyword_handle = Entrez.efetch(db="pubmed", id=keyword_ids, retmode="xml")  # fetch the XML for the PMIDs
            keyword_records = Entrez.read(keyword_handle)
            # return the titles from the XML
            keyword_titles = [record["MedlineCitation"]["Article"]["ArticleTitle"] for record in
                              keyword_records["PubmedArticle"]]
            # NLTK keywords
            # repeat the same process as above but with the NLTK keywords (keyword pairs) as input
            nltk_keyword_handle = Entrez.esearch(db="pubmed", term=nltk_keyword_query, retmax=5)
            nltk_keyword_record = Entrez.read(nltk_keyword_handle)
            nltk_keyword_ids = nltk_keyword_record["IdList"]
            nltk_keyword_handle = Entrez.efetch(db="pubmed", id=nltk_keyword_ids, retmode="xml")
            nltk_keyword_records = Entrez.read(nltk_keyword_handle)
            nltk_keyword_titles = [record["MedlineCitation"]["Article"]["ArticleTitle"] for record in
                                   nltk_keyword_records["PubmedArticle"]]
            # collect the data for MeSH terms
            # repeat the same process as for the scikit-learn keywords but with the MeSH terms as input
            mesh_handle = Entrez.esearch(db="pubmed", term=mesh_query, retmax=5)
            mesh_record = Entrez.read(mesh_handle)
            mesh_ids = mesh_record["IdList"]
            mesh_handle = Entrez.efetch(db="pubmed", id=mesh_ids, retmode="xml")
            mesh_records = Entrez.read(mesh_handle)
            mesh_titles = [record["MedlineCitation"]["Article"]["ArticleTitle"] for record in
                           mesh_records["PubmedArticle"]]
            # add the recommended articles to the dictionary
            article["scikit_keywords_recs"] = keyword_titles
            article['nltk_keywords_recs'] = nltk_keyword_titles
            article["mesh_recs"] = mesh_titles
            recommended_articles.append(article)
        except Exception:
            # if anything fails (e.g. no results for the query), fall back to empty recommendation lists
            article["scikit_keywords_recs"] = []
            article["nltk_keywords_recs"] = []
            article["mesh_recs"] = []
            recommended_articles.append(article)  # append the results to the final list
    return recommended_articles
# end recommended_articles
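# Illustrative only: a minimal sketch of how the /summarizer form could be exercised programmatically
# once the app is running. The host/port are Flask's defaults, the function name and form values are
# placeholders, and the field names (email, table_name, api_key, links) match those read by summarizer().
# This helper is never called in this file.
def _example_summarizer_request():
    import urllib.request
    form = urllib.parse.urlencode({
        'email': 'user@example.com',      # placeholder email
        'table_name': 'my_search',        # placeholder label for this search
        'api_key': '',                    # empty -> summarizer() falls back to its default key
        'links': 'https://doi.org/10.1234/example, 10.1234/another',  # placeholder comma-separated DOIs
    }).encode()
    with urllib.request.urlopen('http://127.0.0.1:5000/summarizer', data=form) as response:
        return response.read().decode()   # the rendered output.html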
if __name__ == '__main__':
    # main function, deploy the above application
    app.run(debug=True)