"""=================================================================================================
Flask application that serves an HTML page with inputs for a list of DOIs, an email address, and a table name.
Given the list of DOIs, it uses Entrez to look up the matching PMID for each DOI and fetch the article. It then
parses the returned XML to find the title and abstract. From the abstract it builds two summaries, one with NLTK
and one with NetworkX, to give the user a better understanding of each paper, and uses scikit-learn to extract
the top 5 keywords associated with the paper. The results are written to a database and returned in a new HTML
page formatted as a table.
Rose Wilfong & Wenxuan Dong
================================================================================================="""
from flask import Flask, render_template, request
from datetime import datetime
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import networkx as nx
import numpy as np
import xml.etree.ElementTree as ET
import sqlite3
from Bio import Entrez
from Bio import Medline
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
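# NOTE: sent_tokenize and stopwords rely on the NLTK 'punkt' and 'stopwords' data packages; if they are not
# already installed, run nltk.download('punkt') and nltk.download('stopwords') once before starting the app.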
app = Flask(__name__)
@app.route('/')
def index():
    return render_template('input.html')
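# render_template looks for input.html (above) and output.html (used in process_data below) in the app's templates/ directory.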
@app.route('/query', methods=['POST'])
def process_data():
    email = request.form['email']
    table_name = request.form['table_name']
    doi_list = [doi.strip() for doi in request.form['links'].split(',')]
    # parse DOIs for PMIDs, MeSH terms, abstracts, summaries, and keywords
    cleaned_abstracts = abstract_analysis(doi_list, email, table_name)
    # get recommendations based on keywords
    recommended_key = recommend_similar_articles(cleaned_abstracts, term_search='keywords')
    # get recommendations based on MeSH terms
    recommended_mesh = recommend_similar_articles(cleaned_abstracts, term_search='mesh_terms')
    # render the output page with the results
    return render_template('output.html', results=cleaned_abstracts, rec_key=recommended_key, rec_mesh=recommended_mesh)
def extract_doi(url):
    """
    Remove the "https://doi.org/" prefix, if present, from a DOI string supplied by the user.
    :param url: one string from the DOI list
    :return: the cleaned DOI string
    """
    prefix = "https://doi.org/"
    if url.startswith(prefix):
        return url[len(prefix):]
    else:
        return url
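# Illustrative example (not taken from the repo): extract_doi("https://doi.org/10.1000/example") returns
# "10.1000/example", while a bare DOI such as "10.1000/example" is passed through unchanged.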
def abstract_analysis(doi_list, email, table_name):
    """
    Main function of the script. It finds the PMID associated with each DOI, extracts the title and abstract,
    generates the two summaries and the top keywords, writes the results to the database, and returns a list of
    dictionaries called cleaned_abstracts that populates the table in output.html.
    :param doi_list: list of DOIs parsed from the user's comma-separated input.
    :param email: the user's email address, str.
    :param table_name: the name the user wants for this search; can be reused across searches, str.
    :return: list of dictionaries called cleaned_abstracts.
    """
    Entrez.email = 'rwilfong@purdue.edu'
    Entrez.api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'
    # given a list of DOIs, find the PMIDs and article titles
    id_list = []
    titles = {}
    abstracts = {}
    mesh_terms = {}
    pmid_to_doi = {}
    dois = [extract_doi(doi) for doi in doi_list if extract_doi(doi)]  # clean DOIs before iterating
    for doi in dois:
        handle = Entrez.esearch(db='pubmed', term=doi)  # search PubMed with the DOI
        record = Entrez.read(handle)
        handle.close()
        if len(record['IdList']) > 0:  # if the IdList has a record, continue
            pmid = record['IdList'][0]
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')
            xml = handle.read()
            handle.close()
            root = ET.fromstring(xml)
            article = root.find('.//PubmedArticle/MedlineCitation/Article')
            title = article.find('ArticleTitle').text  # extract title
            id_list.append(pmid)
            pmid_to_doi[pmid] = doi  # remember which DOI produced this PMID
            titles[pmid] = title  # add to dictionary
            abstract_elem = root.find('.//AbstractText')  # extract abstract
            abstract = abstract_elem.text.strip()
            abstracts[pmid] = abstract  # add to dictionary
            mesh_heading_list = root.findall('.//PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading')
            # extract MeSH terms
            mesh_terms[pmid] = [mesh.find('DescriptorName').text for mesh in mesh_heading_list]  # add to dictionary
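    # The XPath queries above assume the standard PubMed EFetch layout: PubmedArticle > MedlineCitation > Article
    # for the title and abstract, and MedlineCitation > MeshHeadingList > MeshHeading > DescriptorName for MeSH terms.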
    # parse the abstracts
    cleaned_abstracts = []
    for pmid, abstract in abstracts.items():
        # replace Medline tags with placeholders
        placeholders = {}
        abstract_lines = abstract.split('\n')
        for i in range(len(abstract_lines)):
            line = abstract_lines[i]
            if line.startswith(' '):
                tag, value = line.split('-', 1)
                placeholder = f'__{tag.strip()}__'
                abstract_lines[i] = placeholder + value
                placeholders[placeholder] = tag.strip()
        abstract = '\n'.join(abstract_lines)
        # remove remaining tags and extra whitespace
        abstract = ' '.join(abstract.split())
        # Replace placeholders with tags
        for placeholder, tag in placeholders.items():
            abstract = abstract.replace(placeholder, f'<{tag}>')
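        # (This cleanup only fires when the abstract text contains Medline-style "TAG - value" lines that begin
        # with spaces; abstracts fetched as XML above will usually pass through unchanged.)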
        # make set of summaries based on the abstract using two methods
        # first up, Networkx
        sentences = sent_tokenize(abstract)  # get every sentence in the abstract
        count_vectorizer = CountVectorizer()
        X = count_vectorizer.fit_transform(sentences)
        # create a graph of sentence similarity
        graph = nx.Graph()
        for i, sentence_i in enumerate(sentences):
            for j, sentence_j in enumerate(sentences):
                if i == j:
                    continue
                similarity = cosine_similarity(X[i], X[j])[0][0]
                graph.add_edge(i, j, weight=similarity)
        # compute PageRank scores for each sentence
        scores = nx.pagerank(graph)
        # get the indices of the top two sentences
        top_indices = np.array(sorted(scores, key=scores.get, reverse=True)[:2])
        # construct the summary by joining the top two sentences in document order
        nx_summary = ' '.join([sentences[i] for i in sorted(top_indices)])
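        # (Ranking sentences by PageRank over a cosine-similarity graph is essentially a TextRank-style
        # extractive summary.)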
        # next summary
        # second set of summaries using NLTK; this reuses the sentences variable from above
        stop_words = set(stopwords.words('english') + list(punctuation))
        # calculate the frequency of each (lowercased) word in the abstract
        word_frequencies = {}
        for word in abstract.split():
            word = word.lower()
            if word not in stop_words:
                if word not in word_frequencies:
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1
        # calculate the score of each sentence based on the frequency of its words
        sentence_scores = {}
        for sentence in sentences:
            for word in sentence.split():
                if word.lower() in word_frequencies:
                    if len(sentence.split()) < 30:
                        if sentence not in sentence_scores:
                            sentence_scores[sentence] = word_frequencies[word.lower()]
                        else:
                            sentence_scores[sentence] += word_frequencies[word.lower()]
        # sort the sentences by their score and return the top 2
        summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:2]
        # join the top 2 sentences to create the summary
        nltk_summary = ' '.join(summary_sentences)
        # Combine the summaries
        total_summary = f"Summary 1: {nx_summary} Summary 2: {nltk_summary}"
        # maybe insert a new line between the summaries in the output table
        # add to the final list
        cleaned_abstracts.append(
            {'doi': pmid_to_doi[pmid], 'pmid': pmid, 'title': titles[pmid], 'abstract': abstract,
             'summary': total_summary, 'mesh_terms': mesh_terms[pmid]})
    # now from the abstracts, return the keywords using scikit-learn
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    corpus = [a['abstract'] for a in cleaned_abstracts]
    # iterate through corpus
    for i, abstract in enumerate(corpus):
        tfidf_x = tfidf_vectorizer.fit_transform([abstract])
        feature_names = tfidf_vectorizer.get_feature_names_out()
        idf_scores = tfidf_vectorizer.idf_
        keyword_scores = defaultdict(float)
        for j in range(tfidf_x.shape[1]):
            keyword_scores[feature_names[j]] += tfidf_x[0, j] * idf_scores[j]
        top_keywords = sorted(keyword_scores, key=keyword_scores.get, reverse=True)[:5]  # top 5
        # to return with the scores associated:
        # top_keywords = sorted(keyword_scores.items(), key=itemgetter(1), reverse=True)[:5]
        cleaned_abstracts[i]['keywords'] = top_keywords
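    # Note: because the vectorizer is re-fit on a single abstract each iteration, every idf_ value is 1.0, so the
    # ranking above effectively orders terms by their (normalized) frequency within that abstract.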
    # add the date/time and insert each result into the database
    time = datetime.now()
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
    for entry in cleaned_abstracts:
        entry['time'] = time.strftime("%Y-%m-%d %H:%M:%S")
        c.execute('''INSERT INTO papers (email, table_name, doi, pmid, title, abstract, summary, mesh_terms, keywords, time)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                  (email, table_name, entry['doi'], entry['pmid'], entry['title'], entry['abstract'], entry['summary'],
                   ', '.join(entry['mesh_terms']), ', '.join(entry['keywords']), entry['time']))
    conn.commit()
    conn.close()
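    # The INSERT above assumes a 'papers' table already exists in database.db. A schema along these lines would
    # satisfy it (a sketch inferred from the columns used here, not taken from this repository):
    #   CREATE TABLE IF NOT EXISTS papers (email TEXT, table_name TEXT, doi TEXT, pmid TEXT, title TEXT,
    #       abstract TEXT, summary TEXT, mesh_terms TEXT, keywords TEXT, time TEXT);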
    return cleaned_abstracts
# end abstract_analysis
def recommend_similar_articles(articles, term_search):
    """
    Recommendation function for finding additional articles related to the ones being searched. It queries Entrez
    with either the keywords or the MeSH terms of each article to find similar articles.
    :param articles: the list of dictionaries returned by abstract_analysis, or a list with the same layout.
    :param term_search: either 'keywords' or 'mesh_terms'; the field used to build the recommendation query.
    :return: list of dictionaries, each holding the original article and the recommended article titles.
    """
    # create a query from the list of keywords or MeSH terms
    abstracts = [article["abstract"] for article in articles]
    queries = [" ".join(article[term_search]) for article in articles]
    recommended_articles = []
    for i, query in enumerate(queries):
        try:
            # collect candidate articles from PubMed for this query
            Entrez.email = "rwilfong@purdue.edu"  # replace with your email address
            Entrez.api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'
            handle = Entrez.esearch(db="pubmed", term=query, retmax=50)
            record = Entrez.read(handle)
            handle.close()
            ids = record["IdList"]
            handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
            records = Entrez.read(handle)
            handle.close()
            titles = []
            for record in records["PubmedArticle"]:
                try:
                    title = record["MedlineCitation"]["Article"]["ArticleTitle"]
                    titles.append(title)
                except KeyError:
                    pass
            # create a feature matrix over the original abstract plus the candidate titles
            vectorizer = TfidfVectorizer()
            feature_matrix = vectorizer.fit_transform([abstracts[i]] + titles)
            # calculate similarity between the original abstract (row 0) and each candidate title
            similarity_scores = cosine_similarity(feature_matrix[0], feature_matrix[1:]).flatten()
            # get the indices of the top 5 most similar candidates
            similar_indices = similarity_scores.argsort()[::-1][:5]
            # get the titles of the most similar articles
            similar_titles = [titles[index] for index in similar_indices]
            recommended_articles.append({"original": articles[i], "recommended": similar_titles})
        except Exception as e:
            recommended_articles.append({"original": articles[i], "recommended": [], "error": str(e)})
    return recommended_articles
# end recommend_similar_articles
if __name__ == '__main__':
    app.run(debug=True)