# doi_summarizer/flask_app_old_version.py

"""================================================================================================= | |
Flask application that will return a html page with input for a list of DOIs, email address, and table name. | |
Given the list of DOIs, it will use Entrez to search for the matching PMID, return the article. It will then parse the | |
XML to find the abstract and the title. Using the abstract, it will use two methods: nltk and networkx to return two | |
summaries of the paper to give the user a better understanding of their paper based on the two results. It will then use | |
scikit-learn to return the top 5 keywords associated with the paper. This will then write their results to a database | |
and return the output in a new html file formatted as a table. | |
Rose Wilfong & Wenxuan Dong | |
=================================================================================================""" | |
from flask import Flask, render_template, request
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from Bio import Entrez
import networkx as nx
import xml.etree.ElementTree as ET
import sqlite3
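
# Note: sent_tokenize and stopwords require NLTK's 'punkt' and 'stopwords' data; if
# they are missing, run once:  import nltk; nltk.download('punkt'); nltk.download('stopwords')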

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('input.html')


@app.route('/query', methods=['POST'])
def process_data():
    email = request.form['email']
    table_name = request.form['table_name']
    doi_list = [doi.strip() for doi in request.form['links'].split(',')]
    # parse the DOIs for PMIDs, MeSH terms, abstracts, summaries, and keywords
    cleaned_abstracts = abstract_analysis(doi_list, email, table_name)
    # get recommendations based on keywords
    recommended_key = recommend_similar_articles(cleaned_abstracts, term_search='keywords')
    # get recommendations based on MeSH terms
    recommended_mesh = recommend_similar_articles(cleaned_abstracts, term_search='mesh_terms')
    # render the output page
    return render_template('output.html', results=cleaned_abstracts, rec_key=recommended_key, rec_mesh=recommended_mesh)
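
# input.html is expected to POST three fields: 'email', 'table_name', and 'links'
# (a comma-separated list of DOIs); output.html renders the results table.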


def extract_doi(url):
    """
    Remove the doi.org prefix if it is present in a DOI supplied by the user.
    :param url: one string from the DOI list
    :return: the cleaned string
    """
    prefix = "https://doi.org/"
    if url.startswith(prefix):
        return url[len(prefix):]
    return url
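
# Illustrative usage (hypothetical DOI, not from the original script):
#   extract_doi("https://doi.org/10.1000/xyz123")  ->  "10.1000/xyz123"
#   extract_doi("10.1000/xyz123")                  ->  "10.1000/xyz123"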


def abstract_analysis(doi_list, email, table_name):
    """
    Main function of the script: finds the PMID for each DOI, extracts the title and abstract,
    generates the two summaries and the keywords, and collects everything in a list of
    dictionaries called cleaned_abstracts, which is written to the table shown in output.html.
    :param doi_list: list of DOIs, from the comma-separated form input.
    :param email: the user's email address, str.
    :param table_name: the name the user wants for this search; may be reused, str.
    :return: list of dictionaries called cleaned_abstracts.
    """
    Entrez.email = 'rwilfong@purdue.edu'
    Entrez.api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'
    # given the list of DOIs, find the PMIDs and article titles
    id_list = []
    titles = {}
    abstracts = {}
    mesh_terms = {}
    doi_map = {}  # maps each found PMID back to the DOI that produced it
    dois = [extract_doi(doi) for doi in doi_list if extract_doi(doi)]  # clean DOIs before iterating
    for doi in dois:
        handle = Entrez.esearch(db='pubmed', term=doi)  # search PubMed with the DOI
        record = Entrez.read(handle)
        handle.close()
        if len(record['IdList']) > 0:  # if the IdList has a record, continue
            pmid = record['IdList'][0]
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')
            xml = handle.read()
            handle.close()
            root = ET.fromstring(xml)
            article = root.find('.//PubmedArticle/MedlineCitation/Article')
            abstract_elem = root.find('.//AbstractText')
            if article is None or abstract_elem is None or abstract_elem.text is None:
                continue  # skip records without a parsable article or abstract
            title = article.find('ArticleTitle').text  # extract the title
            id_list.append(pmid)
            doi_map[pmid] = doi  # remember which DOI produced this PMID
            titles[pmid] = title  # add to dictionary
            abstracts[pmid] = abstract_elem.text.strip()  # extract the abstract
            # extract the MeSH terms
            mesh_heading_list = root.findall('.//PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading')
            mesh_terms[pmid] = [mesh.find('DescriptorName').text for mesh in mesh_heading_list]
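    # at this point each found paper is keyed by PMID, e.g. (hypothetical values):
    #   titles     = {'12345678': 'Some Article Title'}
    #   abstracts  = {'12345678': 'BACKGROUND: ...'}
    #   mesh_terms = {'12345678': ['Humans', 'Genomics']}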
    # parse the abstracts
    cleaned_abstracts = []
    for pmid, abstract in abstracts.items():
        # replace Medline-style section tags (e.g. 'BACKGROUND-') with placeholders
        # so they survive the whitespace cleanup below
        placeholders = {}
        abstract_lines = abstract.split('\n')
        for i in range(len(abstract_lines)):
            line = abstract_lines[i]
            if line.startswith(' ') and '-' in line:  # guard against lines with no tag
                tag, value = line.split('-', 1)
                placeholder = f'__{tag.strip()}__'
                abstract_lines[i] = placeholder + value
                placeholders[placeholder] = tag.strip()
        abstract = '\n'.join(abstract_lines)
        # collapse newlines and extra whitespace
        abstract = ' '.join(abstract.split())
        # restore the placeholders as angle-bracketed tags
        for placeholder, tag in placeholders.items():
            abstract = abstract.replace(placeholder, f'<{tag}>')
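        # worked example (hypothetical): the line '  BACKGROUND- Aims of the study'
        # becomes '__BACKGROUND__ Aims of the study' before whitespace collapsing,
        # and is finally rewritten as '<BACKGROUND> Aims of the study'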
        # make two summaries of the abstract using two methods
        # first up, networkx
        sentences = sent_tokenize(abstract)  # every sentence in the abstract
        count_vectorizer = CountVectorizer()
        X = count_vectorizer.fit_transform(sentences)
        # create a graph of sentence similarity
        graph = nx.Graph()
        for i, sentence_i in enumerate(sentences):
            for j, sentence_j in enumerate(sentences):
                if i == j:
                    continue
                similarity = cosine_similarity(X[i], X[j])[0][0]
                graph.add_edge(i, j, weight=similarity)
        # compute PageRank scores for each sentence (a single-sentence abstract
        # produces an empty graph, so fall back to the sentence itself)
        scores = nx.pagerank(graph) if len(sentences) > 1 else {0: 1.0}
        # indices of the top two sentences, restored to document order
        top_indices = sorted(sorted(scores, key=scores.get, reverse=True)[:2])
        # construct the summary by joining the top two sentences
        nx_summary = ' '.join(sentences[i] for i in top_indices)
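        # this is essentially the TextRank scheme: sentences are nodes, edge weights
        # are bag-of-words cosine similarities, and PageRank centrality picks the
        # most representative sentences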
        # second summary, using NLTK frequency scoring (reuses `sentences` from above)
        stop_words = set(stopwords.words('english') + list(punctuation))
        # calculate the frequency of each non-stopword in the abstract (case-insensitive)
        word_frequencies = {}
        for word in abstract.split():
            w = word.lower()
            if w not in stop_words:
                word_frequencies[w] = word_frequencies.get(w, 0) + 1
        # score each sentence under 30 words by the frequency of its words
        sentence_scores = {}
        for sentence in sentences:
            if len(sentence.split()) >= 30:
                continue
            for word in sentence.split():
                w = word.lower()
                if w in word_frequencies:
                    sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_frequencies[w]
        # sort the sentences by score and keep the top 2
        summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:2]
        # join the top 2 sentences to create the summary
        nltk_summary = ' '.join(summary_sentences)
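        # note: the 30-word cap is a heuristic that keeps very long sentences out of
        # the summary; unlike the graph method above, this scoring considers only
        # word frequency, not sentence position or redundancy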
        # combine the summaries
        total_summary = f"Summary 1: {nx_summary} Summary 2: {nltk_summary}"
        # TODO: maybe insert a newline between the two summaries in the output table
        # add to the final list
        cleaned_abstracts.append(
            {'doi': doi_map[pmid], 'pmid': pmid, 'title': titles[pmid], 'abstract': abstract,
             'summary': total_summary, 'mesh_terms': mesh_terms[pmid]})
    # now from the abstracts, extract the keywords using scikit-learn; fit one
    # TF-IDF model over the whole corpus so the idf weights are meaningful
    corpus = [a['abstract'] for a in cleaned_abstracts]
    if corpus:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_x = tfidf_vectorizer.fit_transform(corpus)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        for i in range(len(corpus)):
            tfidf_scores = tfidf_x[i].toarray().ravel()
            top_indices = tfidf_scores.argsort()[::-1][:5]  # top 5
            # to return the scores as well, keep (feature_names[j], tfidf_scores[j]) pairs instead
            cleaned_abstracts[i]['keywords'] = [feature_names[j] for j in top_indices if tfidf_scores[j] > 0]
    # add the date and time, then write every entry to the database
    time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
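    # the original script assumes a pre-existing 'papers' table; the schema below is
    # a minimal sketch inferred from the INSERT statement (column names match, TEXT
    # affinity is an assumption) and only takes effect if the table is missing
    c.execute('''CREATE TABLE IF NOT EXISTS papers
                 (email TEXT, table_name TEXT, doi TEXT, pmid TEXT, title TEXT, abstract TEXT,
                  summary TEXT, mesh_terms TEXT, keywords TEXT, time TEXT)''')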
    for entry in cleaned_abstracts:
        entry['time'] = time
        c.execute('''INSERT INTO papers (email, table_name, doi, pmid, title, abstract, summary, mesh_terms, keywords, time)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                  (email, table_name, entry['doi'], entry['pmid'], entry['title'], entry['abstract'], entry['summary'],
                   ', '.join(entry['mesh_terms']), ', '.join(entry['keywords']), entry['time']))
    conn.commit()
    conn.close()
    return cleaned_abstracts
# end abstract_analysis


def recommend_similar_articles(articles, term_search):
    """
    Recommendation function for finding additional articles related to the ones searched. It queries
    Entrez with either the keywords or the MeSH terms and ranks the hits by their similarity to the
    original abstract.
    :param articles: the list returned by abstract_analysis, or a list of dictionaries with the same layout.
    :param term_search: either 'keywords' or 'mesh_terms'; the field used to build the search query.
    :return: list of dictionaries pairing each original article with its recommended titles.
    """
    # create a query from the list of keywords or MeSH terms
    abstracts = [article["abstract"] for article in articles]
    queries = [" ".join(article[term_search]) for article in articles]
    recommended_articles = []
    for i, query in enumerate(queries):
        try:
            # collect candidate articles from PubMed
            Entrez.email = "rwilfong@purdue.edu"  # replace with your email address
            Entrez.api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'
            handle = Entrez.esearch(db="pubmed", term=query, retmax=50)
            record = Entrez.read(handle)
            handle.close()
            ids = record["IdList"]
            if not ids:
                recommended_articles.append({"original": articles[i], "recommended": []})
                continue
            handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
            records = Entrez.read(handle)
            handle.close()
            # keep the title and abstract of every candidate that has both
            titles, candidate_abstracts = [], []
            for rec in records["PubmedArticle"]:
                try:
                    article = rec["MedlineCitation"]["Article"]
                    abstract_text = " ".join(str(part) for part in article["Abstract"]["AbstractText"])
                    titles.append(str(article["ArticleTitle"]))
                    candidate_abstracts.append(abstract_text)
                except KeyError:
                    pass
            if not candidate_abstracts:
                recommended_articles.append({"original": articles[i], "recommended": []})
                continue
            # vectorize the candidates together with the original abstract (last row)
            vectorizer = TfidfVectorizer(stop_words="english")
            feature_matrix = vectorizer.fit_transform(candidate_abstracts + [abstracts[i]])
            # similarity of the original abstract to every candidate
            similarity_scores = cosine_similarity(feature_matrix[-1], feature_matrix[:-1]).ravel()
            # titles of the five most similar candidates
            similar_indices = similarity_scores.argsort()[::-1][:5]
            similar_titles = [titles[index] for index in similar_indices]
            recommended_articles.append({"original": articles[i], "recommended": similar_titles})
        except Exception as e:
            recommended_articles.append({"original": articles[i], "recommended": [], "error": str(e)})
    return recommended_articles
# end recommend_similar_articles
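
# To try the app locally (assuming templates/input.html and templates/output.html
# exist next to this file): run `python flask_app_old_version.py` and open
# http://127.0.0.1:5000/ in a browser.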

if __name__ == '__main__':
    app.run(debug=True)