# doi_summarizer/flask_app_old_version.py

"""================================================================================================= | |
Flask application that will return a html page with input for a list of DOIs, email address, and table name. | |
Given the list of DOIs, it will use Entrez to search for the matching PMID, return the article. It will then parse the | |
XML to find the abstract and the title. Using the abstract, it will use two methods: nltk and networkx to return two | |
summaries of the paper to give the user a better understanding of their paper based on the two results. It will then use | |
scikit-learn to return the top 5 keywords associated with the paper. This will then write their results to a database | |
and return the output in a new html file formatted as a table. | |
Rose Wilfong & Wenxuan Dong | |
=================================================================================================""" | |
from flask import Flask, render_template, request
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from Bio import Entrez
import networkx as nx
import xml.etree.ElementTree as ET
import sqlite3
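
# Note: sent_tokenize and stopwords require NLTK's 'punkt' and 'stopwords' data; if
# they are missing, run once:  import nltk; nltk.download('punkt'); nltk.download('stopwords')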

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('input.html')


@app.route('/query', methods=['POST'])
def process_data():
    email = request.form['email']
    table_name = request.form['table_name']
    doi_list = [doi.strip() for doi in request.form['links'].split(',')]
    # parse the DOIs for PMIDs, MeSH terms, abstracts, summaries, and keywords
    cleaned_abstracts = abstract_analysis(doi_list, email, table_name)
    # get recommendations based on keywords
    recommended_key = recommend_similar_articles(cleaned_abstracts, term_search='keywords')
    # get recommendations based on MeSH terms
    recommended_mesh = recommend_similar_articles(cleaned_abstracts, term_search='mesh_terms')
    # render the output page
    return render_template('output.html', results=cleaned_abstracts, rec_key=recommended_key, rec_mesh=recommended_mesh)
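
# input.html is expected to POST three fields: 'email', 'table_name', and 'links'
# (a comma-separated list of DOIs); output.html renders the results table.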


def extract_doi(url):
    """
    Remove the doi.org prefix if it is present in a DOI supplied by the user.
    :param url: one string from the DOI list
    :return: the cleaned string
    """
    prefix = "https://doi.org/"
    if url.startswith(prefix):
        return url[len(prefix):]
    return url
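
# Illustrative usage (hypothetical DOI, not from the original script):
#   extract_doi("https://doi.org/10.1000/xyz123")  ->  "10.1000/xyz123"
#   extract_doi("10.1000/xyz123")                  ->  "10.1000/xyz123"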


def abstract_analysis(doi_list, email, table_name):
    """
    Main function of the script: finds the PMID for each DOI, extracts the title and abstract,
    generates the two summaries and the keywords, and collects everything in a list of
    dictionaries called cleaned_abstracts, which is written to the table shown in output.html.
    :param doi_list: list of DOIs, from the comma-separated form input.
    :param email: the user's email address, str.
    :param table_name: the name the user wants for this search; may be reused, str.
    :return: list of dictionaries called cleaned_abstracts.
    """
    Entrez.email = 'rwilfong@purdue.edu'
    Entrez.api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'
    # given the list of DOIs, find the PMIDs and article titles
    id_list = []
    titles = {}
    abstracts = {}
    mesh_terms = {}
    doi_map = {}  # maps each found PMID back to the DOI that produced it
    dois = [extract_doi(doi) for doi in doi_list if extract_doi(doi)]  # clean DOIs before iterating
    for doi in dois:
        handle = Entrez.esearch(db='pubmed', term=doi)  # search PubMed with the DOI
        record = Entrez.read(handle)
        handle.close()
        if len(record['IdList']) > 0:  # if the IdList has a record, continue
            pmid = record['IdList'][0]
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')
            xml = handle.read()
            handle.close()
            root = ET.fromstring(xml)
            article = root.find('.//PubmedArticle/MedlineCitation/Article')
            abstract_elem = root.find('.//AbstractText')
            if article is None or abstract_elem is None or abstract_elem.text is None:
                continue  # skip records without a parsable article or abstract
            title = article.find('ArticleTitle').text  # extract the title
            id_list.append(pmid)
            doi_map[pmid] = doi  # remember which DOI produced this PMID
            titles[pmid] = title  # add to dictionary
            abstracts[pmid] = abstract_elem.text.strip()  # extract the abstract
            # extract the MeSH terms
            mesh_heading_list = root.findall('.//PubmedArticle/MedlineCitation/MeshHeadingList/MeshHeading')
            mesh_terms[pmid] = [mesh.find('DescriptorName').text for mesh in mesh_heading_list]
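    # at this point each found paper is keyed by PMID, e.g. (hypothetical values):
    #   titles     = {'12345678': 'Some Article Title'}
    #   abstracts  = {'12345678': 'BACKGROUND: ...'}
    #   mesh_terms = {'12345678': ['Humans', 'Genomics']}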
    # parse the abstracts
    cleaned_abstracts = []
    for pmid, abstract in abstracts.items():
        # replace Medline-style section tags (e.g. 'BACKGROUND-') with placeholders
        # so they survive the whitespace cleanup below
        placeholders = {}
        abstract_lines = abstract.split('\n')
        for i in range(len(abstract_lines)):
            line = abstract_lines[i]
            if line.startswith(' ') and '-' in line:  # guard against lines with no tag
                tag, value = line.split('-', 1)
                placeholder = f'__{tag.strip()}__'
                abstract_lines[i] = placeholder + value
                placeholders[placeholder] = tag.strip()
        abstract = '\n'.join(abstract_lines)
        # collapse newlines and extra whitespace
        abstract = ' '.join(abstract.split())
        # restore the placeholders as angle-bracketed tags
        for placeholder, tag in placeholders.items():
            abstract = abstract.replace(placeholder, f'<{tag}>')
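        # worked example (hypothetical): the line '  BACKGROUND- Aims of the study'
        # becomes '__BACKGROUND__ Aims of the study' before whitespace collapsing,
        # and is finally rewritten as '<BACKGROUND> Aims of the study'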
        # make two summaries of the abstract using two methods
        # first up, networkx
        sentences = sent_tokenize(abstract)  # every sentence in the abstract
        count_vectorizer = CountVectorizer()
        X = count_vectorizer.fit_transform(sentences)
        # create a graph of sentence similarity
        graph = nx.Graph()
        for i, sentence_i in enumerate(sentences):
            for j, sentence_j in enumerate(sentences):
                if i == j:
                    continue
                similarity = cosine_similarity(X[i], X[j])[0][0]
                graph.add_edge(i, j, weight=similarity)
        # compute PageRank scores for each sentence (a single-sentence abstract
        # produces an empty graph, so fall back to the sentence itself)
        scores = nx.pagerank(graph) if len(sentences) > 1 else {0: 1.0}
        # indices of the top two sentences, restored to document order
        top_indices = sorted(sorted(scores, key=scores.get, reverse=True)[:2])
        # construct the summary by joining the top two sentences
        nx_summary = ' '.join(sentences[i] for i in top_indices)
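        # this is essentially the TextRank scheme: sentences are nodes, edge weights
        # are bag-of-words cosine similarities, and PageRank centrality picks the
        # most representative sentences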
        # second summary, using NLTK frequency scoring (reuses `sentences` from above)
        stop_words = set(stopwords.words('english') + list(punctuation))
        # calculate the frequency of each non-stopword in the abstract (case-insensitive)
        word_frequencies = {}
        for word in abstract.split():
            w = word.lower()
            if w not in stop_words:
                word_frequencies[w] = word_frequencies.get(w, 0) + 1
        # score each sentence under 30 words by the frequency of its words
        sentence_scores = {}
        for sentence in sentences:
            if len(sentence.split()) >= 30:
                continue
            for word in sentence.split():
                w = word.lower()
                if w in word_frequencies:
                    sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_frequencies[w]
        # sort the sentences by score and keep the top 2
        summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:2]
        # join the top 2 sentences to create the summary
        nltk_summary = ' '.join(summary_sentences)
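        # note: the 30-word cap is a heuristic that keeps very long sentences out of
        # the summary; unlike the graph method above, this scoring considers only
        # word frequency, not sentence position or redundancy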
        # combine the summaries
        total_summary = f"Summary 1: {nx_summary} Summary 2: {nltk_summary}"
        # TODO: maybe insert a newline between the two summaries in the output table
        # add to the final list
        cleaned_abstracts.append(
            {'doi': doi_map[pmid], 'pmid': pmid, 'title': titles[pmid], 'abstract': abstract,
             'summary': total_summary, 'mesh_terms': mesh_terms[pmid]})
    # now from the abstracts, extract the keywords using scikit-learn; fit one
    # TF-IDF model over the whole corpus so the idf weights are meaningful
    corpus = [a['abstract'] for a in cleaned_abstracts]
    if corpus:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_x = tfidf_vectorizer.fit_transform(corpus)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        for i in range(len(corpus)):
            tfidf_scores = tfidf_x[i].toarray().ravel()
            top_indices = tfidf_scores.argsort()[::-1][:5]  # top 5
            # to return the scores as well, keep (feature_names[j], tfidf_scores[j]) pairs instead
            cleaned_abstracts[i]['keywords'] = [feature_names[j] for j in top_indices if tfidf_scores[j] > 0]
    # add the date and time, then write every entry to the database
    time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conn = sqlite3.connect('database.db')
    c = conn.cursor()
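    # the original script assumes a pre-existing 'papers' table; the schema below is
    # a minimal sketch inferred from the INSERT statement (column names match, TEXT
    # affinity is an assumption) and only takes effect if the table is missing
    c.execute('''CREATE TABLE IF NOT EXISTS papers
                 (email TEXT, table_name TEXT, doi TEXT, pmid TEXT, title TEXT, abstract TEXT,
                  summary TEXT, mesh_terms TEXT, keywords TEXT, time TEXT)''')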
    for entry in cleaned_abstracts:
        entry['time'] = time
        c.execute('''INSERT INTO papers (email, table_name, doi, pmid, title, abstract, summary, mesh_terms, keywords, time)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                  (email, table_name, entry['doi'], entry['pmid'], entry['title'], entry['abstract'], entry['summary'],
                   ', '.join(entry['mesh_terms']), ', '.join(entry['keywords']), entry['time']))
    conn.commit()
    conn.close()
    return cleaned_abstracts
# end abstract_analysis


def recommend_similar_articles(articles, term_search):
    """
    Recommendation function for finding additional articles related to the ones searched. It queries
    Entrez with either the keywords or the MeSH terms and ranks the hits by their similarity to the
    original abstract.
    :param articles: the list returned by abstract_analysis, or a list of dictionaries with the same layout.
    :param term_search: either 'keywords' or 'mesh_terms'; the field used to build the search query.
    :return: list of dictionaries pairing each original article with its recommended titles.
    """
    # create a query from the list of keywords or MeSH terms
    abstracts = [article["abstract"] for article in articles]
    queries = [" ".join(article[term_search]) for article in articles]
    recommended_articles = []
    for i, query in enumerate(queries):
        try:
            # collect candidate articles from PubMed
            Entrez.email = "rwilfong@purdue.edu"  # replace with your email address
            Entrez.api_key = 'd881ac932ddb3b61dcb88feac3fcc450af09'
            handle = Entrez.esearch(db="pubmed", term=query, retmax=50)
            record = Entrez.read(handle)
            handle.close()
            ids = record["IdList"]
            if not ids:
                recommended_articles.append({"original": articles[i], "recommended": []})
                continue
            handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
            records = Entrez.read(handle)
            handle.close()
            # keep the title and abstract of every candidate that has both
            titles, candidate_abstracts = [], []
            for rec in records["PubmedArticle"]:
                try:
                    article = rec["MedlineCitation"]["Article"]
                    abstract_text = " ".join(str(part) for part in article["Abstract"]["AbstractText"])
                    titles.append(str(article["ArticleTitle"]))
                    candidate_abstracts.append(abstract_text)
                except KeyError:
                    pass
            if not candidate_abstracts:
                recommended_articles.append({"original": articles[i], "recommended": []})
                continue
            # vectorize the candidates together with the original abstract (last row)
            vectorizer = TfidfVectorizer(stop_words="english")
            feature_matrix = vectorizer.fit_transform(candidate_abstracts + [abstracts[i]])
            # similarity of the original abstract to every candidate
            similarity_scores = cosine_similarity(feature_matrix[-1], feature_matrix[:-1]).ravel()
            # titles of the five most similar candidates
            similar_indices = similarity_scores.argsort()[::-1][:5]
            similar_titles = [titles[index] for index in similar_indices]
            recommended_articles.append({"original": articles[i], "recommended": similar_titles})
        except Exception as e:
            recommended_articles.append({"original": articles[i], "recommended": [], "error": str(e)})
    return recommended_articles
# end recommend_similar_articles
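
# To try the app locally (assuming templates/input.html and templates/output.html
# exist next to this file): run `python flask_app_old_version.py` and open
# http://127.0.0.1:5000/ in a browser.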

if __name__ == '__main__':
    app.run(debug=True)