from datetime import datetime
from collections import defaultdict
import xml.etree.ElementTree as ET

import numpy as np
import networkx as nx
from Bio import Entrez
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

"""
This program takes an input of a list of doi's. It then uses Entrez to find the PMIDs based on the DOIs and returns the abstract and title.
Then it summarizes the abstract and returns the top 5 keywords.
"""

def abstract_analysis(doi_list):
    Entrez.email = 'rwilfong@purdue.edu'
    # Given a list of DOIs, find the PMIDs and article titles
    id_list = []
    titles = {}
    dois = {}  # map each PMID back to the DOI that matched it
    for doi in doi_list:
        handle = Entrez.esearch(db='pubmed', term=doi)
        record = Entrez.read(handle)
        handle.close()
        if len(record['IdList']) > 0:
            pmid = record['IdList'][0]
            handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')
            xml = handle.read()
            handle.close()
            root = ET.fromstring(xml)
            article = root.find('.//PubmedArticle/MedlineCitation/Article')
            title = article.find('ArticleTitle').text
            id_list.append(pmid)
            titles[pmid] = title
            dois[pmid] = doi
    # Find the abstracts associated with the PMIDs
    abstracts = {}
    for pmid in id_list:
        handle = Entrez.efetch(db='pubmed', id=pmid, rettype='abstract', retmode='text')
        abstract = handle.read()
        handle.close()
        abstracts[pmid] = abstract.strip()
    # Parse the abstracts
    cleaned_abstracts = []
    for pmid, abstract in abstracts.items():
        # Replace Medline tags with placeholders
        placeholders = {}
        abstract_lines = abstract.split('\n')
        for i in range(len(abstract_lines)):
            line = abstract_lines[i]
            # Guard on '-' so indented lines without a tag cannot crash the split
            if line.startswith(' ') and '-' in line:
                tag, value = line.split('-', 1)
                placeholder = f'__{tag.strip()}__'
                abstract_lines[i] = placeholder + value
                placeholders[placeholder] = tag.strip()
        abstract = '\n'.join(abstract_lines)
        # Remove remaining tags and extra whitespace
        abstract = ' '.join(abstract.split())
        # Replace placeholders with tags
        for placeholder, tag in placeholders.items():
            abstract = abstract.replace(placeholder, f'<{tag}>')
        sentences = sent_tokenize(abstract)  # get every sentence in the abstract
        count_vectorizer = CountVectorizer()
        X = count_vectorizer.fit_transform(sentences)
        # Create a graph of sentence similarity (TextRank-style): nodes are
        # sentence indices, edge weights are pairwise cosine similarities
        graph = nx.Graph()
        for i in range(len(sentences)):
            for j in range(i + 1, len(sentences)):
                similarity = cosine_similarity(X[i], X[j])[0][0]
                graph.add_edge(i, j, weight=similarity)
        # Compute PageRank scores for each sentence
        scores = nx.pagerank(graph)
        # Get the indices of the top two sentences
        top_indices = sorted(scores, key=scores.get, reverse=True)[:2]
        # Construct the summary by joining the top two sentences in document order
        summary = ' '.join([sentences[i] for i in sorted(top_indices)])
        cleaned_abstracts.append(
            {'doi': dois[pmid], 'title': titles[pmid], 'abstract': abstract, 'summary': summary})
    # Score keywords with TF-IDF. Fit once on the whole corpus so the IDF
    # weights are meaningful; refitting on a single abstract gives every term
    # an IDF of 1.0 and reduces the scores to plain term frequency.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    corpus = [a['abstract'] for a in cleaned_abstracts]
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    # iterate through the corpus and rank each abstract's terms
    for i in range(len(corpus)):
        row = tfidf_matrix[i].toarray().ravel()
        keyword_scores = defaultdict(float)
        for j in np.nonzero(row)[0]:
            keyword_scores[feature_names[j]] += row[j]
        top_keywords = sorted(keyword_scores, key=keyword_scores.get, reverse=True)[:5]
        # to return with the scores associated (needs `from operator import itemgetter`):
        # top_keywords = sorted(keyword_scores.items(), key=itemgetter(1), reverse=True)[:5]
        cleaned_abstracts[i]['keywords'] = top_keywords
    # Add the date and time to each record
    time = datetime.now()
    for entry in cleaned_abstracts:
        entry['time'] = time.strftime("%Y-%m-%d %H:%M:%S")
    print(cleaned_abstracts)
    return cleaned_abstracts


if __name__ == '__main__':
    doi_list = ['10.1016/j.jsb.2006.07.014', '10.1016/j.jsb.2017.07.007']
    abstract_analysis(doi_list)