Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
doi_summarizer/summary_example.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
116 lines (100 sloc)
4.57 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from Bio import Entrez
import xml.etree.ElementTree as ET
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
"""
This program takes an input of a list of DOIs. It then uses Entrez to find the PMIDs based on the DOIs and returns the abstract and title.
Then it summarizes the abstract and returns the top 5 keywords.
"""
def _fetch_articles(doi_list):
    """Resolve each DOI to its PubMed record.

    Returns a dict mapping PMID -> (doi, title, raw_abstract). Unresolvable
    DOIs are skipped. Carrying the DOI alongside the PMID here fixes the
    original index-misalignment bug: ``doi_list[id_list.index(pmid)]``
    pointed at the wrong DOI whenever an earlier DOI failed to resolve.
    """
    articles = {}
    for doi in doi_list:
        handle = Entrez.esearch(db='pubmed', term=doi)
        record = Entrez.read(handle)
        handle.close()
        if not record['IdList']:
            continue  # DOI not found in PubMed; skip it rather than crash
        pmid = record['IdList'][0]
        handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')
        xml = handle.read()
        handle.close()
        root = ET.fromstring(xml)
        article = root.find('.//PubmedArticle/MedlineCitation/Article')
        title_node = article.find('ArticleTitle') if article is not None else None
        # itertext() keeps text inside inline markup (<i>, <sub>, ...) where
        # .text alone would truncate at the first child element.
        title = ''.join(title_node.itertext()) if title_node is not None else ''
        handle = Entrez.efetch(db='pubmed', id=pmid, rettype='abstract', retmode='text')
        abstract = handle.read().strip()
        handle.close()
        articles[pmid] = (doi, title, abstract)
    return articles


def _clean_abstract(abstract):
    """Collapse whitespace while preserving Medline field tags as <TAG> markers.

    Medline-format lines look like ``      TAG - value``; the tag is swapped
    for a placeholder so whitespace normalization cannot destroy it, then
    restored as ``<TAG>``.
    """
    placeholders = {}
    lines = abstract.split('\n')
    for i, line in enumerate(lines):
        # Guard on '-' presence: the original split() raised ValueError on an
        # indented continuation line without a dash.
        if line.startswith(' ') and '-' in line:
            tag, value = line.split('-', 1)
            placeholder = f'__{tag.strip()}__'
            lines[i] = placeholder + value
            placeholders[placeholder] = tag.strip()
    # Join lines, then collapse all runs of whitespace to single spaces.
    cleaned = ' '.join('\n'.join(lines).split())
    for placeholder, tag in placeholders.items():
        cleaned = cleaned.replace(placeholder, f'<{tag}>')
    return cleaned


def _summarize(abstract, n_sentences=2):
    """Build an extractive summary: top-PageRank sentences joined in order.

    Sentences are nodes; edge weights are pairwise cosine similarity of
    their term-count vectors (TextRank). Abstracts with <= n_sentences
    sentences are returned whole — the original crashed on an empty graph.
    """
    sentences = sent_tokenize(abstract)
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)
    counts = CountVectorizer().fit_transform(sentences)
    # Compute the full similarity matrix once instead of one pair per call.
    sim = cosine_similarity(counts)
    graph = nx.Graph()
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            # Undirected graph: adding each pair once is equivalent to the
            # original's duplicate (i, j)/(j, i) add_edge calls.
            graph.add_edge(i, j, weight=sim[i, j])
    scores = nx.pagerank(graph)
    top_indices = sorted(sorted(scores, key=scores.get, reverse=True)[:n_sentences])
    return ' '.join(sentences[i] for i in top_indices)


def _add_keywords(results, top_n=5):
    """Attach the top-N TF-IDF keywords to each result dict in place.

    The vectorizer is fit ONCE on the whole corpus. The original fit a new
    TfidfVectorizer on each single abstract, which makes every IDF identical
    (document frequency is always 1/1) and then multiplied the tf-idf weight
    by ``idf_`` a second time — the ranking degenerated to raw term counts.
    """
    if not results:
        return
    vectorizer = TfidfVectorizer(stop_words='english')
    matrix = vectorizer.fit_transform([r['abstract'] for r in results])
    feature_names = vectorizer.get_feature_names_out()
    for row, result in zip(matrix, results):
        weights = row.toarray().ravel()
        ranked = np.argsort(weights)[::-1][:top_n]
        result['keywords'] = [feature_names[i] for i in ranked if weights[i] > 0]


def abstract_analysis(doi_list):
    """Look up PubMed records for a list of DOIs and summarize their abstracts.

    For each DOI: resolve a PMID via Entrez esearch, fetch the title and
    abstract, produce a two-sentence extractive summary, and extract the top
    five TF-IDF keywords (IDF computed over the whole fetched corpus).

    Parameters
    ----------
    doi_list : list[str]
        DOIs to look up. DOIs that cannot be resolved are skipped.

    Returns
    -------
    list[dict]
        One dict per resolved DOI with keys 'doi', 'title', 'abstract',
        'summary', 'keywords', and 'time' (shared analysis timestamp).
    """
    Entrez.email = 'rwilfong@purdue.edu'
    results = []
    for pmid, (doi, title, raw_abstract) in _fetch_articles(doi_list).items():
        cleaned = _clean_abstract(raw_abstract)
        results.append({
            'doi': doi,
            'title': title,
            'abstract': cleaned,
            'summary': _summarize(cleaned),
        })
    _add_keywords(results)
    # One timestamp for the whole run, matching the original behavior.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for entry in results:
        entry['time'] = timestamp
    print(results)
    return results
if __name__ == '__main__':
    # Example run: two Journal of Structural Biology DOIs.
    example_dois = [
        '10.1016/j.jsb.2006.07.014',
        '10.1016/j.jsb.2017.07.007',
    ]
    abstract_analysis(example_dois)