Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
doi_summarizer/summary_example.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
116 lines (100 sloc)
4.57 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from Bio import Entrez
import xml.etree.ElementTree as ET
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
"""
This program takes an input of a list of DOIs. It then uses Entrez to find the PMIDs based on the DOIs and returns the abstract and title.
Then it summarizes the abstract and returns the top 5 keywords.
"""
def _fetch_articles(doi_list):
    """Resolve each DOI to its PubMed record.

    Returns a dict mapping PMID -> (doi, title, raw_abstract). Unresolvable
    DOIs are skipped. Carrying the DOI alongside the PMID here fixes the
    original index-misalignment bug: ``doi_list[id_list.index(pmid)]``
    pointed at the wrong DOI whenever an earlier DOI failed to resolve.
    """
    articles = {}
    for doi in doi_list:
        handle = Entrez.esearch(db='pubmed', term=doi)
        record = Entrez.read(handle)
        handle.close()
        if not record['IdList']:
            continue  # DOI not found in PubMed; skip it rather than crash
        pmid = record['IdList'][0]
        handle = Entrez.efetch(db='pubmed', id=pmid, rettype='xml', retmode='text')
        xml = handle.read()
        handle.close()
        root = ET.fromstring(xml)
        article = root.find('.//PubmedArticle/MedlineCitation/Article')
        title_node = article.find('ArticleTitle') if article is not None else None
        # itertext() keeps text inside inline markup (<i>, <sub>, ...) where
        # .text alone would truncate at the first child element.
        title = ''.join(title_node.itertext()) if title_node is not None else ''
        handle = Entrez.efetch(db='pubmed', id=pmid, rettype='abstract', retmode='text')
        abstract = handle.read().strip()
        handle.close()
        articles[pmid] = (doi, title, abstract)
    return articles


def _clean_abstract(abstract):
    """Collapse whitespace while preserving Medline field tags as <TAG> markers.

    Medline-format lines look like ``      TAG - value``; the tag is swapped
    for a placeholder so whitespace normalization cannot destroy it, then
    restored as ``<TAG>``.
    """
    placeholders = {}
    lines = abstract.split('\n')
    for i, line in enumerate(lines):
        # Guard on '-' presence: the original split() raised ValueError on an
        # indented continuation line without a dash.
        if line.startswith(' ') and '-' in line:
            tag, value = line.split('-', 1)
            placeholder = f'__{tag.strip()}__'
            lines[i] = placeholder + value
            placeholders[placeholder] = tag.strip()
    # Join lines, then collapse all runs of whitespace to single spaces.
    cleaned = ' '.join('\n'.join(lines).split())
    for placeholder, tag in placeholders.items():
        cleaned = cleaned.replace(placeholder, f'<{tag}>')
    return cleaned


def _summarize(abstract, n_sentences=2):
    """Build an extractive summary: top-PageRank sentences joined in order.

    Sentences are nodes; edge weights are pairwise cosine similarity of
    their term-count vectors (TextRank). Abstracts with <= n_sentences
    sentences are returned whole — the original crashed on an empty graph.
    """
    sentences = sent_tokenize(abstract)
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)
    counts = CountVectorizer().fit_transform(sentences)
    # Compute the full similarity matrix once instead of one pair per call.
    sim = cosine_similarity(counts)
    graph = nx.Graph()
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            # Undirected graph: adding each pair once is equivalent to the
            # original's duplicate (i, j)/(j, i) add_edge calls.
            graph.add_edge(i, j, weight=sim[i, j])
    scores = nx.pagerank(graph)
    top_indices = sorted(sorted(scores, key=scores.get, reverse=True)[:n_sentences])
    return ' '.join(sentences[i] for i in top_indices)


def _add_keywords(results, top_n=5):
    """Attach the top-N TF-IDF keywords to each result dict in place.

    The vectorizer is fit ONCE on the whole corpus. The original fit a new
    TfidfVectorizer on each single abstract, which makes every IDF identical
    (document frequency is always 1/1) and then multiplied the tf-idf weight
    by ``idf_`` a second time — the ranking degenerated to raw term counts.
    """
    if not results:
        return
    vectorizer = TfidfVectorizer(stop_words='english')
    matrix = vectorizer.fit_transform([r['abstract'] for r in results])
    feature_names = vectorizer.get_feature_names_out()
    for row, result in zip(matrix, results):
        weights = row.toarray().ravel()
        ranked = np.argsort(weights)[::-1][:top_n]
        result['keywords'] = [feature_names[i] for i in ranked if weights[i] > 0]


def abstract_analysis(doi_list):
    """Look up PubMed records for a list of DOIs and summarize their abstracts.

    For each DOI: resolve a PMID via Entrez esearch, fetch the title and
    abstract, produce a two-sentence extractive summary, and extract the top
    five TF-IDF keywords (IDF computed over the whole fetched corpus).

    Parameters
    ----------
    doi_list : list[str]
        DOIs to look up. DOIs that cannot be resolved are skipped.

    Returns
    -------
    list[dict]
        One dict per resolved DOI with keys 'doi', 'title', 'abstract',
        'summary', 'keywords', and 'time' (shared analysis timestamp).
    """
    Entrez.email = 'rwilfong@purdue.edu'
    results = []
    for pmid, (doi, title, raw_abstract) in _fetch_articles(doi_list).items():
        cleaned = _clean_abstract(raw_abstract)
        results.append({
            'doi': doi,
            'title': title,
            'abstract': cleaned,
            'summary': _summarize(cleaned),
        })
    _add_keywords(results)
    # One timestamp for the whole run, matching the original behavior.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for entry in results:
        entry['time'] = timestamp
    print(results)
    return results
if __name__ == '__main__':
    # Example run: two Journal of Structural Biology DOIs.
    example_dois = [
        '10.1016/j.jsb.2006.07.014',
        '10.1016/j.jsb.2017.07.007',
    ]
    abstract_analysis(example_dois)