Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
scholarlysearches/scholar
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
234 lines (189 sloc)
6.97 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import csv | |
import argparse | |
import pprint | |
from scholarly import scholarly | |
from collections import defaultdict | |
#from scholarly import ProxyGenerator | |
############### | |
# TODO | |
############### | |
# Add caching with a sqlite3 database | |
# Look for unique identifiers for authors | |
# Take name data from stdin | |
# Can be fired off from a shell like this (the grep drops the repeated CSV header rows):
# while read -r line; do ./scholar -p "$line"; done < names |grep -v "^\"Year\",\"Author\",\"Title\",\"Journal\",\"Citations\"" |tee output
# Set up a ProxyGenerator object to use free proxies | |
# This needs to be done only once per session | |
#pg = ProxyGenerator() | |
#pg.FreeProxies() | |
#scholarly.use_proxy(pg) | |
# Get the author indexes. arg is author_name, return is list | |
def get_author_indexes(author_name):
    """Look up an author on Google Scholar and return their citation indexes.

    Args:
        author_name: Name to search for; only the first search hit is used.

    Returns:
        A one-element list. On success the element is
        ``[citedby, citedby5y, hindex, hindex5y, i10index, i10index5y,
        scholar_id, name]``. When no author matches it is
        ``(-1, -1, author_name)``; when the lookup raised it is
        ``(-2, -2, author_name)``.
    """
    try:
        # Only the first hit of the name search is considered.
        author = next(scholarly.search_author(author_name), None)
        if author is None:
            return [(-1, -1, author_name)]  # No author found
        # fill() talks to Google Scholar as well, so it stays inside the guard.
        author = scholarly.fill(author)
    except Exception as exc:
        # Diagnostics go to stderr so they never end up inside the CSV stream
        # that the caller writes to stdout.
        print(
            "Error: Connection failed. Try again another time. "
            f"You might be blocked. ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return [(-2, -2, author_name)]

    return [[
        author.get('citedby', 0),
        author.get('citedby5y', 0),
        author.get('hindex', 0),
        author.get('hindex5y', 0),
        author.get('i10index', 0),
        author.get('i10index5y', 0),
        author.get('scholar_id', 0),
        author.get('name', 'No name available'),
    ]]
# Get the publications per year. arg is author_name, return is list | |
def get_publications_per_year(author_name):
    """Count an author's publications per publication year.

    Args:
        author_name: Name to search for; only the first search hit is used.

    Returns:
        A list of ``[year, count, name]`` rows ordered by year. Publications
        without a year are counted under ``'no-year-available'``. When no
        author matches the result is ``[(-1, -1, author_name)]``; when the
        lookup raised it is ``[(-2, -2, author_name)]``.
    """
    try:
        author = next(scholarly.search_author(author_name), None)
        if author is None:
            return [(-1, -1, author_name)]  # No author found
        # fill() talks to Google Scholar as well, so it stays inside the guard.
        author = scholarly.fill(author)
    except Exception as exc:
        # Diagnostics go to stderr so they never end up inside the CSV stream
        # that the caller writes to stdout.
        print(
            "Error: Connection failed. Try again another time. "
            f"You might be blocked. ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return [(-2, -2, author_name)]

    name = author.get('name', 'No name available')

    counts = defaultdict(int)
    for pub in author.get('publications', []):
        year = pub.get('bib', {}).get('pub_year', 'no-year-available')
        counts[year] += 1

    # Sort on str(year): undated entries use a text placeholder, and an int
    # year next to it would make a plain sort raise TypeError.
    ordered = sorted(counts.items(), key=lambda item: str(item[0]))
    return [[year, count, name] for year, count in ordered]
# Get the citations per year. arg is author_name, return is list | |
def get_citations_per_year(author_name, byid=False):
    """Return how many citations an author received in each year.

    Args:
        author_name: Author name to search for, or a Google Scholar author id
            when ``byid`` is true.
        byid: Treat ``author_name`` as a Scholar author id and look it up
            directly instead of running a name search. Defaults to ``False``.

    Returns:
        A list of ``[year, citations, name]`` rows in ascending year order.
        When no author matches the result is ``[(-1, -1, author_name)]``; when
        the lookup raised it is ``[(-2, -2, author_name)]``.
    """
    try:
        if byid:
            # search_author_id() returns the author record itself rather than
            # an iterator of hits, so it must not be passed through next().
            author = scholarly.search_author_id(author_name)
        else:
            author = next(scholarly.search_author(author_name), None)
        if not author:
            return [(-1, -1, author_name)]  # No author found
        author = scholarly.fill(author)
    except Exception as exc:
        # Diagnostics go to stderr so they never end up inside the CSV stream
        # that the caller writes to stdout.
        print(
            "Error: Connection failed. Try again another time. "
            f"You might be blocked. ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return [(-2, -2, author_name)]

    name = author.get('name', 'No name available')
    yearly = author.get('cites_per_year', {})
    return [[year, count, name] for year, count in sorted(yearly.items())]
# Get the publications. arg is author_name, return is list | |
def get_author_publications(author_name, byid=False):
    """List an author's publications with year, venue and citation count.

    Args:
        author_name: Author name to search for, or a Google Scholar author id
            when ``byid`` is true.
        byid: Treat ``author_name`` as a Scholar author id and look it up
            directly instead of running a name search. Defaults to ``False``.

    Returns:
        A list of ``(year, name, title, journal, num_citations)`` tuples, one
        per publication. When no author matches the result is
        ``[(-1, author_name, -1, -1, -1)]``; when the lookup raised it is
        ``[(-2, author_name, -2, -2, -2)]``.
    """
    try:
        if byid:
            # search_author_id() returns the author record itself rather than
            # an iterator of hits, so it must not be passed through next().
            author = scholarly.search_author_id(author_name)
        else:
            author = next(scholarly.search_author(author_name), None)
        if not author:
            # Padded to five fields so the sentinel row lines up with the
            # five-field publication rows below.
            return [(-1, author_name, -1, -1, -1)]  # No author found
        author = scholarly.fill(author)
    except Exception as exc:
        # Diagnostics go to stderr so they never end up inside the CSV stream
        # that the caller writes to stdout.
        print(
            "Error: Connection failed. Try again another time. "
            f"You might be blocked. ({type(exc).__name__}: {exc})",
            file=sys.stderr,
        )
        return [(-2, author_name, -2, -2, -2)]

    name = author.get('name', 'No name available')
    publications = []
    for pub in author.get('publications', []):
        bib = pub.get('bib', {})
        publications.append((
            bib.get('pub_year', 'No year available'),
            name,
            bib.get('title', 'No title available'),
            bib.get('citation', 'No journal available'),
            pub.get('num_citations', 0),
        ))
    return publications
# handle args | |
def processargs():
    """Parse ``sys.argv`` and return the resulting argparse namespace.

    The namespace carries the boolean switches ``citation``, ``indexes``,
    ``noheaders``, ``publications`` and ``year`` plus the positional
    ``author_name``. If the script is started without any arguments, or
    without an author name, the help text is printed and the process exits
    with status 1.
    """
    cli = argparse.ArgumentParser()

    # (short flag, long flag, help text) for every on/off switch.
    switches = (
        ('-c', '--citation', 'Get citations per year'),
        ('-i', '--indexes', 'Get author indexes'),
        ('-n', '--noheaders', 'Do not print headers'),
        ('-p', '--publications', 'Get publications'),
        ('-y', '--year', 'Get publications per year'),
    )
    for short_flag, long_flag, description in switches:
        cli.add_argument(short_flag, long_flag, action='store_true', help=description)
    cli.add_argument('author_name', nargs='?')

    def usage_and_exit():
        # Show the help text and stop with a failing exit status.
        cli.print_help()
        sys.exit(1)

    # Nothing at all on the command line: explain usage instead of parsing.
    if len(sys.argv) == 1:
        usage_and_exit()

    parsed = cli.parse_args()

    # Switches were given but there is nobody to look up.
    if parsed.author_name is None:
        usage_and_exit()

    return parsed
def main():
    """Run the reports selected on the command line and print them as CSV.

    Every enabled report is written to stdout with all fields quoted, in the
    fixed order citations, indexes, publications, publications per year. Each
    report is preceded by its header row unless ``--noheaders`` was given.
    """
    args = processargs()
    csvout = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)

    # (enabled?, function producing the rows, header row) in output order.
    reports = (
        (args.citation, get_citations_per_year,
         ['Year', 'Num Citations', 'Author']),
        # Index rows carry the Scholar id right before the author name, so the
        # header needs a column for it to stay aligned with the data.
        (args.indexes, get_author_indexes,
         ['Citedby', 'Citedby5y', 'Hindex', 'Hindex5y', 'I10index',
          'I10index5y', 'Scholar ID', 'Author']),
        (args.publications, get_author_publications,
         ['Year', 'Author', 'Title', 'Journal', 'Citations']),
        (args.year, get_publications_per_year,
         ['Year', 'Num Publications', 'Author']),
    )

    for enabled, fetch_rows, header in reports:
        if not enabled:
            continue
        # Fetch before printing the header so any lookup diagnostics are
        # emitted ahead of the report they belong to.
        rows = fetch_rows(args.author_name)
        if not args.noheaders:
            csvout.writerow(header)
        csvout.writerows(rows)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()