# scholar

#!/usr/bin/env python3

import sys
import csv
import argparse
import pprint
from scholarly import scholarly
from collections import defaultdict
#from scholarly import ProxyGenerator

###############
# TODO
###############
# Add caching with a sqlite3 database
# Look for unique identifiers for authors
# Take name data from stdin


# Can be fired off from a shell like:
# while read -r line; do ./scholar -p "$line"; done < names |grep -v "^\"Year\",\"Author\",\"Title\",\"Journal\",\"Citations\"" |tee output

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
#pg = ProxyGenerator()
#pg.FreeProxies()
#scholarly.use_proxy(pg)

# Get the author indexes. arg is author_name, return is list
def get_author_indexes(author_name):
	"""Look up an author by name and return their citation indexes.

	Only the first hit of scholarly.search_author() is used.

	Args:
		author_name: Name passed to scholarly.search_author().

	Returns:
		A one-element list holding an 8-column row:
		[citedby, citedby5y, hindex, hindex5y, i10index, i10index5y,
		scholar_id, name]. Metrics missing from the record default to 0.
		If no author matches, the seven leading columns are -1; if the
		lookup raises, they are -2. In both failure rows the last column
		is the requested author_name.
	"""
	try:
		author = next(scholarly.search_author(author_name), None)
		if author is not None:
			# Guard fill() too, so a failure there yields the failure row
			# instead of a traceback.
			author = scholarly.fill(author)
	except Exception:
		# Diagnostics go to stderr so stdout stays valid CSV.
		print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
		return [(-2,) * 7 + (author_name,)]

	if author is None:
		return [(-1,) * 7 + (author_name,)]

	# Column order of the data row; the author's name is appended last.
	metric_keys = (
		'citedby', 'citedby5y',
		'hindex', 'hindex5y',
		'i10index', 'i10index5y',
		'scholar_id',
	)
	row = [author.get(key, 0) for key in metric_keys]
	row.append(author.get('name', 'No name available'))
	return [row]

# Get the publications per year. arg is author_name, return is list
def get_publications_per_year(author_name):
	"""Count an author's publications per year.

	Only the first hit of scholarly.search_author() is used.

	Args:
		author_name: Name passed to scholarly.search_author().

	Returns:
		A list of [year, count, name] rows ordered by year. Publications
		whose bib has no 'pub_year' are counted under 'no-year-available'.
		If no author matches, the list is [(-1, -1, author_name)]; if the
		lookup raises, it is [(-2, -2, author_name)].
	"""
	try:
		author = next(scholarly.search_author(author_name), None)
		if author is not None:
			# Guard fill() too, so a failure there yields the failure row
			# instead of a traceback.
			author = scholarly.fill(author)
	except Exception:
		# Diagnostics go to stderr so stdout stays valid CSV.
		print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
		return [(-2, -2, author_name)]

	if author is None:
		return [(-1, -1, author_name)]

	name = author.get('name', 'No name available')

	counts = defaultdict(int)
	for pub in author.get('publications', []):
		year = pub.get('bib', {}).get('pub_year', 'no-year-available')
		counts[year] += 1

	# Compare years as text so the 'no-year-available' placeholder can be
	# ordered against real years even if those arrive as ints.
	return [[year, counts[year], name] for year in sorted(counts, key=str)]

# Get the citations per year. arg is author_name, return is list
def get_citations_per_year(author_name, byid=False):
	"""Return an author's citation counts per year.

	Args:
		author_name: Name passed to scholarly.search_author(), or the id
			passed to scholarly.search_author_id() when byid is true.
		byid: When true, look the author up by id instead of by name.
			Defaults to False.

	Returns:
		A list of [year, citations, name] rows ordered by year, taken from
		the record's 'cites_per_year' mapping (empty list if the mapping is
		absent). If no author matches, the list is [(-1, -1, author_name)];
		if the lookup raises, it is [(-2, -2, author_name)].
	"""
	try:
		if byid:
			# NOTE(review): search_author_id() is expected to hand back the
			# author record itself rather than an iterator - confirm against
			# the scholarly documentation.
			author = scholarly.search_author_id(author_name)
		else:
			# Only the first search hit is used.
			author = next(scholarly.search_author(author_name), None)
		if author is not None:
			author = scholarly.fill(author)
	except Exception:
		# Diagnostics go to stderr so stdout stays valid CSV.
		print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
		return [(-2, -2, author_name)]

	if author is None:
		return [(-1, -1, author_name)]

	name = author.get('name', 'No name available')
	per_year = author.get('cites_per_year', {})
	return [[year, per_year[year], name] for year in sorted(per_year)]

# Get the publications. arg is author_name, return is list
def get_author_publications(author_name, byid=False):
	"""Return one row per publication of an author.

	Args:
		author_name: Name passed to scholarly.search_author(), or the id
			passed to scholarly.search_author_id() when byid is true.
		byid: When true, look the author up by id instead of by name.
			Defaults to False.

	Returns:
		A list of (year, name, title, journal, num_citations) tuples, where
		journal is the publication's bib 'citation' field. If no author
		matches, the list is [(-1, author_name, -1, -1, -1)]; if the lookup
		raises, it is [(-2, author_name, -2, -2, -2)]. Failure rows have the
		same five columns as data rows.
	"""
	try:
		if byid:
			# NOTE(review): search_author_id() is expected to hand back the
			# author record itself rather than an iterator - confirm against
			# the scholarly documentation.
			author = scholarly.search_author_id(author_name)
		else:
			# Only the first search hit is used.
			author = next(scholarly.search_author(author_name), None)
		if author is not None:
			author = scholarly.fill(author)
	except Exception:
		# Diagnostics go to stderr so stdout stays valid CSV.
		print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
		return [(-2, author_name, -2, -2, -2)]

	if author is None:
		return [(-1, author_name, -1, -1, -1)]

	name = author.get('name', 'No name available')
	rows = []
	for pub in author.get('publications', []):
		bib = pub.get('bib', {})
		rows.append((
			bib.get('pub_year', 'No year available'),
			name,
			bib.get('title', 'No title available'),
			bib.get('citation', 'No journal available'),
			pub.get('num_citations', 0),
		))
	return rows

# handle args
def processargs():
	"""Parse sys.argv and return the argparse namespace.

	The namespace carries the boolean flags citation, indexes, noheaders,
	publications and year, plus the positional author_name.

	Prints the help text and exits with status 1 when the script is run
	with no arguments at all, or when no author name was supplied.
	"""
	cli = argparse.ArgumentParser()

	# (short flag, long flag, help text), registered in this order.
	switches = (
		('-c', '--citation', 'Get citations per year'),
		('-i', '--indexes', 'Get author indexes'),
		('-n', '--noheaders', 'Do not print headers'),
		('-p', '--publications', 'Get publications'),
		('-y', '--year', 'Get publications per year'),
	)
	for short_flag, long_flag, help_text in switches:
		cli.add_argument(short_flag, long_flag, action='store_true', help=help_text)
	cli.add_argument('author_name', nargs='?')

	def _usage_and_exit():
		cli.print_help()
		sys.exit(1)

	# Bare invocation: show usage before argparse gets a chance to parse.
	if len(sys.argv) == 1:
		_usage_and_exit()

	parsed = cli.parse_args()

	# Flags were given but the author name is missing.
	if parsed.author_name is None:
		_usage_and_exit()

	return parsed


def main():
	"""Run the reports selected on the command line and write them as CSV.

	Each selected report is fetched for args.author_name and written to
	stdout with every field quoted. A header row precedes each report
	unless --noheaders was given. Reports are emitted in a fixed order:
	citations per year, author indexes, publications, publications per year.
	"""
	args = processargs()

	csvout = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)

	# (selected?, fetcher, header row), in output order.
	reports = (
		(
			args.citation,
			get_citations_per_year,
			['Year', 'Num Citations', 'Author'],
		),
		(
			args.indexes,
			get_author_indexes,
			# Index rows carry the scholar id just before the name, so the
			# header needs a matching 'Scholar ID' column.
			['Citedby', 'Citedby5y', 'Hindex', 'Hindex5y', 'I10index', 'I10index5y', 'Scholar ID', 'Author'],
		),
		(
			args.publications,
			get_author_publications,
			['Year', 'Author', 'Title', 'Journal', 'Citations'],
		),
		(
			args.year,
			get_publications_per_year,
			['Year', 'Num Publications', 'Author'],
		),
	)

	for selected, fetch, header in reports:
		if not selected:
			continue
		rows = fetch(args.author_name)
		if not args.noheaders:
			csvout.writerow(header)
		csvout.writerows(rows)

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
	main()
	#!/usr/bin/env python3

	import sys
	import csv
	import argparse
	import pprint
	from scholarly import scholarly
	from collections import defaultdict
	#from scholarly import ProxyGenerator

	###############
	# TODO
	###############
	# Add caching with a sqlite3 database
	# Look for unique identifiers for authors
	# Take name data from stdin


	# Can be fired off from a shell like:
	# while read -r line; do ./scholar -p "$line"; done < names |grep -v "^\"Year\",\"Author\",\"Title\",\"Journal\",\"Citations\"" |tee output

	# Set up a ProxyGenerator object to use free proxies
	# This needs to be done only once per session
	#pg = ProxyGenerator()
	#pg.FreeProxies()
	#scholarly.use_proxy(pg)

	# Get the author indexes. arg is author_name, return is list
	def get_author_indexes(author_name):
		"""Look up an author by name and return their citation indexes.

		Only the first hit of scholarly.search_author() is used.

		Args:
			author_name: Name passed to scholarly.search_author().

		Returns:
			A one-element list holding an 8-column row:
			[citedby, citedby5y, hindex, hindex5y, i10index, i10index5y,
			scholar_id, name]. Metrics missing from the record default to 0.
			If no author matches, the seven leading columns are -1; if the
			lookup raises, they are -2. In both failure rows the last column
			is the requested author_name.
		"""
		try:
			author = next(scholarly.search_author(author_name), None)
			if author is not None:
				# Guard fill() too, so a failure there yields the failure row
				# instead of a traceback.
				author = scholarly.fill(author)
		except Exception:
			# Diagnostics go to stderr so stdout stays valid CSV.
			print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
			return [(-2,) * 7 + (author_name,)]

		if author is None:
			return [(-1,) * 7 + (author_name,)]

		# Column order of the data row; the author's name is appended last.
		metric_keys = (
			'citedby', 'citedby5y',
			'hindex', 'hindex5y',
			'i10index', 'i10index5y',
			'scholar_id',
		)
		row = [author.get(key, 0) for key in metric_keys]
		row.append(author.get('name', 'No name available'))
		return [row]

	# Get the publications per year. arg is author_name, return is list
	def get_publications_per_year(author_name):
		"""Count an author's publications per year.

		Only the first hit of scholarly.search_author() is used.

		Args:
			author_name: Name passed to scholarly.search_author().

		Returns:
			A list of [year, count, name] rows ordered by year. Publications
			whose bib has no 'pub_year' are counted under 'no-year-available'.
			If no author matches, the list is [(-1, -1, author_name)]; if the
			lookup raises, it is [(-2, -2, author_name)].
		"""
		try:
			author = next(scholarly.search_author(author_name), None)
			if author is not None:
				# Guard fill() too, so a failure there yields the failure row
				# instead of a traceback.
				author = scholarly.fill(author)
		except Exception:
			# Diagnostics go to stderr so stdout stays valid CSV.
			print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
			return [(-2, -2, author_name)]

		if author is None:
			return [(-1, -1, author_name)]

		name = author.get('name', 'No name available')

		counts = defaultdict(int)
		for pub in author.get('publications', []):
			year = pub.get('bib', {}).get('pub_year', 'no-year-available')
			counts[year] += 1

		# Compare years as text so the 'no-year-available' placeholder can be
		# ordered against real years even if those arrive as ints.
		return [[year, counts[year], name] for year in sorted(counts, key=str)]

	# Get the citations per year. arg is author_name, return is list
	def get_citations_per_year(author_name, byid=False):
		"""Return an author's citation counts per year.

		Args:
			author_name: Name passed to scholarly.search_author(), or the id
				passed to scholarly.search_author_id() when byid is true.
			byid: When true, look the author up by id instead of by name.
				Defaults to False.

		Returns:
			A list of [year, citations, name] rows ordered by year, taken from
			the record's 'cites_per_year' mapping (empty list if the mapping is
			absent). If no author matches, the list is [(-1, -1, author_name)];
			if the lookup raises, it is [(-2, -2, author_name)].
		"""
		try:
			if byid:
				# NOTE(review): search_author_id() is expected to hand back the
				# author record itself rather than an iterator - confirm against
				# the scholarly documentation.
				author = scholarly.search_author_id(author_name)
			else:
				# Only the first search hit is used.
				author = next(scholarly.search_author(author_name), None)
			if author is not None:
				author = scholarly.fill(author)
		except Exception:
			# Diagnostics go to stderr so stdout stays valid CSV.
			print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
			return [(-2, -2, author_name)]

		if author is None:
			return [(-1, -1, author_name)]

		name = author.get('name', 'No name available')
		per_year = author.get('cites_per_year', {})
		return [[year, per_year[year], name] for year in sorted(per_year)]

	# Get the publications. arg is author_name, return is list
	def get_author_publications(author_name, byid=False):
		"""Return one row per publication of an author.

		Args:
			author_name: Name passed to scholarly.search_author(), or the id
				passed to scholarly.search_author_id() when byid is true.
			byid: When true, look the author up by id instead of by name.
				Defaults to False.

		Returns:
			A list of (year, name, title, journal, num_citations) tuples, where
			journal is the publication's bib 'citation' field. If no author
			matches, the list is [(-1, author_name, -1, -1, -1)]; if the lookup
			raises, it is [(-2, author_name, -2, -2, -2)]. Failure rows have the
			same five columns as data rows.
		"""
		try:
			if byid:
				# NOTE(review): search_author_id() is expected to hand back the
				# author record itself rather than an iterator - confirm against
				# the scholarly documentation.
				author = scholarly.search_author_id(author_name)
			else:
				# Only the first search hit is used.
				author = next(scholarly.search_author(author_name), None)
			if author is not None:
				author = scholarly.fill(author)
		except Exception:
			# Diagnostics go to stderr so stdout stays valid CSV.
			print("Error: Connection failed. Try again another time. You might be blocked.", file=sys.stderr)
			return [(-2, author_name, -2, -2, -2)]

		if author is None:
			return [(-1, author_name, -1, -1, -1)]

		name = author.get('name', 'No name available')
		rows = []
		for pub in author.get('publications', []):
			bib = pub.get('bib', {})
			rows.append((
				bib.get('pub_year', 'No year available'),
				name,
				bib.get('title', 'No title available'),
				bib.get('citation', 'No journal available'),
				pub.get('num_citations', 0),
			))
		return rows

	# handle args
	def processargs():
		"""Parse sys.argv and return the argparse namespace.

		The namespace carries the boolean flags citation, indexes, noheaders,
		publications and year, plus the positional author_name.

		Prints the help text and exits with status 1 when the script is run
		with no arguments at all, or when no author name was supplied.
		"""
		cli = argparse.ArgumentParser()

		# (short flag, long flag, help text), registered in this order.
		switches = (
			('-c', '--citation', 'Get citations per year'),
			('-i', '--indexes', 'Get author indexes'),
			('-n', '--noheaders', 'Do not print headers'),
			('-p', '--publications', 'Get publications'),
			('-y', '--year', 'Get publications per year'),
		)
		for short_flag, long_flag, help_text in switches:
			cli.add_argument(short_flag, long_flag, action='store_true', help=help_text)
		cli.add_argument('author_name', nargs='?')

		def _usage_and_exit():
			cli.print_help()
			sys.exit(1)

		# Bare invocation: show usage before argparse gets a chance to parse.
		if len(sys.argv) == 1:
			_usage_and_exit()

		parsed = cli.parse_args()

		# Flags were given but the author name is missing.
		if parsed.author_name is None:
			_usage_and_exit()

		return parsed


	def main():
		"""Run the reports selected on the command line and write them as CSV.

		Each selected report is fetched for args.author_name and written to
		stdout with every field quoted. A header row precedes each report
		unless --noheaders was given. Reports are emitted in a fixed order:
		citations per year, author indexes, publications, publications per year.
		"""
		args = processargs()

		csvout = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)

		# (selected?, fetcher, header row), in output order.
		reports = (
			(
				args.citation,
				get_citations_per_year,
				['Year', 'Num Citations', 'Author'],
			),
			(
				args.indexes,
				get_author_indexes,
				# Index rows carry the scholar id just before the name, so the
				# header needs a matching 'Scholar ID' column.
				['Citedby', 'Citedby5y', 'Hindex', 'Hindex5y', 'I10index', 'I10index5y', 'Scholar ID', 'Author'],
			),
			(
				args.publications,
				get_author_publications,
				['Year', 'Author', 'Title', 'Journal', 'Citations'],
			),
			(
				args.year,
				get_publications_per_year,
				['Year', 'Num Publications', 'Author'],
			),
		)

		for selected, fetch, header in reports:
			if not selected:
				continue
			rows = fetch(args.author_name)
			if not args.noheaders:
				csvout.writerow(header)
			csvout.writerows(rows)

	# Call the main function
	# Script entry point.
	# NOTE(review): once indented correctly this guard sits inside the
	# `if __name__ == '__main__':` suite that begins earlier in the file, so
	# main() would run a second time - this looks like a duplicated paste;
	# confirm and drop the duplicate.
	if __name__ == '__main__':
		main()