#!/usr/bin/env python
# -*- coding: utf8 -*-
"""
This module provides interfaces to the "unofficial GitHub API",
i.e. data available in the web user interface but not in the official API.
This includes:

- user contribution timeline (all repositories contributed to,
  organizations joined publicly, created repositories, reported issues, etc.).
  There is no official API for this, and public datasets like GHTorrent
  do not report some of these events.
- user contribution stats (the number of contributions per day for a given
  year). You can get the same information from GHTorrent,
  but this method takes only one HTTP request and is thus much faster.
- weekly contributor stats for a project
  (number of Lines Of Code contributed per week by the top 100 contributors
  since the beginning of the project).
  LOC information is not available via the official API, and similar stats
  for commits take multiple API requests.
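
For example, a typical session might look like this (the calls hit the live
GitHub website, so they are skipped by doctest):

>>> scraper = Scraper()  # doctest: +SKIP
>>> daily = scraper.user_daily_contrib_num('user2589', 2018)  # doctest: +SKIP
>>> timeline = list(scraper.full_user_activity_timeline('user2589'))  # doctest: +SKIP
>>> stats = scraper.project_contributor_stats('pandas-dev/pandas')  # doctest: +SKIP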
.. autoclass:: Scraper
:members: full_user_activity_timeline, project_contributor_stats,
user_daily_contrib_num, links_to_recent_user_activity
"""
from __future__ import print_function
import argparse
from collections import defaultdict
import datetime
from functools import wraps
import logging
import re
import threading
import time
import warnings
from xml.etree import ElementTree
from bs4 import BeautifulSoup
import feedparser
import pandas as pd
import requests
import six # Queue
__version__ = '0.1.0'
__author__ = "Marat (@cmu.edu)"
__license__ = "GPL v3"
BASE_URL = 'https://github.com'
HEADERS = { # browser headers for non-API URLs
'X-Requested-With': 'XMLHttpRequest',
'Accept-Encoding': "gzip,deflate,br",
'Accept': "*/*",
'Origin': BASE_URL,
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) "
"Gecko/20100101 Firefox/60.0",
"Host": 'github.com',
"Referer": BASE_URL,
"DNT": "1",
"Accept-Language": 'en-US,en;q=0.5',
"Connection": "keep-alive",
"Cache-Control": 'max-age=0',
}
class GitHubScrapingError(requests.HTTPError):
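    """ Raised when GitHub keeps returning errors or empty responses
    after several retries. """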
pass
def normalize_text(string):
# type: (six.string_types) -> six.string_types
""" Normalize spaces and newlines
    >>> normalize_text("\\nHello world \\t\\n!")
    'Hello world !'
"""
return " ".join(string.split())
def _int(value):
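    """ Convert a scraped counter to an int, handling thousands separators
    and the abbreviated "1.7k"-style values used in detailed activity lists.

    >>> _int("1,234")
    1234
    >>> _int("1.7k")
    1700
    """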
if isinstance(value, six.string_types):
value = value.replace(",", "")
if value.endswith('k'):
# in the detailed list, large numbers are reduced
# to something like "1.7k"
value = float(value[:-1]) * 1000
return int(value)
def extract_repo(link):
# type: (six.string_types) -> six.string_types
""" Extract repository slug from a GitHub link
>>> extract_repo("/org/repo/blabla?something=foo")
'org/repo'
>>> extract_repo("org/repo")
'org/repo'
"""
return "/".join(link.strip("/").split("/", 2)[:2])
def _parse_timeline_update_record(record_div):
    # type: (BeautifulSoup) -> dict
"""
Args:
record_div(BeautifulSoup): a BS4 HTML element object,
representing one chunk of GitHub user activity.
Returns:
Dict[str, Dict[str, int]]: {
repository1: {
'commits': ...,
'issues': ...,
'pull_requests': ...,
'reviews': ...,
'created_repository': {0|1},
}
}
"""
# Note: GitHub lists only first 25 repos for each activity
# data[repo][activity] = <number>
record_data = defaultdict(lambda: defaultdict(int))
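    # A record comes in one of three shapes:
    #   - a rolled-up summary with a <button> title,
    #     e.g. "Created N commits in M repositories";
    #   - a single event with an <h4> title,
    #     e.g. "Created an issue in ..." or "Joined GitHub";
    #   - a private activity summary ("N ... in private repositories").
    # Each branch below matches one family of titles.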
# get record title:
if record_div.button:
# created commits, repositories, issues,
# reviewed pull requests
title = normalize_text(record_div.button.text)
if re.match(
r'Reviewed \d[\d,]* pull requests? in \d+ repositor(y|ies)', title):
for repo_div in record_div.find_all(
'div', class_='profile-rollup-summarized'):
repo_div_button = repo_div.button
if not repo_div_button:
# "N repositories not shown"
continue
repo_span, count_span = repo_div_button.find_all('span')
repo = repo_span.text.strip()
count = _int(count_span.text.split()[0])
record_data[repo]['reviews'] += count
elif re.match(r'Opened \d[\d,]* (?:other )?issues? in \d+ repositor(y|ies)',
title):
for repo_div in record_div.find_all(
'div', class_='profile-rollup-summarized'):
repo_div_button = repo_div.button
if not repo_div_button:
# "N repositories not shown"
continue
repo = repo_div_button.div.span.text.strip()
count = 0
count_span = repo_div.button.find_all(
'span', recursive=False)[0]
for span in count_span.find_all('span'):
count += _int(span.text)
record_data[repo]['issues'] += count
elif re.match(r'Created \d[\d,]*\+? (?:other )?repositor(y|ies)', title):
# e.g. Created 100+ repositories
for link in record_div.find_all(
'a', attrs={'data-hovercard-type': "repository"}):
record_data[link.text]['created_repository'] = 1
elif re.match(r'Opened \d[\d,]* (?:other )?pull requests? '
r'in \d+ repositor(y|ies)', title):
for repo_div in record_div.find_all(
'div', class_='profile-rollup-summarized'):
repo_div_button = repo_div.button
if not repo_div_button:
# "N repositories not shown"
continue
repo = repo_div_button.div.span.text.strip()
count = 0
count_span = repo_div.button.find_all('span', recursive=False)[
0]
for span in count_span.find_all('span'):
count += _int(span.text)
record_data[repo]['pull_requests'] += count
elif re.match(r'Created \d[\d,]* commits? in \d+ repositor(y|ies)', title):
for repo_li in record_div.ul.find_all('li', recursive=False):
li_div = repo_li.div
if not li_div:
continue # "N repositories not shown"
repo_link = li_div.find_all('a', recursive=False)[1]
repo = extract_repo(repo_link["href"])
count = _int(repo_link.text.strip().split(" ")[0])
record_data[repo]['commits'] += count
else:
raise ValueError("Unexpected title: %s\n%s"
"" % (title, str(record_div)))
elif record_div.h4:
title = normalize_text(record_div.h4.text)
repo = record_div.h4.a and record_div.h4.a.text
if title.startswith("Created an issue in"):
record_data[repo]['issues'] += 1
elif title.startswith("Joined the"):
record_data[record_div.a['href'].strip('/')]['joined_org'] = 1
elif title.startswith("Created a pull request in"):
            # first PR in a given month
record_data[repo]['pull_requests'] += 1
elif title == "Joined GitHub":
pass
elif title.startswith("Opened their first issue on GitHub in"):
record_data[repo]['issues'] += 1
elif title.startswith("Opened their first pull request on GitHub in"):
record_data[repo]['pull_requests'] += 1
elif title.startswith("Created their first repository"):
links = record_div.find_all(
'a', attrs={'data-hovercard-type': "repository"})
if not links: # private repository
repo = ''
else:
repo = extract_repo(links[0].get('href'))
record_data[repo]['created_repository'] = 1
else:
raise ValueError("Unexpected title: " + title)
elif len(record_div.span) == 3:
# private activity
title = normalize_text(record_div.find_all('span')[1].text)
if title.endswith(' in private repositories'):
record_data[None]['private_contrib'] += _int(title.split(" ", 1)[0])
else:
raise ValueError("Unexpected title: " + title)
else:
raise ValueError("Unexpected activity:" + str(record_div))
# convert defaultdict to dict
return {rep: dict(activities) for rep, activities in record_data.items()}
def _parse_timeline_update(bs4_tree):
    # type: (BeautifulSoup) -> tuple
""" Parse a chunk of activity acquired via Ajax, usually one month.
    Yields:
        Tuple[str, Dict[str, int]]:
            (month, {output of _parse_timeline_update_record()})

    Expected HTML structure:

        <div class="contribution-activity-listing">  # month div
          <div class="profile-timeline discussion-timeline">  # one extra wrapper
            <h3>  # month title
            <div class="profile-rollup-wrapper">  # record divs
            ...

    Terminology:
        - a timeline consists of updates;
        - an update contains one or more months, of which only one is non-empty;
        - a month consists of records, each a single chunk of reported activity;
        - a record might contain information about several repositories,
          e.g. "Created N commits in M repositories".
"""
# sometimes next chunk includes several months.
# In these cases, all except one are empty;
# often empty "months" represent ranges, e.g. April 2018 - December 2018
# to handle such cases, month is lazily evaluated
for month_div in bs4_tree.find_all("div", class_="profile-timeline"):
record_month = None
month_data = {}
for record_div in month_div.find_all("div", class_="profile-rollup-wrapper"):
try:
parsed_record = _parse_timeline_update_record(record_div)
            except Exception:
logging.error("Failed to parse record. Please contact the "
"maintainer and send the following HTML, along "
"with the user profile you're scraping:")
logging.error(record_div.prettify())
raise
if not parsed_record: # ignore empty months
continue
for record_repo, record_activity in parsed_record.items():
if record_repo not in month_data:
month_data[record_repo] = {}
# we might have several activities in the same record repository
# in a given month, e.g. issues, PRs and commits
month_data[record_repo].update(record_activity)
record_month = record_month or pd.to_datetime(
month_div.h3.text.strip()).strftime('%Y-%m')
if month_data:
yield record_month, month_data
def _extract_activity_feed_links(text):
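    """ Parse the HTML body of an Atom feed entry and yield
    (date, href) tuples for commit / issue / ref links found in it.
    The date is taken from the first span with the "f6" class
    whose text parses as a date.
    """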
tree = BeautifulSoup(text, 'html.parser')
date = None
for span in tree.find_all('span'):
        if 'f6' not in span.get('class', []):
continue
try:
date = pd.to_datetime(span.text.strip()).strftime("%Y-%m-%d")
except ValueError:
continue
break
links = []
for link in tree.find_all('a'):
href = link.get('href', '')
chunks = href.split("/")
# hrefs start with "/" so chunks[0] is an empty string
# this is why 'commit/issue/tree' is chunks[3], not [2]
if len(chunks) < 5 or \
chunks[3] not in ('commit', 'issue', 'tree'):
continue
if href not in links:
links.append(href)
yield (date, href)
def guard(func):
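    """ Decorator that serializes calls to the wrapped function with a lock,
    making the rate-limit bookkeeping in Scraper._request thread-safe.
    """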
# TODO: once released in stutils, reuse from there
    semaphore = threading.Lock()

    @wraps(func)
    def wrapper(*args, **kwargs):
        with semaphore:
            return func(*args, **kwargs)
    return wrapper
class Scraper(object):
""" A class to access "unofficial GitHub API"
.. note::
This "unofficial API" is rate limited, just as the official one.
The rate limit is 40 requests in 80 seconds, and some calls take
multiple requests. So, for example, parsing a user activity timeline
typically takes couple minutes.
Use this "API" with caution as it might be extremely slow.
"""
_instance = None # singleton instance
cookies = None # cookies for non-API URLs
# limit is imposed if over 40 requests are made in 80 seconds
# thus, keeping track of issued requests
queue = None
# after many experiments, 40/121 looks to be the fastest option
queue_max_size = 40
queue_time_length = 121
retries_on_timeout = 5
def __new__(cls, *args, **kwargs): # Singleton
if not isinstance(cls._instance, cls):
cls._instance = super(Scraper, cls).__new__(cls, *args, **kwargs)
return cls._instance
def __init__(self):
self.session = requests.Session()
self.queue = six.moves.queue.Queue(maxsize=self.queue_max_size)
@guard
def _request(self, url, params=None, headers=None):
headers = headers or HEADERS
if not url.startswith(BASE_URL):
url = BASE_URL + url
while True:
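            # self.queue holds timestamps of the most recent requests;
            # when it is full, wait until the oldest one falls outside the
            # queue_time_length window before issuing another request.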
if self.queue.full():
sleep_interval = self.queue.get() - time.time() + self.queue_time_length
if sleep_interval > 0:
logging.info("Hibernating for %.2f seconds to maintain "
"GitHub XHR rate limit..", sleep_interval)
time.sleep(sleep_interval)
self.queue.put(time.time())
# handle network errors and GitHub downtimes
# also, internal errors, like joshaber March 2015
r = None
for _ in range(self.retries_on_timeout):
try:
r = self.session.get(url, headers=headers, params=params)
except requests.exceptions.RequestException:
time.sleep(1)
continue
if r.status_code < 500:
break
else:
r = None
if r is None:
raise GitHubScrapingError(
"GitHub is not responding to requests. Try again later.")
if r.status_code == 429:
logging.info("Hit GitHub XHR rate limit, retry in 10 seconds..")
time.sleep(10)
continue
break
r.raise_for_status()
return r
    def project_contributor_stats(self, repo_slug):
# type: (str) -> list
"""Get top 100 contributors weekly commit stats over the project history
Args:
repo_slug (str): <owner_login>/<repo_name>
Returns:
            list: A list of the top 100 contributors in the repo, with their
                logins, total number of commits, and weekly contribution
                counts as numbers of lines added, changed or deleted.
                Note that weeks start on Sunday and are represented by
                a Unix timestamp.
>>> Scraper().project_contributor_stats('pandas-dev/pandas') # doctest: +SKIP
[{u'author': {u'avatar': u'https://avatars0.githubusercontent.com/...',
u'hovercard_url': u'/hovercards?user_id=1435085',
u'id': 1435085,
u'login': u'blbradley',
u'path': u'/blbradley'},
u'total': 8,
u'weeks': [{u'a': 0, u'c': 0, u'd': 0, u'w': 1249171200},
{u'a': 0, u'c': 0, u'd': 0, u'w': 1249776000},
{u'a': 0, u'c': 0, u'd': 0, u'w': 1250380800},
...
}]
"""
for i in range(self.retries_on_timeout):
try:
res = self._request(
"/%s/graphs/contributors-data" % repo_slug).json()
except ValueError:
# sometimes GitHub just returns empty page
# without throwing a timeout
time.sleep(1)
continue
else:
return res
raise GitHubScrapingError(
"GitHub returns empty responses. Try again later.")
    def user_daily_contrib_num(self, user, year):
# type: (str, int) -> dict
""" Get number of daily contributions of a GitHub user in a given year.
        This is the data behind the white-and-green contribution grid
        on the profile page.
Args:
user (str): The GitHub login of the user to get stats for.
year (int): Year of contributions to get
Returns:
dict: A dictionary with keys being %Y-%m-%d formatted dates, and
values being the number of contributions. This method does not
differentiate types of contributions, i.e. it is a sum
of commits, issues, submitted and reviewed pull requests, etc.
        >>> Scraper().user_daily_contrib_num('user2589', 2018)  # doctest: +SKIP
        {'2018-01-01': 0,
         '2018-01-02': 15,
         ...
         '2018-12-31': 0}
"""
url = "/users/%s/contributions?from=%d-12-01&to=%d-12-31&full_graph=1" \
% (user, year, year)
year = str(year)
start_token = '<svg'
stop_token = '/svg>'
response_text = self._request(url).text
# cut out first <svg> element,
# since HTML outside of it is sometimes malformed
response_text = start_token + response_text.split(
start_token, 1)[-1].split(stop_token, 1)[0] + stop_token
tree = ElementTree.fromstring(response_text)
return {rect.attrib['data-date']: _int(rect.attrib.get('data-count'))
for rect in tree.iter('rect')
if rect.attrib.get('class') == 'day'
and rect.attrib.get('data-date', '').startswith(year)}
    def links_to_recent_user_activity(self, user):
""" Get user events as a 2-tuple generator: (date, link).
Events include: commits, issues and refs creation (tags/branches).
        Internally, this method uses the user's Atom activity feed.
        The result includes up to a couple of months of activity;
        sometimes it also misses up to one month of recent events.

        .. note::

            This method is known to return incomplete data.
            Proceed with caution.
Args:
user (str): The GitHub login of the user.
Yields:
Tuple[str, str]: (<%Y-%m-%d date>, link to the activity)
It seems like this feed only includes tags and commits
>>> list(Scraper().links_to_recent_user_activity('user2589')) # doctest: +SKIP
[('2018-12-01', '/user2589/Q/tree/master'),
('2018-12-01',
'/user2589/Q/commit/9184f20f939a70e3930ef762cc83906220433fc8'),
('2018-11-20', '/user2589/TAC_Github/tree/master'),
...]
"""
warnings.warn(
"This method is know to return incomplete data."
"Proceed with caution.", DeprecationWarning)
page = None
while True:
request = self._request('/%s' % user, params={'page': page},
headers={'Accept': 'application/atom+xml'})
page = 1 if page is None else page + 1
activity_log = feedparser.parse(request.text).entries
if not activity_log:
return
for record in activity_log:
for chunk in record['content']:
for date, link in _extract_activity_feed_links(
chunk['value'].encode('utf8')):
yield date, link
    def full_user_activity_timeline(self, user, start=None, to=None):
# type: (str, str, str) -> Generator[Tuple[str, Dict]]
""" Get a list of public user contributions, by month by repository.
        .. note::

            The user timeline sometimes does not include all contributions.
            E.g., this issue is not reflected in the reporter's timeline:
            https://github.com/GoogleCloudPlatform/webapp2/issues/104
Args:
user (str): GitHub login of the user to get activity for.
start (str): date to start with, e.g. '2017-01' or '2017-01-01'.
`datetime` objects should also work.
to (str): upper bound of date ranges to parse, same as `start`.
**Note**: the day is 1 by default, i.e. '2017-01'
will be interpreted as **1st** of January 2017.
Yields:
Dict[str, int]:
A generator of activity dictionaries.
Each dict has fields `month`, a `%Y-%m` formatted month, and
`repo`, a repository slug. Other fields indicate number of
contributions of a given type:
            - `commits`: number of commits.
            - `issues`: number of reported issues.
            - `reviews`: number of reviewed pull requests.
              GitHub counts any pull request the user commented on as
              reviewed, regardless of code comments.
            - `pull_requests`: number of pull requests submitted.
            - `created_repository`: can only be 1.
            - `joined_org`: can only be 1.
              The repository slug in this case is the GitHub org name.
            - `private_contrib`: all contributions in private repositories
              combined, if the user enabled anonymous reporting of private
              activity. The repository slug in this case is None.
The output of this method is suitable for a pd.DataFrame constructor:
>>> pd.DataFrame(
... Scraper().full_user_activity_timeline('user2589'))
commits ... reviews
...
111 NaN ... NaN
112 NaN ... NaN
113 1.0 ... NaN
<BLANKLINE>
[114 rows x 9 columns]
It is even better to index on month+repo and replace NaNs:
>>> pd.DataFrame(
... Scraper().full_user_activity_timeline('user2589')
... ).set_index(['month', 'repo']).fillna(0).astype(int)
commits ... reviews
month repo ...
...
2012-05 user2589/minicms 11 ... 0
2011-09 alsoicode/django-admin-sortable 0 ... 0
2011-08 user2589/django-rosetta 0 ... 0
mbi/django-rosetta 0 ... 0
2005-03 user2589/schooligan 1 ... 0
<BLANKLINE>
[114 rows x 7 columns]
"""
if start:
if not isinstance(start, datetime.datetime):
start = pd.to_datetime(start)
start = start.strftime('%Y-%m')
if to:
if not isinstance(to, datetime.datetime): # str or unicode
to = pd.to_datetime(to)
now = to.strftime('%Y-%m-%d')
else:
now = datetime.datetime.now().strftime('%Y-%m-%d')
url = '/%s?tab=overview&include_header=no&utf8=✓&from=%s&to=%s' % (
user, now[:8] + '01', now)
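        # The overview page is paginated backwards in time: each chunk ends
        # with a form whose data-url points to the next (older) chunk;
        # follow it until there is no more "show more" button.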
while True:
soup = BeautifulSoup(self._request(url).text, 'html.parser')
for month_div in soup.find_all('div', class_='contribution-activity-listing'):
for month, data in _parse_timeline_update(month_div):
if start and month < start:
return
for repo, activity in data.items():
activity['repo'] = repo
activity['month'] = month
yield activity
form = soup.form
if not form:
break
url = form.attrs['data-url']
if not form.button:
break
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Get a user contribution timeline")
parser.add_argument('user', type=str,
help='GitHub login of the user to parse')
parser.add_argument('--from', type=str, nargs='?',
help='Lower end of the date range, default: no limit')
parser.add_argument('--to', type=str, nargs='?',
help='Upper end of the date range, default: now')
parser.add_argument('-o', '--output', default="-",
type=argparse.FileType('w'),
                        help='Output filename, "-" or skip for stdout')
parser.add_argument('-v', '--verbose', action='store_true',
help="Log progress to stderr")
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s %(message)s',
level=logging.INFO if args.verbose else logging.WARNING)
COLUMNS = ('commits', 'issues', 'pull_requests', 'reviews',
'private_contrib', 'created_repository', 'joined_org')
    df = pd.DataFrame(Scraper().full_user_activity_timeline(
        args.user, start=getattr(args, 'from'), to=args.to))
    df = df.set_index(['month', 'repo']).reindex(
        columns=COLUMNS).fillna(0).astype(int)
df.to_csv(args.output)