datalad-handbook/tools/rtd_analytics

157 lines
4.7 KiB
Python
Executable file

#!/usr/bin/env python
"""Query readthedocs analytics for a project (e.g. the DataLad Handbook)

Usage: rtd_analytics <project-name>

Needs a readthedocs account with adequate permissions. Set user/password
via the environment variables RTD_USER and RTD_PASSWORD.

Aggregated stats are printed to STDOUT.
"""
import csv
import io
from os import environ
import sys
import mechanize
class ReadTheDocs:
    """Logged-in readthedocs.org browser session for analytics downloads.

    Constructing an instance opens https://readthedocs.org with a
    ``mechanize`` browser and submits the login form.  The CSV exports of
    the search and traffic analytics dashboards are then available as text
    through the ``search_analytics`` and ``traffic_analytics`` properties.
    Each export is downloaded on first access and cached on the instance.

    Parameters
    ----------
    project
      Project name as used in the readthedocs dashboard URLs.
    username, password
      Credentials entered into the ``login`` and ``password`` fields of the
      login form.
    """

    # dashboard pages holding the CSV download forms
    traffic_url = \
        "https://readthedocs.org/dashboard/{project}/traffic-analytics/"
    search_url = \
        "https://readthedocs.org/dashboard/{project}/search-analytics/"

    def __init__(self, project, username, password):
        self._project = project
        # caches for the downloaded CSV text; None means "not fetched yet"
        self._sadata = None
        self._tadata = None

        browser = self._make_browser()
        self._login(browser, username, password)
        # keep browser running
        self.br = browser

    @staticmethod
    def _make_browser():
        """Return a mechanize browser configured for readthedocs.org."""
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.set_handle_redirect(mechanize.HTTPRedirectHandler)
        user_agent = (
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
            'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )
        browser.addheaders = [('User-agent', user_agent)]
        return browser

    @staticmethod
    def _login(browser, username, password):
        """Open the site with `browser` and submit the login form."""
        browser.open("https://readthedocs.org")
        # NOTE(review): sets a private mechanize attribute, presumably so
        # the response is treated as HTML -- confirm this is still needed
        browser._factory.is_html = True
        landing_url = browser.geturl()
        browser.open(landing_url + "/accounts/login/")
        # the login form is taken to be the first form on the page
        browser.select_form(nr=0)
        browser.form['login'] = username
        browser.form['password'] = password
        browser.submit()

    def get_analytics_data(self, url):
        """Return the CSV export behind the dashboard page `url` as text.

        The download is triggered by a form button, so the first form on
        the page is selected and submitted; the response body is decoded
        as UTF-8.
        """
        session = self.br
        session.open(url)
        session.select_form(nr=0)
        response = session.submit()
        raw = response.get_data()
        return raw.decode('utf-8')

    @property
    def search_analytics(self):
        """Search analytics CSV text (downloaded once, then cached)."""
        if self._sadata is not None:
            return self._sadata
        page = ReadTheDocs.search_url.format(project=self._project)
        self._sadata = self.get_analytics_data(page)
        return self._sadata

    @property
    def traffic_analytics(self):
        """Traffic analytics CSV text (downloaded once, then cached)."""
        if self._tadata is not None:
            return self._tadata
        page = ReadTheDocs.traffic_url.format(project=self._project)
        self._tadata = self.get_analytics_data(page)
        return self._tadata
def get_aggregate_searches(rtd):
    """Return a dict mapping each query to ``(events, max results)``

    `rtd.search_analytics` is parsed as CSV text with a header row that
    provides the columns ``Query`` and ``Total Results``.  For every
    distinct query the number of rows it appears in is counted, and the
    largest ``Total Results`` value seen for it is kept (never below 0).
    """
    aggregated = {}
    reader = csv.DictReader(io.StringIO(rtd.search_analytics))
    for row in reader:
        term = row['Query']
        n_results = int(row['Total Results'])
        if term in aggregated:
            count, best = aggregated[term]
        else:
            count, best = 0, 0
        aggregated[term] = (count + 1, max(best, n_results))
    return aggregated
def get_aggregate_traffic(rtd):
    """Return a dict mapping ``(version, path)`` to the summed views

    `rtd.traffic_analytics` is parsed as CSV text with a header row that
    provides the columns ``Version``, ``Path`` and ``Views``.  Version
    labels that parse as an integer are rewritten to ``PR-<number>``.
    """
    totals = {}
    for row in csv.DictReader(io.StringIO(rtd.traffic_analytics)):
        version = row['Version']
        try:
            # integer versions refer to PR-builds
            version = f'PR-{int(version)}'
        except ValueError:
            # any other label is kept verbatim
            pass
        key = (version, row["Path"])
        totals[key] = totals.get(key, 0) + int(row['Views'])
    return totals
def main(project, user=None, password=None):
    """Print an analytics summary for `project` to STDOUT

    Logs into readthedocs, aggregates the search and traffic analytics and
    prints four sections: most viewed paths, views per version, most
    frequent search terms, and repeated searches that never had a result.

    Parameters
    ----------
    project
      Project name passed on to ``ReadTheDocs``.
    user, password
      Fallback credentials.  They are only used when the environment
      variables RTD_USER / RTD_PASSWORD are not set.
    """
    login = environ.get('RTD_USER', user)
    secret = environ.get('RTD_PASSWORD', password)
    rtd = ReadTheDocs(project, login, secret)

    search_stats = get_aggregate_searches(rtd)
    traffic_stats = get_aggregate_traffic(rtd)

    print(f'#### RTD Analytics: {project} (last 30 days)')

    # collapse the (version, path) view counts along each dimension
    views_by_path = {}
    views_by_version = {}
    for (version, path), count in traffic_stats.items():
        views_by_path[path] = views_by_path.get(path, 0) + count
        views_by_version[version] = views_by_version.get(version, 0) + count
    total_views = sum(views_by_version.values())

    def by_value(item):
        # sort key: the value of a (key, value) pair
        return item[1]

    print(f'##### 15 most popular sections ({total_views} total page views)')
    # these two pages are left out of the section ranking
    excluded = ('/search.html', '/r.html')
    sections = [
        item for item in views_by_path.items() if item[0] not in excluded
    ]
    sections.sort(key=by_value, reverse=True)
    print(', '.join(
        f'{path}({count})'.replace('.html', '').strip('/')
        for path, count in sections[:15]
    ))

    print('##### Version popularity')
    # NOTE(review): ordered by version label (descending), not by view
    # count -- confirm that this is the intended ordering
    version_ranking = sorted(views_by_version.items(), reverse=True)
    print(', '.join(
        f'{version}({count})' for version, count in version_ranking
    ))

    print(
        f'##### 10 Most popular search terms '
        f'({len(search_stats)} unique queries)'
    )
    # values are (events, max results) tuples: order by events, then results
    query_ranking = sorted(search_stats.items(), key=by_value, reverse=True)
    print(', '.join(
        f'{term}({stat[0]})' for term, stat in query_ranking[:10]
    ))

    print('##### Repeated searches with no hits')
    fruitless = [
        (term, stat) for term, stat in search_stats.items()
        if stat[0] > 1 and stat[1] == 0
    ]
    fruitless.sort(key=by_value, reverse=True)
    print(', '.join(f'{term}({stat[0]})' for term, stat in fruitless))
if __name__ == '__main__':
    # The project name is a required positional argument. Exit with the
    # usage text (on STDERR, non-zero status) instead of an IndexError
    # traceback when it is missing. Extra arguments are ignored as before.
    if len(sys.argv) < 2:
        sys.exit(__doc__ or 'Usage: rtd_analytics <project-name>')
    main(sys.argv[1])