· 6 years ago · Mar 14, 2020, 10:42 PM
1'''Retrieve comment counts by date for a given tag, and write to csv.'''
2
3import argparse
4import json
5import re
6import urllib
7import urllib.request
8
# Root of the site's WordPress REST API (v2); used to look up tags and posts.
AVO_API_ROOT = 'https://the-avocado.org/wp-json/wp/v2'
# Root of the Disqus API. HTTPS is required here: requests built on this root
# carry the caller's API key in the query string, which must not be sent in
# clear text.
DISQUS_API_ROOT = 'https://disqus.com/api/3.0'
# Extracts the digits following "total": from the raw (bytes) embed page.
MATCH_TOTAL = re.compile(rb'"total":(\d+)')
12
def load_json(url):
    '''Fetch a URL and return its body parsed as JSON.

    Args:
        url: Address to fetch; anything urllib.request.urlopen accepts.

    Returns:
        The decoded JSON document (dict, list, str, number, bool or None).

    Raises:
        urllib.error.URLError: If the request fails (HTTPError for HTTP
            error statuses).
        ValueError: If the body is not valid JSON.
    '''
    # Close the response deterministically rather than leaving the
    # connection open until the object happens to be garbage collected.
    with urllib.request.urlopen(url) as response:
        contents = response.read()
    return json.loads(contents)
17
18
def find_tag_id(tag_name):
    '''Return the numeric id of the first tag matching tag_name.

    Queries the tag search endpoint under AVO_API_ROOT and takes the first
    result. Progress is printed to stdout.

    Args:
        tag_name: Tag text to search for.

    Returns:
        The 'id' field of the first search result.

    Raises:
        IndexError: If the search returns no tags.
        urllib.error.URLError: If the request fails.
    '''
    print('Searching for tag {}'.format(tag_name))
    # Percent-encode the name so spaces, '&', '#' or non-ASCII characters
    # cannot corrupt the query string.
    safe_tag = urllib.parse.quote(tag_name, safe='')
    tag_search_url = "{}/tags?search={}&_fields=id".format(AVO_API_ROOT, safe_tag)
    tag_json = load_json(tag_search_url)
    if not tag_json:
        # Same exception type as indexing the empty list, but with context.
        raise IndexError('No tag found matching {!r}'.format(tag_name))
    tag_id = tag_json[0]['id']
    print('Tag id: {}'.format(tag_id))
    return tag_id
27
28
def count_comments(url, disqus_api_key):
    '''Return the number of Disqus comments for a given page URL.

    With an API key, the count comes from the Disqus threads/list endpoint.
    Without one, it is scraped from the Disqus embed page using MATCH_TOTAL.

    Args:
        url: Link of the page whose comments are counted.
        disqus_api_key: Disqus API key, or a falsy value to use the
            embed-page fallback.

    Returns:
        The comment count as an int. In API mode, 0 when Disqus lists no
        thread for the link.

    Raises:
        ValueError: If the embed page contains no recognisable total.
        urllib.error.URLError: If a request fails.
    '''
    safe_url = urllib.parse.quote(url, safe='')
    forum = 'avocadotesting'
    # Prefer using the API if we have a key
    if disqus_api_key:
        safe_key = urllib.parse.quote(disqus_api_key, safe='')
        disqus_api_url = '{}/threads/list.json?forum={}&thread:link={}&api_key={}'.format(
            DISQUS_API_ROOT, forum, safe_url, safe_key)
        threads = load_json(disqus_api_url)['response']
        if not threads:
            # The listing is filtered by link; no thread means no comments.
            return 0
        return int(threads[0]['posts'])
    # Otherwise scrape the total out of the embed page's raw bytes.
    disqus_url = 'https://disqus.com/embed/comments/?base=default&f={}&t_u={}'.format(
        forum, safe_url)
    # Close the response deterministically once the body has been read.
    with urllib.request.urlopen(disqus_url) as response:
        contents = response.read()
    match = MATCH_TOTAL.search(contents)
    if match is None:
        # Fail with a message that names the page instead of an opaque
        # AttributeError on None.
        raise ValueError(
            'Could not find a comment total in the Disqus embed page for {}'.format(url))
    return int(match.group(1))
46
47
def make_line(url, total):
    '''Build one CSV row, "<seg3>-<seg4>-<seg5>,<total>,<url>" plus a newline.

    The date part is taken from the 4th, 5th and 6th '/'-separated pieces of
    the URL (indices 3, 4 and 5). An IndexError is raised if the URL has
    fewer than six pieces.
    '''
    segments = url.split('/')
    # Indices 0-2 of the split are skipped; 3-5 form the date column.
    fields = (segments[3], segments[4], segments[5], total, url)
    return '{}-{}-{},{},{}\n'.format(*fields)
55
56
def main():
    '''Command-line entry point for avocado_stats.

    Resolves the tag given on the command line to its id, pages through the
    posts carrying that tag (100 per request), counts the comments on each
    post, and writes one 'Date,Comments,URL' row per post to the output CSV.
    Each row is also echoed to stdout.
    '''
    parser = argparse.ArgumentParser(
        description='Retrieve comment counts by date for a given tag, and write to csv.')
    parser.add_argument('tag', nargs=1, help='The tag to process')
    parser.add_argument('-k', '--key', help='Disqus API key')
    parser.add_argument(
        '-o', '--out', help='Name of the output csv. Default: <tag>.csv')

    args = parser.parse_args()

    tag = args.tag[0]  # nargs=1 yields a one-element list
    tag_id = find_tag_id(tag)
    outpath = args.out if args.out else '{}.csv'.format(tag)
    pages_base_url = '{}/posts?tags={}&_fields=link&per_page=100'.format(AVO_API_ROOT, tag_id)
    page_number = 1
    # The context manager closes the file even if a request fails mid-run.
    with open(outpath, 'w') as outfile:
        outfile.write('Date,Comments,URL\n')
        while True:
            # Flush once per page so finished rows are on disk if a later
            # request blows up.
            outfile.flush()
            pages_url = '{}&page={}'.format(pages_base_url, page_number)
            try:
                links = load_json(pages_url)
            except urllib.error.HTTPError:
                # Any HTTP error ends pagination; presumably the API answers
                # a page past the last one with an error status.
                break
            if not links:
                # An empty page also means there is nothing left; without
                # this guard an empty list would loop forever.
                break
            for link in links:
                url = link['link']
                total = count_comments(url, args.key)
                line = make_line(url, total)
                # The row already ends in a newline; don't print a second one.
                print(line, end='')
                outfile.write(line)
            page_number += 1
90
91
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()