# · 6 years ago · Mar 14, 2020, 10:32 PM
'''Retrieve comment counts by date for a given tag, and write them to a CSV file.'''
2
import argparse
import json
import urllib
import urllib.error
import urllib.parse
import urllib.request
7
# Root of the site's REST API, used to look up tag ids and list tagged posts.
AVO_API_ROOT = 'https://the-avocado.org/wp-json/wp/v2'
# Root of the Disqus API. HTTPS is used because the API key is sent as a
# query parameter and must not cross the network in clear text.
DISQUS_API_ROOT = 'https://disqus.com/api/3.0'
10
def load_json(url):
    '''Fetch a URL and return its body parsed as JSON.

    Args:
        url: The address to fetch; passed to urllib.request.urlopen.

    Returns:
        The decoded JSON value, as produced by json.loads.

    Raises:
        urllib.error.URLError: If the URL cannot be opened (HTTPError, a
            subclass, for HTTP error statuses).
        ValueError: If the body is not valid JSON.
    '''
    # The context manager closes the response even if read() fails, rather
    # than leaving the connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        contents = response.read()
    return json.loads(contents)
15
16
def find_tag_id(tag_name):
    '''Return the numeric id of the first tag matching the given name.

    Args:
        tag_name: Text to search the site's tags for.

    Returns:
        The 'id' field of the first search result.

    Raises:
        IndexError: If the search returns no tags.
    '''
    print('Searching for tag {}'.format(tag_name))
    # Percent-encode the name so spaces, '&', '#' and similar characters
    # cannot break or alter the query string.
    safe_name = urllib.parse.quote(tag_name, safe='')
    tag_search_url = '{}/tags?search={}&_fields=id'.format(AVO_API_ROOT, safe_name)
    tag_json = load_json(tag_search_url)
    if not tag_json:
        # Same exception type as indexing the empty list, but with a message
        # that says what actually went wrong.
        raise IndexError('No tag found matching {!r}'.format(tag_name))
    tag_id = tag_json[0]['id']
    print('Tag id: {}'.format(tag_id))
    return tag_id
25
26
def _extract_embed_total(contents):
    '''Return the integer following the first '"total":' marker in page bytes.'''
    total_header = b'"total":'
    marker_index = contents.find(total_header)
    if marker_index < 0:
        # Without this check find()'s -1 would be used as an offset and
        # unrelated bytes near the start of the page would be parsed.
        raise ValueError('No comment total found in Disqus embed page')
    total_index = marker_index + len(total_header)
    total_excerpt = contents[total_index:total_index + 10]
    return int(total_excerpt.split(b',')[0])


def count_comments(url, disqus_api_key, forum='avocadotesting'):
    '''Return the number of Disqus comments for a given page URL.

    Args:
        url: Link of the page whose comment thread is counted.
        disqus_api_key: Disqus API key. When falsy, the count is scraped
            from the Disqus embed page instead of using the API.
        forum: Disqus forum short name to query.

    Returns:
        The comment count as an int.

    Raises:
        ValueError: If the embed page contains no parsable total.
    '''
    safe_url = urllib.parse.quote(url, safe='')
    safe_forum = urllib.parse.quote(forum, safe='')
    # Prefer using the API if we have a key
    if disqus_api_key:
        safe_key = urllib.parse.quote(disqus_api_key, safe='')
        disqus_api_url = '{}/threads/list.json?forum={}&thread:link={}&api_key={}'.format(
            DISQUS_API_ROOT, safe_forum, safe_url, safe_key)
        threads = load_json(disqus_api_url)['response']
        if not threads:
            # NOTE(review): an empty list is taken to mean Disqus has no
            # thread for this link, i.e. zero comments - confirm.
            return 0
        return int(threads[0]['posts'])
    # Otherwise do some really nasty HTML hackery
    disqus_url = 'https://disqus.com/embed/comments/?base=default&f={}&t_u={}'.format(
        safe_forum, safe_url)
    # Close the response deterministically rather than leaking it.
    with urllib.request.urlopen(disqus_url) as response:
        contents = response.read()
    return _extract_embed_total(contents)
46
47
def make_line(url, total):
    '''Build one CSV row, "YYYY-MM-DD,<total>,<url>" plus a newline.

    The date is read from the 4th, 5th and 6th '/'-separated pieces of the
    URL (indices 3, 4 and 5).
    '''
    segments = url.split('/')
    date = '-'.join((segments[3], segments[4], segments[5]))
    return '{},{},{}\n'.format(date, total, url)
55
56
def main():
    '''Main function for avocado_stats.

    Parses the command line, resolves the tag to its id, then walks the
    paginated list of posts carrying that tag and writes one CSV row
    (date, comment count, URL) per post.
    '''
    parser = argparse.ArgumentParser(
        description='Retrieve comment counts by date for a given tag, and write to csv.')
    parser.add_argument('tag', nargs=1, help='The tag to process')
    parser.add_argument('-k', '--key', help='Disqus API key')
    parser.add_argument(
        '-o', '--out', help='Name of the output csv. Default: <tag>.csv')

    args = parser.parse_args()

    tag = args.tag[0]
    tag_id = find_tag_id(tag)
    outpath = args.out or '{}.csv'.format(tag)
    pages_base_url = '{}/posts?tags={}&_fields=link&per_page=100'.format(AVO_API_ROOT, tag_id)
    page_number = 1
    # The context manager closes the file even when a fetch raises part-way
    # through, so rows written so far are not lost.
    with open(outpath, 'w') as outfile:
        outfile.write('Date,Comments,URL\n')
        while True:
            # Flush once per page so partial results reach disk during a long run.
            outfile.flush()
            pages_url = '{}&page={}'.format(pages_base_url, page_number)
            try:
                links = load_json(pages_url)
            except urllib.error.HTTPError:
                # An HTTP error is treated as the end of the pages
                # (presumably the API rejects a page number past the last one).
                break
            if not links:
                # An empty page also means there is nothing left; stops the
                # loop from spinning if the API never returns an error.
                break
            for link in links:
                url = link['link']
                total = count_comments(url, args.key)
                line = make_line(url, total)
                print(line)
                outfile.write(line)
            page_number += 1
90
91
# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()