bzKPy9s6

· 6 years ago · Mar 14, 2020, 10:32 PM
1'''Retrieve comment counts by date for a given tag, and write to csv.'''
2
import argparse
import json
import urllib
import urllib.error
import urllib.parse
import urllib.request
7
# Root of the WordPress REST API (v2) that serves tags and posts.
AVO_API_ROOT = 'https://the-avocado.org/wp-json/wp/v2'
# Root of the Disqus API (v3.0). HTTPS is used because requests built from
# this root carry the API key as a query parameter.
DISQUS_API_ROOT = 'https://disqus.com/api/3.0'
10
def load_json(url):
    '''Fetch a URL and return its body parsed as JSON.

    Args:
        url: The URL to retrieve.

    Returns:
        The decoded JSON value (dict, list, ...).

    Raises:
        urllib.error.URLError: if the request fails (HTTPError for HTTP
            error statuses).
        ValueError: if the body is not valid JSON.
    '''
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as response:
        contents = response.read()
    return json.loads(contents)
15
16
def find_tag_id(tag_name):
    '''Return the numeric id of the first tag matching the given name.

    Args:
        tag_name: Text to search for among the site's tags.

    Returns:
        The 'id' field of the first search result.

    Raises:
        IndexError: if the search returns no tags.
        urllib.error.URLError: if the request fails.
    '''
    print('Searching for tag {}'.format(tag_name))
    # Escape the name so spaces, '&', '#', etc. cannot break or alter the query.
    safe_name = urllib.parse.quote(tag_name, safe='')
    tag_search_url = "{}/tags?search={}&_fields=id".format(AVO_API_ROOT, safe_name)
    tag_json = load_json(tag_search_url)
    if not tag_json:
        # Same exception type as the bare index would raise, but with context.
        raise IndexError('No tag found matching {!r}'.format(tag_name))
    tag_id = tag_json[0]['id']
    print('Tag id: {}'.format(tag_id))
    return tag_id
25
26
def count_comments(url, disqus_api_key):
    '''Return the number of Disqus comments for a given page URL.

    Args:
        url: Link of the page whose comment thread is counted.
        disqus_api_key: Disqus API key. When falsy, the count is scraped
            from the public embed page instead of the API.

    Returns:
        The comment count as an int. With an API key, an empty thread list
        is reported as 0 (assumes that means no thread exists for the URL
        yet - confirm against the Disqus API).

    Raises:
        ValueError: if the embed page contains no parsable total.
        urllib.error.URLError: if a request fails.
    '''
    safe_url = urllib.parse.quote(url, safe='')
    forum = 'avocadotesting'
    # Prefer using the API if we have a key
    if disqus_api_key:
        safe_key = urllib.parse.quote(disqus_api_key, safe='')
        disqus_api_url = '{}/threads/list.json?forum={}&thread:link={}&api_key={}'.format(
            DISQUS_API_ROOT, forum, safe_url, safe_key)
        threads = load_json(disqus_api_url)['response']
        if not threads:
            return 0
        return int(threads[0]['posts'])
    # Otherwise do some really nasty HTML hackery: look for the first
    # '"total":<number>,' fragment in the embed page.
    disqus_url = 'https://disqus.com/embed/comments/?base=default&f={}&t_u={}'.format(
        forum, safe_url)
    with urllib.request.urlopen(disqus_url) as response:
        contents = response.read()
    total_header = b'"total":'
    header_index = contents.find(total_header)
    if header_index < 0:
        # Without this check, find()'s -1 would make us parse an arbitrary
        # slice near the start of the page.
        raise ValueError(
            'No comment total found in Disqus embed page for {}'.format(url))
    total_index = header_index + len(total_header)
    total_excerpt = contents[total_index:total_index + 10]
    return int(total_excerpt.split(b',')[0])
46
47
def make_line(url, total):
    '''Build the CSV row "<year>-<month>-<day>,<total>,<url>" for a page.

    The date is taken from the 4th, 5th and 6th '/'-separated pieces of
    the URL; the row ends with a newline.
    '''
    segments = url.split('/')
    date = f'{segments[3]}-{segments[4]}-{segments[5]}'
    return f'{date},{total},{url}\n'
55
56
def main():
    '''Main function for avocado_stats.

    Parses the command line, resolves the tag to its id, then walks the
    tag's posts page by page (100 per request), writing one CSV row per
    post with its date, comment count and URL. Each row is also printed.
    '''
    parser = argparse.ArgumentParser(
        description='Retrieve comment counts by date for a given tag, and write to csv.')
    parser.add_argument('tag', nargs=1, help='The tag to process')
    parser.add_argument('-k', '--key', help='Disqus API key')
    parser.add_argument(
        '-o', '--out', help='Name of the output csv. Default: <tag>.csv')

    args = parser.parse_args()

    tag = args.tag[0]
    tag_id = find_tag_id(tag)
    outpath = args.out or '{}.csv'.format(tag)
    pages_base_url = '{}/posts?tags={}&_fields=link&per_page=100'.format(AVO_API_ROOT, tag_id)
    page_number = 1
    # 'with' guarantees the file is closed (and rows so far are kept) even
    # if a request fails part-way through.
    with open(outpath, 'w', encoding='utf-8') as outfile:
        outfile.write('Date,Comments,URL\n')
        while True:
            pages_url = '{}&page={}'.format(pages_base_url, page_number)
            try:
                links = load_json(pages_url)
            except urllib.error.HTTPError as error:
                # Running past the last page is expected to yield HTTP 400
                # (per the WordPress REST API pagination docs); any other
                # status is a real failure, so say so instead of silently
                # truncating the output.
                if error.code != 400:
                    print('Stopping early: HTTP {} for {}'.format(error.code, pages_url))
                break
            if not links:
                # An empty page with a 200 status would otherwise loop forever.
                break
            for link in links:
                url = link['link']
                total = count_comments(url, args.key)
                line = make_line(url, total)
                print(line)
                outfile.write(line)
            # Persist each completed page so progress survives a later failure.
            outfile.flush()
            page_number += 1
90
91
# Run main() only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()