# Snapshot retrieved Aug 03, 2020, 10:40 AM
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3
4import argparse
5import base64
6import csv
7import glob
8import json
9import time
10import os.path
11from collections import namedtuple
12from urllib.request import urlopen, Request
13
# API endpoints; posts.json returns at most 320 results per page.
BASEURL = 'https://e621.net'
LISTURL = BASEURL + '/posts.json?limit=320&page={0:d}&tags={1:s}'
# Local file name template: <post id>-<mangled tags>.<extension>
FNAME = '{0:d}-{1:s}.{2:s}'
KEEPCHARS = ['_', '-'] # For tag mangling: non-alphanumerics other than these are dropped
destination = None  # Target directory; assigned from the CLI in __main__

lastrequest = 0 # UNIX time stamp of last request; maintained by speedlimit()
hdr = {'User-Agent': 'e6collector/2.1 (by stealthmode)'}
# One downloadable post: numeric id, deleted flag, sorted flat tag list,
# file extension, download URL (None when a login is required), source list.
Post = namedtuple("Post", ['id', 'deleted', 'tags', 'ext', 'url',
                           'sources'])
24
# Simple delay for adhering to the 1 request per second rate limit outlined
# in the API documentation.
def speedlimit():
    """Block until at least one second has passed since the last request.

    Updates the module-level ``lastrequest`` timestamp as a side effect.
    """
    global lastrequest
    # Sleep the exact remaining time once instead of busy-polling in
    # 0.2 second steps and printing on every wake-up.
    remaining = lastrequest + 1 - time.time()
    if remaining > 0:
        print('Zzzz')
        time.sleep(remaining)
    lastrequest = time.time()
# Add HTTP basic auth headers manually. The urllib auth handler never fires
# because the server treats authentication as optional and does not challenge.
def login(user, key):
    """Attach a Basic auth header to the shared request headers.

    Does nothing unless both *user* and *key* are provided.
    """
    global hdr
    if user is None or key is None:
        return
    print('Using API key')
    raw = '{0:s}:{1:s}'.format(user, key)
    encoded = base64.b64encode(raw.encode()).decode()
    hdr['Authorization'] = 'Basic {0:s}'.format(encoded)
42
def readlist(query):
    """Fetch every post matching *query* from the paginated list API.

    Returns a list of Post tuples in API order.
    Raises KeyError when a response lacks the expected 'posts' array.
    """
    page = 1  # The API counts pages from 1
    posts = []

    while True:
        # Some error handling would be nice; this will bug out if e6 is slow.
        print('Now fetching page {}'.format(page))
        with urlopen(Request(LISTURL.format(page, query), headers=hdr)) as req:
            jsondata = json.load(req)
        if 'posts' not in jsondata:
            raise KeyError('posts array not found in JSON data')
        if not jsondata['posts']:
            print('Done reading list')
            break
        for post in jsondata['posts']:
            posts.append(_parse_post(post))
        page += 1
        speedlimit()
    return posts

# Convert one raw API post dict into a Post tuple with a flat, sorted tag
# list. A KeyError here means the API response shape changed.
def _parse_post(post):
    tags = set()
    for category in post['tags']:
        # Each category maps to a list of tag strings; skip anything else.
        if isinstance(post['tags'][category], list):
            tags = tags.union(post['tags'][category])
    return Post(post['id'],
                post['flags']['deleted'],
                sorted(tags),
                post['file']['ext'],
                post['file']['url'],
                post['sources'])
83
def writetags(post_id, source, tags):
    """Append one row (post_id, source, tag, tag, ...) to tags.csv.

    *source* may be None (recorded as an empty field) or whatever the API
    supplied, typically a list of source URLs.
    """
    if source is None:
        source = ""
    fullpath = os.path.join(destination, 'tags.csv')
    # newline='' is required by the csv module; without it each row gains a
    # spurious blank line on Windows.
    with open(fullpath, 'a', encoding='utf-8', newline='') as tagfile:
        writer = csv.writer(tagfile, quoting=csv.QUOTE_MINIMAL)
        row = [post_id, source]
        row.extend(tags)
        writer.writerow(row)
93
def mirror(tags):
    """Download every post matching *tags* into *destination*.

    Skips deleted posts, posts already present on disk, and posts whose
    download URL is withheld (login required). Records each download's tags
    via writetags() and prints a summary when finished.
    """
    posts = readlist(tags)
    downloaded = 0
    for post in posts:
        if post.deleted:
            print('{0:d} was deleted'.format(post.id))
            continue
        # Match on the id prefix only: an earlier download of the same post
        # may carry a different tag suffix or extension.
        if len(glob.glob(os.path.join(destination, '{0:d}-*'.format(post.id)))) > 0:
            print('{0:d} already exists'.format(post.id))
            continue
        if post.url is None:
            print('{0:d} could not be downloaded, you might have to log in'.format(post.id))
            continue
        speedlimit()
        with urlopen(Request(post.url, headers=hdr)) as request:
            downloaded += 1
            print('{0:d} is being downloaded and saved...'.format(post.id))
            imgdata = request.read()

        # File names are capped at 190 chars of tag text to stay well inside
        # common filesystem limits.
        fname = FNAME.format(post.id, '-'.join(_mangle_tags(post.tags))[:190], post.ext)
        fullpath = os.path.join(destination, fname)
        # Pass the path as str: encoding it to bytes is unnecessary and
        # bytes paths are deprecated on Windows.
        with open(fullpath, 'wb') as imgfile:
            imgfile.write(imgdata)
        writetags(post.id, post.sources, post.tags)

    print('Finished collecting {0:d} posts'.format(len(posts)))
    print('{0:d} were newly downloaded'.format(downloaded))

# Make tags safe for file name usage: keep alphanumerics plus KEEPCHARS,
# report everything else, and drop tags that mangle down to nothing.
def _mangle_tags(tags):
    fname_tags = []
    for tag in tags:
        mangled_tag = ''
        for chara in tag:
            if chara.isalnum() or chara in KEEPCHARS:
                mangled_tag += chara
            else:
                print('Bad char?', ord(chara))
        if len(mangled_tag) > 0:
            fname_tags.append(mangled_tag)
    return fname_tags
133
if __name__ == '__main__':
    # Every argument is an optional positional; a missing tags argument
    # triggers the help text and a non-zero exit.
    parser = argparse.ArgumentParser(description='Download files by tag from e621')
    for arg_name, arg_help in (
            ('destination', 'Directory to store the files in'),
            ('tags', 'Tags to look for. Must be URL encoded already. Try "fav:yourname"'),
            ('user', 'User account to use with API key'),
            ('key', 'API key, needed for some downloads')):
        parser.add_argument(arg_name, help=arg_help, default=None, nargs='?')
    args = parser.parse_args()
    if args.tags is None:
        parser.print_help()
        exit(1)
    destination = args.destination
    login(args.user, args.key)
    mirror(args.tags)