· 6 years ago · Mar 09, 2020, 02:54 AM
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3
4"""
5e6collector
6
7Very simple Python 3.6 script that will download all images with a specific
8tag, save them named ID-SOME-TAGS.EXTENSION and makes sure it doesn't
9download the same image multiple times because the tags changed. Also
10all tags will be written to tags.csv in the same folder. Using the
11e621 2020 API described on https://e621.net/help/api
12
13PROTIP: To view images with a certain tag try this on UNIX-like systems:
14gthumb `for f in $(grep -i TAG tags.csv | cut -f 1 -d ","); do echo $f-*; done`
15
16THE AUTHOR DOES NOT TAKE ANY RESPONSIBILITIES, THERE IS NO WARRANTY AND
17YOU PROBABLY SHOULDN'T USE THIS IN A NUCLEAR POWER PLANT, JUST SAYING!
18
19License: Public domain, do whatever.
20
21Version 1.0 -- Initial release, if you can call it that
22Version 1.0.1 -- Fixed Unicode problem on Windows
23Version 1.0.2 -- Fixed API not working
24Version 2.0 -- Ported to new 2020 API, added rate limiting
25Version 2.1 -- Handle restricted images by allowing API key usage or skipping
26"""
27import argparse
28import base64
29import csv
30import glob
31import json
32import time
33import os.path
34from collections import namedtuple
35from urllib.request import urlopen, Request
36
# Site root; all API URLs are built on top of this.
BASEURL = 'https://e621.net'
# Post listing endpoint: {0} = 1-based page number, {1} = URL-encoded tag query.
# limit=320 is the maximum page size the API allows.
LISTURL = BASEURL + '/posts.json?limit=320&page={0:d}&tags={1:s}'
# Output file name pattern: ID-TAG-TAG-....EXT
FNAME = '{0:d}-{1:s}.{2:s}'
KEEPCHARS = ['_', '-'] # For tag mangling: non-alphanumerics kept in file names
destination = None  # Target directory; assigned from CLI args in __main__

lastrequest = 0 # UNIX time stamp of last request, used by speedlimit()
# Headers sent with every request; login() may add an Authorization entry.
hdr = {'User-Agent': 'e6collector/2.1 (by stealthmode)'}
# Minimal view of an e621 post as consumed by mirror()/writetags().
Post = namedtuple("Post", ['id', 'deleted', 'tags', 'ext', 'url',
                           'sources'])
47
# Throttle helper: block until a full second has passed since the previous
# request, per the 1 req/s rate limit in the e621 API documentation.
def speedlimit():
    global lastrequest
    while time.time() < lastrequest + 1:
        print('Zzzz')
        time.sleep(0.2)
    # Record when this request slot was taken.
    lastrequest = time.time()
55
# Inject HTTP Basic Auth credentials into the shared header dict by hand.
# urllib's auth handler is useless here: authentication is optional on the
# server side, so it never answers with a 401 challenge.
def login(user, key):
    global hdr
    if user is None or key is None:
        return
    print('Using API key')
    token = base64.b64encode('{0:s}:{1:s}'.format(user, key).encode()).decode()
    hdr['Authorization'] = 'Basic {0:s}'.format(token)
65
def readlist(query):
    """Fetch every post matching *query* from the e621 list API.

    Pages through /posts.json (320 posts per page, pages numbered from 1)
    until an empty page is returned, sleeping between requests to honor
    the rate limit.

    Returns a list of Post namedtuples.
    Raises KeyError if the JSON response lacks the expected structure
    (no 'posts' array, or a post missing id/flags/file fields).
    """
    page = 1  # e621 pages are 1-indexed
    posts = []

    while True:
        # NOTE(review): no network error handling — a slow or failed
        # response propagates as an exception, same as before.
        print('Now fetching page {}'.format(page))
        with urlopen(Request(LISTURL.format(page, query), headers=hdr)) as req:
            jsondata = json.load(req)
        # Fix: the original raised and then had an unreachable exit(1).
        if 'posts' not in jsondata:
            raise KeyError('posts array not found in JSON data')
        if not jsondata['posts']:
            print('Done reading list')
            break
        for post in jsondata['posts']:
            # Flatten the per-category tag lists into one sorted list.
            # (The original wrapped these lookups in try/except KeyError
            # blocks that only re-raised — removed as no-ops.)
            tags = set()
            for category in post['tags']:
                if isinstance(post['tags'][category], list):
                    tags.update(post['tags'][category])

            posts.append(Post(post['id'],
                              post['flags']['deleted'],
                              sorted(tags),
                              post['file']['ext'],
                              post['file']['url'],
                              post['sources']))

        page += 1
        speedlimit()
    return posts
106
def writetags(post_id, source, tags):
    """Append one row ``id,source,tag1,tag2,...`` to tags.csv in `destination`.

    source may be None (written as an empty string). NOTE(review): the
    caller passes post.sources, a list, which csv stringifies into a
    single cell — presumably intentional, verify against consumers of
    tags.csv.
    """
    if source is None:
        source = ""
    fullpath = os.path.join(destination, 'tags.csv')
    # Fix: newline='' is required by the csv module; without it, rows on
    # Windows are written with doubled line endings (blank lines).
    with open(fullpath, 'a', encoding='utf-8', newline='') as tagfile:
        writer = csv.writer(tagfile, quoting=csv.QUOTE_MINIMAL)
        row = [post_id, source]
        row.extend(tags)
        writer.writerow(row)
116
def mirror(tags):
    """Download every matching post into `destination` and log its tags.

    Skips posts that are deleted, already on disk (any file named
    "<id>-*"), or whose file URL is withheld by the API (restricted
    content for anonymous users). Each saved file is named
    ID-TAG-TAG-....EXT and gets a row appended to tags.csv.
    """
    posts = readlist(tags)
    downloaded = 0
    for post in posts:
        if post.deleted:
            print('{0:d} was deleted'.format(post.id))
            continue
        # Match on the id prefix only, so a post whose tags (and thus
        # file name) changed since the last run is not re-downloaded.
        if len(glob.glob(os.path.join(destination, '{0:d}-*'.format(post.id)))) > 0:
            print('{0:d} already exists'.format(post.id))
            continue
        if post.url is None:
            # The API omits file URLs for posts hidden from this account.
            print('{0:d} could not be downloaded, you might have to log in'.format(post.id))
            continue
        speedlimit()
        with urlopen(Request(post.url, headers=hdr)) as request:
            downloaded += 1
            print('{0:d} is being downloaded and saved...'.format(post.id))
            imgdata = request.read()

        # Make tags safe for file name usage. Still unelegant as fuck.
        # Keep only alphanumerics plus KEEPCHARS; report anything dropped.
        fname_tags = []
        for tag in post.tags:
            mangled_tag = ''
            for chara in tag:
                if chara.isalnum() or chara in KEEPCHARS:
                    mangled_tag += chara
                else:
                    print('Bad char?', ord(chara))
            if len(mangled_tag) > 0:
                fname_tags.append(mangled_tag)

        # Cap the tag portion at 190 chars to stay within file name limits.
        fname = FNAME.format(post.id, '-'.join(fname_tags)[:190], post.ext)
        fullpath = os.path.join(destination, fname)
        # NOTE(review): bytes path — presumably the v1.0.1 "Unicode problem
        # on Windows" fix mentioned in the changelog; confirm before changing.
        with open(fullpath.encode('utf-8'), 'wb') as imgfile:
            imgfile.write(imgdata)
        writetags(post.id, post.sources, post.tags)

    print('Finished collecting {0:d} posts'.format(len(posts)))
    print('{0:d} were newly downloaded'.format(downloaded))
156
if __name__ == '__main__':
    # CLI entry point. All four arguments are optional positionals; at
    # minimum a destination directory and a tag query are needed.
    argparser = argparse.ArgumentParser(description='Download files by tag from e621')
    argparser.add_argument('destination', help='Directory to store the files in', nargs='?')
    argparser.add_argument('tags', help='Tags to look for. Must be URL encoded already. Try "fav:yourname"', nargs='?')
    argparser.add_argument('user', help='User account to use with API key', default=None, nargs='?')
    argparser.add_argument('key', help='API key, needed for some downloads', default=None, nargs='?')
    parsed = argparser.parse_args()
    if parsed.tags is None:
        # Too few arguments: show usage and bail out.
        argparser.print_help()
        exit(1)
    destination = parsed.destination
    login(parsed.user, parsed.key)
    mirror(parsed.tags)