· 6 years ago · Mar 09, 2020, 02:54 AM
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3
4"""
5e6collector
6
7Very simple Python 3.6 script that will download all images with a specific
8tag, save them named ID-SOME-TAGS.EXTENSION and makes sure it doesn't
9download the same image multiple times because the tags changed. Also
10all tags will be written to tags.csv in the same folder. Using the
11e621 2020 API described on https://e621.net/help/api
12
13PROTIP: To view images with a certain tag try this on UNIX-like systems:
14gthumb `for f in $(grep -i TAG tags.csv | cut -f 1 -d ","); do echo $f-*; done`
15
16THE AUTHOR DOES NOT TAKE ANY RESPONSIBILITIES, THERE IS NO WARRANTY AND
17YOU PROBABLY SHOULDN'T USE THIS IN A NUCLEAR POWER PLANT, JUST SAYING!
18
19License: Public domain, do whatever.
20
21Version 1.0 -- Initial release, if you can call it that
22Version 1.0.1 -- Fixed Unicode problem on Windows
23Version 1.0.2 -- Fixed API not working
24Version 2.0 -- Ported to new 2020 API, added rate limiting
25Version 2.1 -- Handle restricted images by allowing API key usage or skipping
26"""
27import argparse
28import base64
29import csv
30import glob
31import json
32import time
33import os.path
34from collections import namedtuple
35from urllib.request import urlopen, Request
36
# Site root; all API URLs are built on top of this.
BASEURL = 'https://e621.net'
# Post listing endpoint: {0} = 1-based page number, {1} = URL-encoded tag query.
# limit=320 is the maximum page size the API allows.
LISTURL = BASEURL + '/posts.json?limit=320&page={0:d}&tags={1:s}'
# Output file name pattern: ID-TAG-TAG-....EXT
FNAME = '{0:d}-{1:s}.{2:s}'
KEEPCHARS = ['_', '-'] # For tag mangling: non-alphanumerics kept in file names
destination = None  # Target directory; assigned from CLI args in __main__

lastrequest = 0 # UNIX time stamp of last request, used by speedlimit()
# Headers sent with every request; login() may add an Authorization entry.
hdr = {'User-Agent': 'e6collector/2.1 (by stealthmode)'}
# Minimal view of an e621 post as consumed by mirror()/writetags().
Post = namedtuple("Post", ['id', 'deleted', 'tags', 'ext', 'url',
                           'sources'])
47
# Throttle helper: block until a full second has passed since the previous
# request, per the 1 req/s rate limit in the e621 API documentation.
def speedlimit():
    global lastrequest
    while time.time() < lastrequest + 1:
        print('Zzzz')
        time.sleep(0.2)
    # Record when this request slot was taken.
    lastrequest = time.time()
55
# Inject HTTP Basic Auth credentials into the shared header dict by hand.
# urllib's auth handler is useless here: authentication is optional on the
# server side, so it never answers with a 401 challenge.
def login(user, key):
    global hdr
    if user is None or key is None:
        return
    print('Using API key')
    token = base64.b64encode('{0:s}:{1:s}'.format(user, key).encode()).decode()
    hdr['Authorization'] = 'Basic {0:s}'.format(token)
65
def readlist(query):
    """Fetch every post matching *query* from the e621 list API.

    Pages through /posts.json (320 posts per page, pages numbered from 1)
    until an empty page is returned, sleeping between requests to honor
    the rate limit.

    Returns a list of Post namedtuples.
    Raises KeyError if the JSON response lacks the expected structure
    (no 'posts' array, or a post missing id/flags/file fields).
    """
    page = 1  # e621 pages are 1-indexed
    posts = []

    while True:
        # NOTE(review): no network error handling — a slow or failed
        # response propagates as an exception, same as before.
        print('Now fetching page {}'.format(page))
        with urlopen(Request(LISTURL.format(page, query), headers=hdr)) as req:
            jsondata = json.load(req)
        # Fix: the original raised and then had an unreachable exit(1).
        if 'posts' not in jsondata:
            raise KeyError('posts array not found in JSON data')
        if not jsondata['posts']:
            print('Done reading list')
            break
        for post in jsondata['posts']:
            # Flatten the per-category tag lists into one sorted list.
            # (The original wrapped these lookups in try/except KeyError
            # blocks that only re-raised — removed as no-ops.)
            tags = set()
            for category in post['tags']:
                if isinstance(post['tags'][category], list):
                    tags.update(post['tags'][category])

            posts.append(Post(post['id'],
                              post['flags']['deleted'],
                              sorted(tags),
                              post['file']['ext'],
                              post['file']['url'],
                              post['sources']))

        page += 1
        speedlimit()
    return posts
106
def writetags(post_id, source, tags):
    """Append one row ``id,source,tag1,tag2,...`` to tags.csv in `destination`.

    source may be None (written as an empty string). NOTE(review): the
    caller passes post.sources, a list, which csv stringifies into a
    single cell — presumably intentional, verify against consumers of
    tags.csv.
    """
    if source is None:
        source = ""
    fullpath = os.path.join(destination, 'tags.csv')
    # Fix: newline='' is required by the csv module; without it, rows on
    # Windows are written with doubled line endings (blank lines).
    with open(fullpath, 'a', encoding='utf-8', newline='') as tagfile:
        writer = csv.writer(tagfile, quoting=csv.QUOTE_MINIMAL)
        row = [post_id, source]
        row.extend(tags)
        writer.writerow(row)
116
def mirror(tags):
    """Download every matching post into `destination` and log its tags.

    Skips posts that are deleted, already on disk (any file named
    "<id>-*"), or whose file URL is withheld by the API (restricted
    content for anonymous users). Each saved file is named
    ID-TAG-TAG-....EXT and gets a row appended to tags.csv.
    """
    posts = readlist(tags)
    downloaded = 0
    for post in posts:
        if post.deleted:
            print('{0:d} was deleted'.format(post.id))
            continue
        # Match on the id prefix only, so a post whose tags (and thus
        # file name) changed since the last run is not re-downloaded.
        if len(glob.glob(os.path.join(destination, '{0:d}-*'.format(post.id)))) > 0:
            print('{0:d} already exists'.format(post.id))
            continue
        if post.url is None:
            # The API omits file URLs for posts hidden from this account.
            print('{0:d} could not be downloaded, you might have to log in'.format(post.id))
            continue
        speedlimit()
        with urlopen(Request(post.url, headers=hdr)) as request:
            downloaded += 1
            print('{0:d} is being downloaded and saved...'.format(post.id))
            imgdata = request.read()

        # Make tags safe for file name usage. Still unelegant as fuck.
        # Keep only alphanumerics plus KEEPCHARS; report anything dropped.
        fname_tags = []
        for tag in post.tags:
            mangled_tag = ''
            for chara in tag:
                if chara.isalnum() or chara in KEEPCHARS:
                    mangled_tag += chara
                else:
                    print('Bad char?', ord(chara))
            if len(mangled_tag) > 0:
                fname_tags.append(mangled_tag)

        # Cap the tag portion at 190 chars to stay within file name limits.
        fname = FNAME.format(post.id, '-'.join(fname_tags)[:190], post.ext)
        fullpath = os.path.join(destination, fname)
        # NOTE(review): bytes path — presumably the v1.0.1 "Unicode problem
        # on Windows" fix mentioned in the changelog; confirm before changing.
        with open(fullpath.encode('utf-8'), 'wb') as imgfile:
            imgfile.write(imgdata)
        writetags(post.id, post.sources, post.tags)

    print('Finished collecting {0:d} posts'.format(len(posts)))
    print('{0:d} were newly downloaded'.format(downloaded))
156
if __name__ == '__main__':
    # CLI entry point. All four arguments are optional positionals; at
    # minimum a destination directory and a tag query are needed.
    argparser = argparse.ArgumentParser(description='Download files by tag from e621')
    argparser.add_argument('destination', help='Directory to store the files in', nargs='?')
    argparser.add_argument('tags', help='Tags to look for. Must be URL encoded already. Try "fav:yourname"', nargs='?')
    argparser.add_argument('user', help='User account to use with API key', default=None, nargs='?')
    argparser.add_argument('key', help='API key, needed for some downloads', default=None, nargs='?')
    parsed = argparser.parse_args()
    if parsed.tags is None:
        # Too few arguments: show usage and bail out.
        argparser.print_help()
        exit(1)
    destination = parsed.destination
    login(parsed.user, parsed.key)
    mirror(parsed.tags)