# Snapshot retrieved Aug 03, 2020, 10:40 AM
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3
4import argparse
5import base64
6import csv
7import glob
8import json
9import time
10import os.path
11from collections import namedtuple
12from urllib.request import urlopen, Request
13
# API endpoints; posts.json returns at most 320 results per page.
BASEURL = 'https://e621.net'
LISTURL = BASEURL + '/posts.json?limit=320&page={0:d}&tags={1:s}'
# Local file name template: <post id>-<mangled tags>.<extension>
FNAME = '{0:d}-{1:s}.{2:s}'
KEEPCHARS = ['_', '-'] # For tag mangling: non-alphanumerics other than these are dropped
destination = None  # Target directory; assigned from the CLI in __main__

lastrequest = 0 # UNIX time stamp of last request; maintained by speedlimit()
hdr = {'User-Agent': 'e6collector/2.1 (by stealthmode)'}
# One downloadable post: numeric id, deleted flag, sorted flat tag list,
# file extension, download URL (None when a login is required), source list.
Post = namedtuple("Post", ['id', 'deleted', 'tags', 'ext', 'url',
                           'sources'])
24
# Simple delay for adhering to the 1 request per second rate limit outlined
# in the API documentation.
def speedlimit():
    """Block until at least one second has passed since the last request.

    Updates the module-level ``lastrequest`` timestamp as a side effect.
    """
    global lastrequest
    # Sleep the exact remaining time once instead of busy-polling in
    # 0.2 second steps and printing on every wake-up.
    remaining = lastrequest + 1 - time.time()
    if remaining > 0:
        print('Zzzz')
        time.sleep(remaining)
    lastrequest = time.time()
# Add HTTP basic auth headers manually. The urllib auth handler never fires
# because the server treats authentication as optional and does not challenge.
def login(user, key):
    """Attach a Basic auth header to the shared request headers.

    Does nothing unless both *user* and *key* are provided.
    """
    global hdr
    if user is None or key is None:
        return
    print('Using API key')
    raw = '{0:s}:{1:s}'.format(user, key)
    encoded = base64.b64encode(raw.encode()).decode()
    hdr['Authorization'] = 'Basic {0:s}'.format(encoded)
42
def readlist(query):
    """Fetch every post matching *query* from the paginated list API.

    Returns a list of Post tuples in API order.
    Raises KeyError when a response lacks the expected 'posts' array.
    """
    page = 1  # The API counts pages from 1
    posts = []

    while True:
        # Some error handling would be nice; this will bug out if e6 is slow.
        print('Now fetching page {}'.format(page))
        with urlopen(Request(LISTURL.format(page, query), headers=hdr)) as req:
            jsondata = json.load(req)
        if 'posts' not in jsondata:
            raise KeyError('posts array not found in JSON data')
        if not jsondata['posts']:
            print('Done reading list')
            break
        for post in jsondata['posts']:
            posts.append(_parse_post(post))
        page += 1
        speedlimit()
    return posts

# Convert one raw API post dict into a Post tuple with a flat, sorted tag
# list. A KeyError here means the API response shape changed.
def _parse_post(post):
    tags = set()
    for category in post['tags']:
        # Each category maps to a list of tag strings; skip anything else.
        if isinstance(post['tags'][category], list):
            tags = tags.union(post['tags'][category])
    return Post(post['id'],
                post['flags']['deleted'],
                sorted(tags),
                post['file']['ext'],
                post['file']['url'],
                post['sources'])
83
def writetags(post_id, source, tags):
    """Append one row (post_id, source, tag, tag, ...) to tags.csv.

    *source* may be None (recorded as an empty field) or whatever the API
    supplied, typically a list of source URLs.
    """
    if source is None:
        source = ""
    fullpath = os.path.join(destination, 'tags.csv')
    # newline='' is required by the csv module; without it each row gains a
    # spurious blank line on Windows.
    with open(fullpath, 'a', encoding='utf-8', newline='') as tagfile:
        writer = csv.writer(tagfile, quoting=csv.QUOTE_MINIMAL)
        row = [post_id, source]
        row.extend(tags)
        writer.writerow(row)
93
def mirror(tags):
    """Download every post matching *tags* into *destination*.

    Skips deleted posts, posts already present on disk, and posts whose
    download URL is withheld (login required). Records each download's tags
    via writetags() and prints a summary when finished.
    """
    posts = readlist(tags)
    downloaded = 0
    for post in posts:
        if post.deleted:
            print('{0:d} was deleted'.format(post.id))
            continue
        # Match on the id prefix only: an earlier download of the same post
        # may carry a different tag suffix or extension.
        if len(glob.glob(os.path.join(destination, '{0:d}-*'.format(post.id)))) > 0:
            print('{0:d} already exists'.format(post.id))
            continue
        if post.url is None:
            print('{0:d} could not be downloaded, you might have to log in'.format(post.id))
            continue
        speedlimit()
        with urlopen(Request(post.url, headers=hdr)) as request:
            downloaded += 1
            print('{0:d} is being downloaded and saved...'.format(post.id))
            imgdata = request.read()

        # File names are capped at 190 chars of tag text to stay well inside
        # common filesystem limits.
        fname = FNAME.format(post.id, '-'.join(_mangle_tags(post.tags))[:190], post.ext)
        fullpath = os.path.join(destination, fname)
        # Pass the path as str: encoding it to bytes is unnecessary and
        # bytes paths are deprecated on Windows.
        with open(fullpath, 'wb') as imgfile:
            imgfile.write(imgdata)
        writetags(post.id, post.sources, post.tags)

    print('Finished collecting {0:d} posts'.format(len(posts)))
    print('{0:d} were newly downloaded'.format(downloaded))

# Make tags safe for file name usage: keep alphanumerics plus KEEPCHARS,
# report everything else, and drop tags that mangle down to nothing.
def _mangle_tags(tags):
    fname_tags = []
    for tag in tags:
        mangled_tag = ''
        for chara in tag:
            if chara.isalnum() or chara in KEEPCHARS:
                mangled_tag += chara
            else:
                print('Bad char?', ord(chara))
        if len(mangled_tag) > 0:
            fname_tags.append(mangled_tag)
    return fname_tags
133
if __name__ == '__main__':
    # Every argument is an optional positional; a missing tags argument
    # triggers the help text and a non-zero exit.
    parser = argparse.ArgumentParser(description='Download files by tag from e621')
    for arg_name, arg_help in (
            ('destination', 'Directory to store the files in'),
            ('tags', 'Tags to look for. Must be URL encoded already. Try "fav:yourname"'),
            ('user', 'User account to use with API key'),
            ('key', 'API key, needed for some downloads')):
        parser.add_argument(arg_name, help=arg_help, default=None, nargs='?')
    args = parser.parse_args()
    if args.tags is None:
        parser.print_help()
        exit(1)
    destination = args.destination
    login(args.user, args.key)
    mirror(args.tags)