#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Github.com/Skarlett
__version__ = '1.6.12'
__author__ = 'Skarlett'
##################
# Tumblr APIv2 Basic Util Scraper
import pytumblr as api
import json
import os
import time
import requests
import hashlib
import pwd
import configparser
import multiprocessing  # moar power

pprint = lambda obj: print(json.dumps(obj, separators=(',', ': '), indent=4, sort_keys=True))
getusername = lambda: pwd.getpwuid(os.getuid())[0]
DEBUGGING = False

##############
# Exceptions
#####
class TooManyRequestsWarning(Warning):
    pass


class NoKeysInDatabase(ValueError):
    pass


class HourLimitCap(TooManyRequestsWarning):
    pass


class DayLimitCap(TooManyRequestsWarning):
    pass


class CONFIG_FILE:
    ''' Load the configuration, writing a default config file on first run. '''
    DIR = "/home/{}/.config/tumblr_scraper/".format(getusername())
    FP = os.path.join(DIR, "tumblr_scraper.conf")
    _CONFIG = configparser.ConfigParser()

    __base = '/home/{}/.config'.format(getusername())
    for d in [__base, __base + '/tumblr_scraper']:
        if not os.path.isdir(d):
            os.mkdir(d)
    del __base

    if not os.path.isfile(FP):
        # configparser only accepts string values, so everything is stringified
        _CONFIG['APP'] = {
            'jsondb': os.path.join(DIR, 'keys.json'),
            'reset': str(24 * 60 * 60),
            'total_limit': '5000',
            'hour_limit': '1000',
            'save_every_x_requests': '50',
            'download_chunk_size': '4096',
            'download_threads': '4',
            'pass_tags_to_crawl_tags_if_crawl_tags_empty': 'true',
            'debug': 'false',
            'blacklist_fp': os.path.join(DIR, 'blacklist.lst')
        }
        _CONFIG['API'] = {
            'maxpost': '20',
            'default_img_cnt': '200',
            'default_vid_cnt': '40',
            'default_min_media': '20'
        }
        with open(FP, 'w') as cf:
            _CONFIG.write(cf)

        print('configuration file written to {} - exiting, please restart.'.format(FP))
        exit(0)
    else:
        # the file previously was never read before the get() calls, so every
        # value silently fell back to its default; section names must also
        # match the written 'APP'/'API' (configparser sections are case-sensitive)
        _CONFIG.read(FP)
        DB = _CONFIG.get('APP', 'jsondb', fallback=os.path.join(DIR, 'keys.json'))
        BLACKLST = _CONFIG.get('APP', 'blacklist_fp', fallback=os.path.join(DIR, 'blacklist.lst'))
        RESET = _CONFIG.getint('APP', 'reset', fallback=24 * 60 * 60)
        TLIMIT = _CONFIG.getint('APP', 'total_limit', fallback=5000)
        HLIMIT = _CONFIG.getint('APP', 'hour_limit', fallback=1000)
        DL_SIZE = _CONFIG.getint('APP', 'download_chunk_size', fallback=4096)
        DL_THREADS = _CONFIG.getint('APP', 'download_threads', fallback=2)
        SAVECNT = _CONFIG.getint('APP', 'save_every_x_requests', fallback=200)
        PASS_TAGS = _CONFIG.getboolean('APP', 'pass_tags_to_crawl_tags_if_crawl_tags_empty', fallback=True)
        DEBUG = _CONFIG.getboolean('APP', 'debug', fallback=False)

        POST_SIZE = _CONFIG.getint('API', 'maxpost', fallback=20)
        IMG_CNT = _CONFIG.getint('API', 'default_img_cnt', fallback=200)
        VID_CNT = _CONFIG.getint('API', 'default_vid_cnt', fallback=40)
        MIN_CNT = _CONFIG.getint('API', 'default_min_media', fallback=20)

        if DEBUG:
            global DEBUGGING
            DEBUGGING = True

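# For reference, the generated tumblr_scraper.conf looks roughly like this
# (paths depend on your user; values are the defaults written above):
#
#   [APP]
#   jsondb = /home/<user>/.config/tumblr_scraper/keys.json
#   reset = 86400
#   total_limit = 5000
#   hour_limit = 1000
#   save_every_x_requests = 50
#   download_chunk_size = 4096
#   download_threads = 4
#   pass_tags_to_crawl_tags_if_crawl_tags_empty = true
#   debug = false
#   blacklist_fp = /home/<user>/.config/tumblr_scraper/blacklist.lst
#
#   [API]
#   maxpost = 20
#   default_img_cnt = 200
#   default_vid_cnt = 40
#   default_min_media = 20
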

class BlackList:
    ''' Hackish: the blacklist is loaded at class-definition time; a cleaner
    implementation would defer this to load time, but it works. '''
    _FP = CONFIG_FILE.BLACKLST
    ignore = set()

    if os.path.isfile(_FP):
        with open(_FP) as fd:
            for l in fd:
                data = l.strip()
                if data:
                    ignore.add(data)
    else:
        with open(_FP, 'w') as fd:
            fd.write('')

    @classmethod
    def save(cls):
        with open(CONFIG_FILE.BLACKLST, 'w') as fd:
            fd.write('\n'.join(cls.ignore))

    @classmethod
    def add(cls, user):
        cls.ignore.add(user)

    @classmethod
    def update(cls, iterable):
        cls.ignore.update(iterable)

    @classmethod
    def remove(cls, user):
        cls.ignore.remove(user)
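
# The blacklist file is one blog name per line; anything listed in it is
# skipped when scrape() discovers new blogs through shoutouts.
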
class RotatedTumblrRestClient(api.TumblrRestClient):
    '''
    A TumblrRestClient that tracks its own request usage so the rotator
    can swap clients out as they hit their limits. Per-key bookkeeping
    lives in the rotator's JSON database:
        dcnt/dts - requests counted against the daily cap, and its window start
        hcnt/hts - requests counted against the hourly cap, and its window start
    '''

    def __init__(self, db, consumer_key):
        self.db = db
        self.dlimit = CONFIG_FILE.TLIMIT
        self.reset = CONFIG_FILE.RESET
        self.hlimit = CONFIG_FILE.HLIMIT
        self.save_limit = CONFIG_FILE.SAVECNT

        self.consumer_key = consumer_key
        self.save_counter = 0
        api.TumblrRestClient.__init__(self, *self.db._data[self.consumer_key]['keys'])

    @property
    def dcnt(self):
        return self.db._data[self.consumer_key]['dcnt']

    @property
    def dts(self):
        return self.db._data[self.consumer_key]['dts']

    @property
    def hcnt(self):
        return self.db._data[self.consumer_key]['hcnt']

    @property
    def hts(self):
        return self.db._data[self.consumer_key]['hts']

    def is_ready(self):
        return self.hlimit > self.hcnt and self.dlimit > self.dcnt + self.hcnt

    def kill(self):
        # mark this key as exhausted so is_ready() fails until the windows roll
        self.db._data[self.consumer_key]['hcnt'] = self.hlimit
        self.db._data[self.consumer_key]['dcnt'] = self.dlimit

    def send_api_request(self, method, url, params={}, valid_parameters=[], needs_api_key=False):
        if self.save_counter >= self.save_limit:
            self.save_counter = 0
            self.db.save()
        else:
            self.save_counter += 1

        # once an hour has passed, fold the hourly count into the daily count
        # and open a new hourly window (these comparisons were inverted before,
        # which reset the counters on every request made inside the window)
        if time.time() >= self.hts + 60 * 60:
            self.db._data[self.consumer_key]['dcnt'] += self.db._data[self.consumer_key]['hcnt']
            self.db._data[self.consumer_key]['hts'] = time.time()
            self.db._data[self.consumer_key]['hcnt'] = 0

        if time.time() >= self.dts + self.reset:
            self.db._data[self.consumer_key]['dcnt'] = 0
            self.db._data[self.consumer_key]['dts'] = time.time()

        if self.dcnt + self.hcnt <= self.dlimit or not self.db._raise:
            if self.hcnt <= self.hlimit or not self.db._raise:
                if DEBUGGING:
                    print(method, url, params, needs_api_key)

                resp = api.TumblrRestClient.send_api_request(
                    self, method, url,
                    params, valid_parameters, needs_api_key
                )

                if resp:
                    self.db._data[self.consumer_key]['hcnt'] += 1
                return resp

            elif self.db._raise:
                raise HourLimitCap('Reached hour limit')
        elif self.db._raise:
            raise DayLimitCap('Reached day limit')


class JSONAPIrotator:
    '''
    Saves all API keys in a JSON-loadable format to
    create multiple client instances with different API keys.
    '''

    FP = CONFIG_FILE.DB

    def __init__(self, client_class=RotatedTumblrRestClient):
        self.clients = set()
        self.client_cls = client_class
        self._raise = True

        if not os.path.isfile(self.FP):
            self._data = {}  # was never initialised on first run, crashing init_clients()
            with open(self.FP, 'w') as fd:
                json.dump({}, fd)
        else:
            with open(self.FP) as fd:
                self._data = json.load(fd)

        self.init_clients()

    def init_client(self, consumer_key):
        if consumer_key not in self._data:
            raise ValueError('Consumer Key not in database!')
        client = self.client_cls(self, consumer_key)
        self.clients.add(client)
        return client

    def init_clients(self):
        for key in self._data.keys():
            self.init_client(key)

    def save(self):
        with open(self.FP, 'w') as fd:
            json.dump(self._data, fd)

    def requests_used(self):
        t = 0
        for _, v in self._data.items():
            t += v['dcnt'] + v['hcnt']
        return t

    def requests_left_today(self):
        t = 0
        for _, v in self._data.items():
            t += CONFIG_FILE.TLIMIT - v['dcnt'] - v['hcnt']
        return t

    def requests_left_inhour(self):
        t = 0
        for _, v in self._data.items():
            t += CONFIG_FILE.HLIMIT - v['hcnt']
        return t

    def add_key(self, consumer_key, consumer_secret, oauth_token, oauth_secret):
        self._data[consumer_key] = {
            'dts': time.time(),
            'hts': time.time(),
            'dcnt': 0,
            'hcnt': 0,
            'keys': [consumer_key, consumer_secret, oauth_token, oauth_secret]
        }
        return "Updated key"

    def remove_key(self, consumer_key):
        self._data.pop(consumer_key)
        return "Removed key"

    def no_raise(self):
        self._raise = False
        return "Api limits turned off...", False

    def list_keys(self):
        data = '\n\t\t'.join(k for k in self._data.keys())
        return '''
        Listing Keys...
        {}

        Total Left per 24h: {}
        Total Left per 1h: {}
        '''.format(data, self.requests_left_today(), self.requests_left_inhour()), True

    def feed_client(self):
        for x in self.clients:
            if x.is_ready():
                return x
        self.save()

        if self._raise:
            raise NoKeysInDatabase('No usable API keys: none registered, or all have hit their limits')
        else:
            return list(self.clients)[0]
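

# keys.json, as maintained by JSONAPIrotator.add_key(), maps each consumer key
# to its bookkeeping record, roughly:
#   { "<consumer-key>": { "dts": <epoch>, "hts": <epoch>, "dcnt": 0, "hcnt": 0,
#                         "keys": ["<consumer-key>", "<consumer-secret>",
#                                  "<oauth-token>", "<oauth-secret>"] } }
#
# Minimal usage sketch (placeholder key values):
#   db = JSONAPIrotator()
#   db.add_key('ck', 'cs', 'ot', 'os')
#   db.save()
#   client = db.feed_client()  # a ready RotatedTumblrRestClient

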
299
300class RestlessCrawler:
301 #######
302 # Manages the amount of pictures received from one place
303 DEFAULT_POST_CNT = CONFIG_FILE.POST_SIZE
304 DEFAULT_IMAGE_COUNT = CONFIG_FILE.IMG_CNT
305 DEFAULT_VIDEO_COUNT = CONFIG_FILE.VID_CNT
306 DEFAULT_TAGS = [""]
307 DEFAULT_CRAWL_TAGS = [""]
308 DEFAULT_MIN_MEDIA = CONFIG_FILE.MIN_CNT
309 DEFAULT_IMAGE_OFFSET = 0
310 DEFAULT_VIDEO_OFFSET = 0
311 # Lots of samples yas
312 ########
313
314 def __init__(self, user, db, tags=None, crawl_tags=None,
315 max_imgs=None, max_vids=None, minimum_media=None,
316 ioffset=None, voffset=None, ignore=False, adult=False, nsfw=False):
317
318 self.db = db
319 self.client = db.feed_client()
320 self.name = user
321 self.shoutouts = set()
322 self.media = set()
323 self.ignore = ignore
324 self.max_img = max_imgs or self.DEFAULT_IMAGE_COUNT
325 self.max_vid = max_vids or self.DEFAULT_VIDEO_COUNT
326 self.tags = tags or self.DEFAULT_TAGS
327 self.crawl_tags = crawl_tags or self.DEFAULT_CRAWL_TAGS
328 self.minimum_media = minimum_media or self.DEFAULT_MIN_MEDIA
329 self.ioffset = ioffset or self.DEFAULT_IMAGE_OFFSET
330 self.voffset = voffset or self.DEFAULT_VIDEO_OFFSET
331 self.adult = adult
332 self.nsfw = nsfw
333
334 def __eq__(self, other):
335 return isinstance(other, self.__class__) and self.name == other.name
336
337 def __hash__(self):
338 return hash(self.name)
339
340 def _rotate_api(self):
341 self.client = self.db.feed_client()
342
343 def _rotate_posts(self, limit=None, start_at=0, push_limit=True, **kwargs):
344 roll_over = False
345 i = start_at
346 if limit and start_at and push_limit:
347 limit += start_at
348
349 while limit and limit >= i:
350 jpkg = self.client.posts(self.name, offset=i, **kwargs)
351 pprint(jpkg)
352 if jpkg:
353 if 'errors' in jpkg and 'meta' in jpkg:
354 print(jpkg['meta']['msg'])
355 if jpkg['meta']['msg'].lower() == 'limit exceeded':
356 self.client._data['dcnt'] = self.client.dcnt-self.client.hcnt
357 self.client._data['hcnt'] = 0
358 self._rotate_api()
359 roll_over = True
360 break
361
362 else:
363 _use = False
364 if self.adult or self.nsfw:
365 for _filter in ['is_adult', 'is_nsfw']:
366 if self.adult and _filter in jpkg['blog'] and jpkg['blog'][_filter]:
367 _use = True
368 else:
369 _use = True
370
371 if _use:
372 i += self.DEFAULT_POST_CNT
373 if 'posts' in jpkg:
374 if jpkg['posts']:
375 for post in jpkg['posts']:
376 yield jpkg, post
377 else:
378 break
379 else:
380 break
381 else:
382 continue
383 else:
384 break
385
386 if roll_over:
387 for x in self._rotate_posts(limit, start_at=i, push_limit=False, **kwargs):
388 yield x
389
390 def scrape(self, **kwargs):
391 for deprecated in ('type', 'tag', 'offset'):
392 if deprecated in kwargs:
393 kwargs.pop(deprecated)
394
395 for tag in self.tags:
396 for catagory, limit in (('photo', self.max_img), ('video', self.max_vid)):
397 if limit:
398 for jpkg, post in self._rotate_posts(limit=limit, type=catagory,tag=tag, **kwargs):
399 if DEBUGGING:
400 pprint(jpkg)
401
402 if 'source_url' in post:
403 user = post['source_url'].split('//')[1].split('.')[0]
404 if user != self.name and not user in BlackList.ignore:
405 self.shoutouts.add(self.__class__(
406 user, self.db, self.crawl_tags, self.crawl_tags,
407 self.max_img, self.max_vid, minimum_media=self.minimum_media, adult=self.adult, nsfw=self.nsfw
408 ))
409 else:
410 self.scrape_post(post)
411 else:
412 self.scrape_post(post)
413
414 def scrape_post(self, post):
415 if post['type'] == 'video' and 'video_url' in post:
416 self.media.add(post['video_url'])
417
418 if post['type'] == 'photo':
419 self.media.update(pic['original_size']['url'] for pic in post['photos'])
420
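# scrape_post() relies on the post dicts returned by the Tumblr v2 API; the
# fields used here look roughly like:
#   photo posts: {'type': 'photo', 'photos': [{'original_size': {'url': ...}}, ...]}
#   video posts: {'type': 'video', 'video_url': ...}
# Reblogged posts may also carry a 'source_url', which scrape() above uses to
# discover new blogs to crawl.
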
def download_media(pkg, chunksize=CONFIG_FILE.DL_SIZE):
    url, to = pkg
    if url:
        # stream the response; otherwise requests buffers the whole file
        # in memory before iter_content() ever runs
        resp = requests.get(url, stream=True)
        generator = resp.iter_content(chunksize)
        first_chunk = next(generator, b'')
        if not first_chunk:
            return
        # the md5 of the first chunk doubles as filename and crude dedupe key
        file_hash = hashlib.md5(first_chunk).hexdigest()
        file_extension = url.split('.')[-1].lower()
        fp = os.path.join(to, '{}.{}'.format(file_hash, file_extension))

        if not os.path.isfile(fp):
            with open(fp, 'wb') as f:
                f.write(first_chunk)
                for chunk in generator:
                    f.write(chunk)


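# main() below fans downloads out over a process pool, one (url, directory)
# pair per task:
#   pool.map(download_media, ((url, download_to) for url in user.media))
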
def main(user, user_limit=0):
    assert isinstance(user, RestlessCrawler)

    root_user = user
    users = {root_user}
    directory = os.getcwd()
    remember = set()

    with multiprocessing.Pool(processes=CONFIG_FILE.DL_THREADS) as pool:
        closed = False
        try:
            while users:
                user = users.pop()
                remember.add(user)

                download_to = os.path.join(directory, user.name)
                user.scrape()

                # never queue a blog we've already crawled
                user.shoutouts.difference_update(remember)
                users.update(user.shoutouts)

                if user.media and not user.ignore and len(user.media) >= user.minimum_media:
                    if not os.path.isdir(download_to):
                        os.mkdir(download_to)

                    print('Scraping {} <{}>'.format(user.name, len(user.media)))
                    pool.map(download_media, ((url, download_to) for url in user.media))

                if user_limit and len(remember) >= user_limit:
                    break

        except KeyboardInterrupt:
            if not closed:
                pool.close()
                closed = True  # this flag was never set before, allowing a double close()

        if not closed:
            pool.close()
            closed = True

        pool.join()

if __name__ == '__main__':
    import sys

    class CLI:
        ''' this thing is stiff as hell '''

        HELP = """
         _______
        ( Boobs )
         -------
        o   ^__^
         o  (xx)\_______
            (__)\       )\/\\
             U  ||----w |
                ||     ||

        tumblr scraper tool for APIv2 - crawls profiles, pulling pictures and
        videos while scanning for more users
        usage: {fp} <args> <blog_name>

        Note: argument parsing is minimal and largely unvalidated, so
        double-check the options below before running.

        DESC
        CMD <ARGUMENTS> | DEFAULT

        Use "None" if you'd like to query without comparing tags
        -t  | --tags <tag1> <tag2> <tag3> | {tags_default}

        When a name gets found, use these tags
        -ct | --crawltags <tag1> <tag2> <tag3> | {tags_default}

        Limits the amount of users this application may crawl. 0 = infinite.
        -ul | --user-limit <int> | 0

        Maximum retrieval of media
        -i  | --img <int> | {img_default}
        -v  | --vid <int> | {vid_default}

        Forces it not to download ROOT BLOG CONTENTS, but instead
        iterates through them for crawl tags
        -x  | --ignore | False

        Add an API keyset for the application to use.
        --db-add <consumer-key> <consumer-secret> <oauth_token> <oauth_secret>

        Remove an API keyset from the database
        --db-del <consumer-key>

        List database keys
        --keys

        Only captures blogs marked with ...
        -a  | --adult | False
        -ns | --nsfw | False

        Fine tuning settings
        --min-media <int> | {min_data}
        --offset-images <int> | 0
        --offset-videos <int> | 0
        -il | --ignore-api-limitations | False

        Extra output
        --debug

        Show this help message.
        -h  | --help

        """.format(
            fp=os.path.basename(__file__),
            tags_default=RestlessCrawler.DEFAULT_TAGS,
            img_default=RestlessCrawler.DEFAULT_IMAGE_COUNT,
            vid_default=RestlessCrawler.DEFAULT_VIDEO_COUNT,
            min_data=RestlessCrawler.DEFAULT_MIN_MEDIA,
        )

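        # Illustrative invocations, assuming the script is saved as
        # tumblr_scraper.py (blog name "staff" is just an example):
        #   ./tumblr_scraper.py --db-add <consumer-key> <consumer-secret> <oauth-token> <oauth-secret>
        #   ./tumblr_scraper.py -t art photography -i 100 staff
        #   ./tumblr_scraper.py -x -ct gifs -ul 25 staff
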
        if len(sys.argv) == 1 or \
                '-h' in sys.argv or \
                '--help' in sys.argv:

            print(HELP)
            exit(2)

        GLOBAL_CMDS = (
            (('-ul', '--user-limit'), 'user_limit'),
        )

        DB_CMDS = (
            (('--db-add',), 'add_key'), (('--db-del',), 'remove_key'),
        )

        DB_FLAGS = (
            (('--keys',), 'list_keys'), (('--ignore-api-limitations', '-il'), 'no_raise')
        )

        CRAWL_CMDS = (
            (('-t', '--tags'), 'tags'), (('-ct', '--crawltags'), 'crawl_tags'),
            (('-i', '--img'), 'max_imgs'), (('-v', '--vid'), 'max_vids'),
            (('--min-media',), 'minimum_media'), (('--offset-images',), 'ioffset'),
            (('--offset-videos',), 'voffset')
        )
        CRAWL_FLAGS = (
            (('-x', '--ignore'), 'ignore'), (('-a', '--adult'), 'adult'), (('-ns', '--nsfw'), 'nsfw')
        )

        @classmethod
        def find_value(cls, key_tuple, kill_on_last_arg=True):
            ret = []
            for key in key_tuple:
                if key in sys.argv:
                    collect = False
                    for i, part in enumerate(sys.argv):
                        if part == key:
                            collect = True
                        else:
                            if part.startswith('-') or (kill_on_last_arg and i + 1 == len(sys.argv)):
                                collect = False

                            elif collect:
                                if key in ('-ct', '-t', '--crawltags', '--tags'):
                                    if part.lower() == "none":
                                        part = ""
                                    else:
                                        part = part.replace('_', ' ')

                                ret.append(part)

            ret = [x for x in ret if x]

            if key not in ('-ct', '-t', '--crawltags', '--tags'):
                if len(ret) > 1:
                    return ret

                elif len(ret) == 1:
                    data = ret[0]
                    if data.isdigit():
                        return int(data)
                    return data

                elif len(ret) == 0:
                    return None
            else:
                return ret

        @classmethod
        def interface(cls):
            global DEBUGGING
            db = JSONAPIrotator()

            g = {}  # options passed through to main()
            for keys, reference in cls.GLOBAL_CMDS:
                resp = cls.find_value(keys)
                if isinstance(resp, int):
                    g[reference] = resp
                elif resp:
                    g[reference] = resp

            for keys, reference in cls.DB_CMDS:
                resp = cls.find_value(keys, False)
                if resp:
                    if not isinstance(resp, list):
                        # a single value (e.g. --db-del <key>) comes back bare;
                        # unpacking a bare string would splat it into characters
                        resp = [resp]
                    if hasattr(db, reference):
                        ret = getattr(db, reference)(*resp)
                        print(ret)
                        db.save()
                        exit(0)
                    else:
                        raise ValueError('{} had no method {}'.format(db, reference))

            for keys, reference in cls.DB_FLAGS:
                for k in keys:
                    if k in sys.argv:
                        if hasattr(db, reference):
                            ret, status = getattr(db, reference)()
                            print(ret)
                            if status:
                                exit(0)
                        else:
                            raise ValueError('{} had no method {}'.format(db, reference))

            kwargs = {}
            for keys, reference in cls.CRAWL_CMDS:
                resp = cls.find_value(keys)
                if isinstance(resp, int):
                    kwargs[reference] = resp
                elif resp:
                    kwargs[reference] = resp

            if 'tags' in kwargs and 'crawl_tags' not in kwargs and CONFIG_FILE.PASS_TAGS:
                kwargs['crawl_tags'] = kwargs['tags']

            for keys, reference in cls.CRAWL_FLAGS:
                for k in keys:
                    if k in sys.argv:
                        kwargs[reference] = True
                        break

            if '--debug' in sys.argv:
                DEBUGGING = True

            user = RestlessCrawler(sys.argv[-1], db, **kwargs)
            main(user, **g)
            db.save()

    CLI.interface()