#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Github.com/Skarlett
__version__ = '1.6.12'
__author__ = 'Skarlett'
##################
# Tumblr APIv2 Basic Util Scraper
import pytumblr as api
import json
import os
import time
import requests
import hashlib
import pwd
import configparser
import multiprocessing  # moar power

pprint = lambda obj: print(json.dumps(obj, separators=(',', ': '), indent=4, sort_keys=True))
getusername = lambda: pwd.getpwuid(os.getuid())[0]
DEBUGGING = False

##############
# Exceptions
#####
class TooManyRequestsWarning(Warning):
    pass


class NoKeysInDatabase(ValueError):
    pass


class HourLimitCap(TooManyRequestsWarning):
    pass


class DayLimitCap(TooManyRequestsWarning):
    pass


class CONFIG_FILE:
    ''' Load the configuration, writing a default config file on first run. '''
    DIR = "/home/{}/.config/tumblr_scraper/".format(getusername())
    FP = os.path.join(DIR, "tumblr_scraper.conf")
    _CONFIG = configparser.ConfigParser()

    __base = '/home/{}/.config'.format(getusername())
    for d in [__base, __base + '/tumblr_scraper']:
        if not os.path.isdir(d):
            os.mkdir(d)
    del __base

    if not os.path.isfile(FP):
        # configparser only accepts string values, so everything is stringified
        _CONFIG['APP'] = {
            'jsondb': os.path.join(DIR, 'keys.json'),
            'reset': str(24 * 60 * 60),
            'total_limit': '5000',
            'hour_limit': '1000',
            'save_every_x_requests': '50',
            'download_chunk_size': '4096',
            'download_threads': '4',
            'pass_tags_to_crawl_tags_if_crawl_tags_empty': 'true',
            'debug': 'false',
            'blacklist_fp': os.path.join(DIR, 'blacklist.lst')
        }
        _CONFIG['API'] = {
            'maxpost': '20',
            'default_img_cnt': '200',
            'default_vid_cnt': '40',
            'default_min_media': '20'
        }
        with open(FP, 'w') as cf:
            _CONFIG.write(cf)

        print('configuration file written to {} - exiting, please restart.'.format(FP))
        exit(0)
    else:
        # the file previously was never read before the get() calls, so every
        # value silently fell back to its default; section names must also
        # match the written 'APP'/'API' (configparser sections are case-sensitive)
        _CONFIG.read(FP)
        DB = _CONFIG.get('APP', 'jsondb', fallback=os.path.join(DIR, 'keys.json'))
        BLACKLST = _CONFIG.get('APP', 'blacklist_fp', fallback=os.path.join(DIR, 'blacklist.lst'))
        RESET = _CONFIG.getint('APP', 'reset', fallback=24 * 60 * 60)
        TLIMIT = _CONFIG.getint('APP', 'total_limit', fallback=5000)
        HLIMIT = _CONFIG.getint('APP', 'hour_limit', fallback=1000)
        DL_SIZE = _CONFIG.getint('APP', 'download_chunk_size', fallback=4096)
        DL_THREADS = _CONFIG.getint('APP', 'download_threads', fallback=2)
        SAVECNT = _CONFIG.getint('APP', 'save_every_x_requests', fallback=200)
        PASS_TAGS = _CONFIG.getboolean('APP', 'pass_tags_to_crawl_tags_if_crawl_tags_empty', fallback=True)
        DEBUG = _CONFIG.getboolean('APP', 'debug', fallback=False)

        POST_SIZE = _CONFIG.getint('API', 'maxpost', fallback=20)
        IMG_CNT = _CONFIG.getint('API', 'default_img_cnt', fallback=200)
        VID_CNT = _CONFIG.getint('API', 'default_vid_cnt', fallback=40)
        MIN_CNT = _CONFIG.getint('API', 'default_min_media', fallback=20)

        if DEBUG:
            global DEBUGGING
            DEBUGGING = True

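# For reference, the generated tumblr_scraper.conf looks roughly like this
# (paths depend on your user; values are the defaults written above):
#
#   [APP]
#   jsondb = /home/<user>/.config/tumblr_scraper/keys.json
#   reset = 86400
#   total_limit = 5000
#   hour_limit = 1000
#   save_every_x_requests = 50
#   download_chunk_size = 4096
#   download_threads = 4
#   pass_tags_to_crawl_tags_if_crawl_tags_empty = true
#   debug = false
#   blacklist_fp = /home/<user>/.config/tumblr_scraper/blacklist.lst
#
#   [API]
#   maxpost = 20
#   default_img_cnt = 200
#   default_vid_cnt = 40
#   default_min_media = 20
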

class BlackList:
    ''' Hackish: the blacklist is loaded at class-definition time; a cleaner
    implementation would defer this to load time, but it works. '''
    _FP = CONFIG_FILE.BLACKLST
    ignore = set()

    if os.path.isfile(_FP):
        with open(_FP) as fd:
            for l in fd:
                data = l.strip()
                if data:
                    ignore.add(data)
    else:
        with open(_FP, 'w') as fd:
            fd.write('')

    @classmethod
    def save(cls):
        with open(CONFIG_FILE.BLACKLST, 'w') as fd:
            fd.write('\n'.join(cls.ignore))

    @classmethod
    def add(cls, user):
        cls.ignore.add(user)

    @classmethod
    def update(cls, iterable):
        cls.ignore.update(iterable)

    @classmethod
    def remove(cls, user):
        cls.ignore.remove(user)
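
# The blacklist file is one blog name per line; anything listed in it is
# skipped when scrape() discovers new blogs through shoutouts.
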
class RotatedTumblrRestClient(api.TumblrRestClient):
    '''
    A TumblrRestClient that tracks its own request usage so the rotator
    can swap clients out as they hit their limits. Per-key bookkeeping
    lives in the rotator's JSON database:
        dcnt/dts - requests counted against the daily cap, and its window start
        hcnt/hts - requests counted against the hourly cap, and its window start
    '''

    def __init__(self, db, consumer_key):
        self.db = db
        self.dlimit = CONFIG_FILE.TLIMIT
        self.reset = CONFIG_FILE.RESET
        self.hlimit = CONFIG_FILE.HLIMIT
        self.save_limit = CONFIG_FILE.SAVECNT

        self.consumer_key = consumer_key
        self.save_counter = 0
        api.TumblrRestClient.__init__(self, *self.db._data[self.consumer_key]['keys'])

    @property
    def dcnt(self):
        return self.db._data[self.consumer_key]['dcnt']

    @property
    def dts(self):
        return self.db._data[self.consumer_key]['dts']

    @property
    def hcnt(self):
        return self.db._data[self.consumer_key]['hcnt']

    @property
    def hts(self):
        return self.db._data[self.consumer_key]['hts']

    def is_ready(self):
        return self.hlimit > self.hcnt and self.dlimit > self.dcnt + self.hcnt

    def kill(self):
        # mark this key as exhausted so is_ready() fails until the windows roll
        self.db._data[self.consumer_key]['hcnt'] = self.hlimit
        self.db._data[self.consumer_key]['dcnt'] = self.dlimit

    def send_api_request(self, method, url, params={}, valid_parameters=[], needs_api_key=False):
        if self.save_counter >= self.save_limit:
            self.save_counter = 0
            self.db.save()
        else:
            self.save_counter += 1

        # once an hour has passed, fold the hourly count into the daily count
        # and open a new hourly window (these comparisons were inverted before,
        # which reset the counters on every request made inside the window)
        if time.time() >= self.hts + 60 * 60:
            self.db._data[self.consumer_key]['dcnt'] += self.db._data[self.consumer_key]['hcnt']
            self.db._data[self.consumer_key]['hts'] = time.time()
            self.db._data[self.consumer_key]['hcnt'] = 0

        if time.time() >= self.dts + self.reset:
            self.db._data[self.consumer_key]['dcnt'] = 0
            self.db._data[self.consumer_key]['dts'] = time.time()

        if self.dcnt + self.hcnt <= self.dlimit or not self.db._raise:
            if self.hcnt <= self.hlimit or not self.db._raise:
                if DEBUGGING:
                    print(method, url, params, needs_api_key)

                resp = api.TumblrRestClient.send_api_request(
                    self, method, url,
                    params, valid_parameters, needs_api_key
                )

                if resp:
                    self.db._data[self.consumer_key]['hcnt'] += 1
                return resp

            elif self.db._raise:
                raise HourLimitCap('Reached hour limit')
        elif self.db._raise:
            raise DayLimitCap('Reached day limit')


class JSONAPIrotator:
    '''
    Saves all API keys in a JSON-loadable format to
    create multiple client instances with different API keys.
    '''

    FP = CONFIG_FILE.DB

    def __init__(self, client_class=RotatedTumblrRestClient):
        self.clients = set()
        self.client_cls = client_class
        self._raise = True

        if not os.path.isfile(self.FP):
            self._data = {}  # was never initialised on first run, crashing init_clients()
            with open(self.FP, 'w') as fd:
                json.dump({}, fd)
        else:
            with open(self.FP) as fd:
                self._data = json.load(fd)

        self.init_clients()

    def init_client(self, consumer_key):
        if consumer_key not in self._data:
            raise ValueError('Consumer Key not in database!')
        client = self.client_cls(self, consumer_key)
        self.clients.add(client)
        return client

    def init_clients(self):
        for key in self._data.keys():
            self.init_client(key)

    def save(self):
        with open(self.FP, 'w') as fd:
            json.dump(self._data, fd)

    def requests_used(self):
        t = 0
        for _, v in self._data.items():
            t += v['dcnt'] + v['hcnt']
        return t

    def requests_left_today(self):
        t = 0
        for _, v in self._data.items():
            t += CONFIG_FILE.TLIMIT - v['dcnt'] - v['hcnt']
        return t

    def requests_left_inhour(self):
        t = 0
        for _, v in self._data.items():
            t += CONFIG_FILE.HLIMIT - v['hcnt']
        return t

    def add_key(self, consumer_key, consumer_secret, oauth_token, oauth_secret):
        self._data[consumer_key] = {
            'dts': time.time(),
            'hts': time.time(),
            'dcnt': 0,
            'hcnt': 0,
            'keys': [consumer_key, consumer_secret, oauth_token, oauth_secret]
        }
        return "Updated key"

    def remove_key(self, consumer_key):
        self._data.pop(consumer_key)
        return "Removed key"

    def no_raise(self):
        self._raise = False
        return "Api limits turned off...", False

    def list_keys(self):
        data = '\n\t\t'.join(k for k in self._data.keys())
        return '''
        Listing Keys...
        {}

        Total Left per 24h: {}
        Total Left per 1h: {}
        '''.format(data, self.requests_left_today(), self.requests_left_inhour()), True

    def feed_client(self):
        for x in self.clients:
            if x.is_ready():
                return x
        self.save()

        if self._raise:
            raise NoKeysInDatabase('No usable API keys: none registered, or all have hit their limits')
        else:
            return list(self.clients)[0]
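

# keys.json, as maintained by JSONAPIrotator.add_key(), maps each consumer key
# to its bookkeeping record, roughly:
#   { "<consumer-key>": { "dts": <epoch>, "hts": <epoch>, "dcnt": 0, "hcnt": 0,
#                         "keys": ["<consumer-key>", "<consumer-secret>",
#                                  "<oauth-token>", "<oauth-secret>"] } }
#
# Minimal usage sketch (placeholder key values):
#   db = JSONAPIrotator()
#   db.add_key('ck', 'cs', 'ot', 'os')
#   db.save()
#   client = db.feed_client()  # a ready RotatedTumblrRestClient

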
299
300class RestlessCrawler:
301 #######
302 # Manages the amount of pictures received from one place
303 DEFAULT_POST_CNT = CONFIG_FILE.POST_SIZE
304 DEFAULT_IMAGE_COUNT = CONFIG_FILE.IMG_CNT
305 DEFAULT_VIDEO_COUNT = CONFIG_FILE.VID_CNT
306 DEFAULT_TAGS = [""]
307 DEFAULT_CRAWL_TAGS = [""]
308 DEFAULT_MIN_MEDIA = CONFIG_FILE.MIN_CNT
309 DEFAULT_IMAGE_OFFSET = 0
310 DEFAULT_VIDEO_OFFSET = 0
311 # Lots of samples yas
312 ########
313
314 def __init__(self, user, db, tags=None, crawl_tags=None,
315 max_imgs=None, max_vids=None, minimum_media=None,
316 ioffset=None, voffset=None, ignore=False, adult=False, nsfw=False):
317
318 self.db = db
319 self.client = db.feed_client()
320 self.name = user
321 self.shoutouts = set()
322 self.media = set()
323 self.ignore = ignore
324 self.max_img = max_imgs or self.DEFAULT_IMAGE_COUNT
325 self.max_vid = max_vids or self.DEFAULT_VIDEO_COUNT
326 self.tags = tags or self.DEFAULT_TAGS
327 self.crawl_tags = crawl_tags or self.DEFAULT_CRAWL_TAGS
328 self.minimum_media = minimum_media or self.DEFAULT_MIN_MEDIA
329 self.ioffset = ioffset or self.DEFAULT_IMAGE_OFFSET
330 self.voffset = voffset or self.DEFAULT_VIDEO_OFFSET
331 self.adult = adult
332 self.nsfw = nsfw
333
334 def __eq__(self, other):
335 return isinstance(other, self.__class__) and self.name == other.name
336
337 def __hash__(self):
338 return hash(self.name)
339
340 def _rotate_api(self):
341 self.client = self.db.feed_client()
342
343 def _rotate_posts(self, limit=None, start_at=0, push_limit=True, **kwargs):
344 roll_over = False
345 i = start_at
346 if limit and start_at and push_limit:
347 limit += start_at
348
349 while limit and limit >= i:
350 jpkg = self.client.posts(self.name, offset=i, **kwargs)
351 pprint(jpkg)
352 if jpkg:
353 if 'errors' in jpkg and 'meta' in jpkg:
354 print(jpkg['meta']['msg'])
355 if jpkg['meta']['msg'].lower() == 'limit exceeded':
356 self.client._data['dcnt'] = self.client.dcnt-self.client.hcnt
357 self.client._data['hcnt'] = 0
358 self._rotate_api()
359 roll_over = True
360 break
361
362 else:
363 _use = False
364 if self.adult or self.nsfw:
365 for _filter in ['is_adult', 'is_nsfw']:
366 if self.adult and _filter in jpkg['blog'] and jpkg['blog'][_filter]:
367 _use = True
368 else:
369 _use = True
370
371 if _use:
372 i += self.DEFAULT_POST_CNT
373 if 'posts' in jpkg:
374 if jpkg['posts']:
375 for post in jpkg['posts']:
376 yield jpkg, post
377 else:
378 break
379 else:
380 break
381 else:
382 continue
383 else:
384 break
385
386 if roll_over:
387 for x in self._rotate_posts(limit, start_at=i, push_limit=False, **kwargs):
388 yield x
389
390 def scrape(self, **kwargs):
391 for deprecated in ('type', 'tag', 'offset'):
392 if deprecated in kwargs:
393 kwargs.pop(deprecated)
394
395 for tag in self.tags:
396 for catagory, limit in (('photo', self.max_img), ('video', self.max_vid)):
397 if limit:
398 for jpkg, post in self._rotate_posts(limit=limit, type=catagory,tag=tag, **kwargs):
399 if DEBUGGING:
400 pprint(jpkg)
401
402 if 'source_url' in post:
403 user = post['source_url'].split('//')[1].split('.')[0]
404 if user != self.name and not user in BlackList.ignore:
405 self.shoutouts.add(self.__class__(
406 user, self.db, self.crawl_tags, self.crawl_tags,
407 self.max_img, self.max_vid, minimum_media=self.minimum_media, adult=self.adult, nsfw=self.nsfw
408 ))
409 else:
410 self.scrape_post(post)
411 else:
412 self.scrape_post(post)
413
414 def scrape_post(self, post):
415 if post['type'] == 'video' and 'video_url' in post:
416 self.media.add(post['video_url'])
417
418 if post['type'] == 'photo':
419 self.media.update(pic['original_size']['url'] for pic in post['photos'])
420
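# scrape_post() relies on the post dicts returned by the Tumblr v2 API; the
# fields used here look roughly like:
#   photo posts: {'type': 'photo', 'photos': [{'original_size': {'url': ...}}, ...]}
#   video posts: {'type': 'video', 'video_url': ...}
# Reblogged posts may also carry a 'source_url', which scrape() above uses to
# discover new blogs to crawl.
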
def download_media(pkg, chunksize=CONFIG_FILE.DL_SIZE):
    url, to = pkg
    if url:
        # stream the response; otherwise requests buffers the whole file
        # in memory before iter_content() ever runs
        resp = requests.get(url, stream=True)
        generator = resp.iter_content(chunksize)
        first_chunk = next(generator, b'')
        if not first_chunk:
            return
        # the md5 of the first chunk doubles as filename and crude dedupe key
        file_hash = hashlib.md5(first_chunk).hexdigest()
        file_extension = url.split('.')[-1].lower()
        fp = os.path.join(to, '{}.{}'.format(file_hash, file_extension))

        if not os.path.isfile(fp):
            with open(fp, 'wb') as f:
                f.write(first_chunk)
                for chunk in generator:
                    f.write(chunk)


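# main() below fans downloads out over a process pool, one (url, directory)
# pair per task:
#   pool.map(download_media, ((url, download_to) for url in user.media))
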
def main(user, user_limit=0):
    assert isinstance(user, RestlessCrawler)

    root_user = user
    users = {root_user}
    directory = os.getcwd()
    remember = set()

    with multiprocessing.Pool(processes=CONFIG_FILE.DL_THREADS) as pool:
        closed = False
        try:
            while users:
                user = users.pop()
                remember.add(user)

                download_to = os.path.join(directory, user.name)
                user.scrape()

                # never queue a blog we've already crawled
                user.shoutouts.difference_update(remember)
                users.update(user.shoutouts)

                if user.media and not user.ignore and len(user.media) >= user.minimum_media:
                    if not os.path.isdir(download_to):
                        os.mkdir(download_to)

                    print('Scraping {} <{}>'.format(user.name, len(user.media)))
                    pool.map(download_media, ((url, download_to) for url in user.media))

                if user_limit and len(remember) >= user_limit:
                    break

        except KeyboardInterrupt:
            if not closed:
                pool.close()
                closed = True  # this flag was never set before, allowing a double close()

        if not closed:
            pool.close()
            closed = True

        pool.join()

if __name__ == '__main__':
    import sys

    class CLI:
        ''' this thing is stiff as hell '''

        HELP = """
         _______
        ( Boobs )
         -------
        o   ^__^
         o  (xx)\_______
            (__)\       )\/\\
             U  ||----w |
                ||     ||

        tumblr scraper tool for APIv2 - crawls profiles, pulling pictures and
        videos while scanning for more users
        usage: {fp} <args> <blog_name>

        Note: argument parsing is minimal and largely unvalidated, so
        double-check the options below before running.

        DESC
        CMD <ARGUMENTS> | DEFAULT

        Use "None" if you'd like to query without comparing tags
        -t  | --tags <tag1> <tag2> <tag3> | {tags_default}

        When a name gets found, use these tags
        -ct | --crawltags <tag1> <tag2> <tag3> | {tags_default}

        Limits the amount of users this application may crawl. 0 = infinite.
        -ul | --user-limit <int> | 0

        Maximum retrieval of media
        -i  | --img <int> | {img_default}
        -v  | --vid <int> | {vid_default}

        Forces it not to download ROOT BLOG CONTENTS, but instead
        iterates through them for crawl tags
        -x  | --ignore | False

        Add an API keyset for the application to use.
        --db-add <consumer-key> <consumer-secret> <oauth_token> <oauth_secret>

        Remove an API keyset from the database
        --db-del <consumer-key>

        List database keys
        --keys

        Only captures blogs marked with ...
        -a  | --adult | False
        -ns | --nsfw | False

        Fine tuning settings
        --min-media <int> | {min_data}
        --offset-images <int> | 0
        --offset-videos <int> | 0
        -il | --ignore-api-limitations | False

        Extra output
        --debug

        Show this help message.
        -h  | --help

        """.format(
            fp=os.path.basename(__file__),
            tags_default=RestlessCrawler.DEFAULT_TAGS,
            img_default=RestlessCrawler.DEFAULT_IMAGE_COUNT,
            vid_default=RestlessCrawler.DEFAULT_VIDEO_COUNT,
            min_data=RestlessCrawler.DEFAULT_MIN_MEDIA,
        )

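        # Illustrative invocations, assuming the script is saved as
        # tumblr_scraper.py (blog name "staff" is just an example):
        #   ./tumblr_scraper.py --db-add <consumer-key> <consumer-secret> <oauth-token> <oauth-secret>
        #   ./tumblr_scraper.py -t art photography -i 100 staff
        #   ./tumblr_scraper.py -x -ct gifs -ul 25 staff
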
        if len(sys.argv) == 1 or \
                '-h' in sys.argv or \
                '--help' in sys.argv:

            print(HELP)
            exit(2)

        GLOBAL_CMDS = (
            (('-ul', '--user-limit'), 'user_limit'),
        )

        DB_CMDS = (
            (('--db-add',), 'add_key'), (('--db-del',), 'remove_key'),
        )

        DB_FLAGS = (
            (('--keys',), 'list_keys'), (('--ignore-api-limitations', '-il'), 'no_raise')
        )

        CRAWL_CMDS = (
            (('-t', '--tags'), 'tags'), (('-ct', '--crawltags'), 'crawl_tags'),
            (('-i', '--img'), 'max_imgs'), (('-v', '--vid'), 'max_vids'),
            (('--min-media',), 'minimum_media'), (('--offset-images',), 'ioffset'),
            (('--offset-videos',), 'voffset')
        )
        CRAWL_FLAGS = (
            (('-x', '--ignore'), 'ignore'), (('-a', '--adult'), 'adult'), (('-ns', '--nsfw'), 'nsfw')
        )

        @classmethod
        def find_value(cls, key_tuple, kill_on_last_arg=True):
            ret = []
            for key in key_tuple:
                if key in sys.argv:
                    collect = False
                    for i, part in enumerate(sys.argv):
                        if part == key:
                            collect = True
                        else:
                            if part.startswith('-') or (kill_on_last_arg and i + 1 == len(sys.argv)):
                                collect = False

                            elif collect:
                                if key in ('-ct', '-t', '--crawltags', '--tags'):
                                    if part.lower() == "none":
                                        part = ""
                                    else:
                                        part = part.replace('_', ' ')

                                ret.append(part)

            ret = [x for x in ret if x]

            if key not in ('-ct', '-t', '--crawltags', '--tags'):
                if len(ret) > 1:
                    return ret

                elif len(ret) == 1:
                    data = ret[0]
                    if data.isdigit():
                        return int(data)
                    return data

                elif len(ret) == 0:
                    return None
            else:
                return ret

        @classmethod
        def interface(cls):
            global DEBUGGING
            db = JSONAPIrotator()

            g = {}  # options passed through to main()
            for keys, reference in cls.GLOBAL_CMDS:
                resp = cls.find_value(keys)
                if isinstance(resp, int):
                    g[reference] = resp
                elif resp:
                    g[reference] = resp

            for keys, reference in cls.DB_CMDS:
                resp = cls.find_value(keys, False)
                if resp:
                    if not isinstance(resp, list):
                        # a single value (e.g. --db-del <key>) comes back bare;
                        # unpacking a bare string would splat it into characters
                        resp = [resp]
                    if hasattr(db, reference):
                        ret = getattr(db, reference)(*resp)
                        print(ret)
                        db.save()
                        exit(0)
                    else:
                        raise ValueError('{} had no method {}'.format(db, reference))

            for keys, reference in cls.DB_FLAGS:
                for k in keys:
                    if k in sys.argv:
                        if hasattr(db, reference):
                            ret, status = getattr(db, reference)()
                            print(ret)
                            if status:
                                exit(0)
                        else:
                            raise ValueError('{} had no method {}'.format(db, reference))

            kwargs = {}
            for keys, reference in cls.CRAWL_CMDS:
                resp = cls.find_value(keys)
                if isinstance(resp, int):
                    kwargs[reference] = resp
                elif resp:
                    kwargs[reference] = resp

            if 'tags' in kwargs and 'crawl_tags' not in kwargs and CONFIG_FILE.PASS_TAGS:
                kwargs['crawl_tags'] = kwargs['tags']

            for keys, reference in cls.CRAWL_FLAGS:
                for k in keys:
                    if k in sys.argv:
                        kwargs[reference] = True
                        break

            if '--debug' in sys.argv:
                DEBUGGING = True

            user = RestlessCrawler(sys.argv[-1], db, **kwargs)
            main(user, **g)
            db.save()

    CLI.interface()