import requests  # has built-in json!
from urllib import request  # query imgur
import time  # for rate-limiting
import os.path  # write files locally
import json  # wanted to parse this by hand instead of importing it, but the ':' in urls made it too much work

# hastily written by: https://github.com/jonobrien
# doing this now because:
# I thought of it
# reddit api queries limit a subreddit post 'listing' to a max size of 1000
# no real interest in learning advanced reddit searches
# not sure if reddit hardcodes a limit on how much previously posted data you can
# reach via scrolling/retrieving searched posts

# I limited this to just imgur searches,
# but it could/SHOULD be further extended/properly implemented for usability as needed
# obviously only the subreddit url requested for gets searched

# you can turn any reddit listing into json by adding '.json' to the search query (see below)
# https://github.com/reddit/reddit/wiki/JSON#listing
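# e.g. https://www.reddit.com/r/Eve/new/ becomes https://www.reddit.com/r/Eve/new.json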

# get the newest posts to the subreddit; the newest listing has a NULL 'before',
# so newer posts can be verified with the 'after' key/val
# the 'before' key is irrelevant here since we get the newest posts first anyway
baseQuery = 'https://www.reddit.com/r/Eve/search.json?q=site%3Aimgur.com&restrict_sr=on&sort=new&t=all'  # newest
# query for older posts by appending the 'after' parameter to the same request
olderPosts = 'https://www.reddit.com/r/Eve/search.json?q=site%3Aimgur.com&restrict_sr=on&sort=new&t=all'  # append afterParam + ['after'] for more posts
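# a listing response looks roughly like this (most keys trimmed):
# {'kind': 'Listing', 'data': {'children': [...], 'after': 't3_xxxxxx', 'before': None}}
# feeding data['after'] back in via '&after=' pages through progressively older results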
# reddit api needs a custom user-agent header
redHdr = {
    'User-Agent': 'reddit-imgur-parser 1.0',
    'Connection': 'close'
}
# register an application with imgur and get the client ID and secret:
imgurID = 'registered id here'
imgurSecret = 'registered secret here'
# TODO -- query imgur properly and download the images, maybe folders by date...imgur albums and gallery pics
# should make a new session to get all imgur links dynamically from the reddit posts
imgHdr = {  # must be a dict (not a set) for requests to send it as a header
    'Authorization': 'Client-ID ' + imgurID
}
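# untested sketch for the album TODOs below, assuming the imgur v3 api:
# GET https://api.imgur.com/3/album/{albumHash}/images with the Client-ID header
# should return {'data': [{'link': ...}, ...]} -- double-check against the imgur docs
def albumImageLinks(albumHash):
    r = requests.get('https://api.imgur.com/3/album/' + albumHash + '/images', headers=imgHdr)
    r.raise_for_status()  # surface 4xx/5xx instead of parsing an error body
    return [img['link'] for img in r.json()['data']]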
# TODO -- clean up all these variables
jsonBackup = {}  # all the data retrieved, save it, why not
jsonBackup['allUrls'] = {}  # a separated dictionary for ease of use, why not
newUrls = {}  # all the urls for debugging and comparison
errorUrls = []  # these urls caused 404 errors or could not be saved
albumUrls = {}  # TODO -- use the imgur api to get all album images (see the albumImageLinks sketch above)
sess = requests.Session()  # session used to query reddit for json
pageCount = 1  # the reddit page being parsed
afterParam = '&after='  # query string for paging
beforeTs = None  # previous post
afterTs = None  # next post
resp = None  # the json from that reddit query


# cleaned up all the replacements for reuse
def cleanUrl(url):
    return (url.replace('http://', '').replace('https://', '')
               .replace('i.imgur.com/', '').replace('imgur.com/', '').replace('m.imgur.com/', '')
               .replace('gallery/', '').replace('?', '').replace('/new', ''))
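# e.g. cleanUrl('https://i.imgur.com/aBcDeFg.jpg') -> 'aBcDeFg.jpg' (made-up id)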


# load data from any previous run so already-saved links can be skipped
print('parsing any previous data')
previousRunData = None
with open("prevRun.txt", mode='a+', encoding='utf-8') as f:
    f.seek(0)
    try:  # has been run before
        previousRunData = json.loads(f.read())
        print('has previous data')
    except json.decoder.JSONDecodeError:  # set up the obj on first use
        previousRunData = {}
        previousRunData['allUrls'] = {}
        print('created initial data for first use')
print('done')
#### print(str(previousRunData))


print('\ngetting recent /r/eve imgur posts\n')
# GET the last pages on /r/eve that have imgur links
# TODO -- add input args, eliminate all the hardcoding
while pageCount <= 300:  # upper bound only; reddit caps search listings well before this (~1000 results)
    time.sleep(1)  # reddit rate-limits to ~60 req/min; 1s between requests stays safely under it
    if afterTs:  # sequential posts after the initial GET
        ##### print(beforeTs) # assuming a successful GET this works fine
        resp = sess.get(olderPosts + afterParam + afterTs, headers=redHdr).json()['data']
    else:  # initial GET for the newest posts
        resp = sess.get(baseQuery, headers=redHdr).json()['data']
    # parse all the things
    jsonBackup[str(pageCount)] = {}
    jsonBackup[str(pageCount)]['resp'] = resp
    #### print(str(resp).encode('utf-8'))
    page = resp['children']
    beforeTs = resp['before']  # don't actually need it, as previously thought
    afterTs = resp['after']
    #### print(afterTs)
    for post in page:
        ##### print(post['data']['url'])
        # TODO - refactor and get each image from imgur directly via authed api calls, e.g.
        # with open('stripped-down-id.jpg', 'wb') as f:
        #     f.write(request.urlopen('imgur-link.jpg').read())

        # ensure no duplicate links and tie url and reddit post together for sanity
        jsonBackup['allUrls'][post['data']['url']] = post['data']['title']  # {'url': 'title'}
        newUrls[post['data']['url']] = post['data']['url']
    pageCount += 1
    if afterTs is None:  # ran out of results; don't loop back around onto the base query
        break

sess.close()  # done getting that pesky info
print(str(pageCount - 1) + ' pages retrieved and stored')  # pageCount was incremented past the last page


###### print(jsonBackup['allUrls'])
print()
print('parsing urls')
114""" delete when cleaned up and verified more
115savedUrls = {}
116with open("urlsSaved.txt", encoding='utf-8') as f:
117 savedUrls['saved'] = f.read()
118# convert file str to dict with no dependencies...
119for kv in savedUrls['saved'].split(','):
120 print(kv.split(':'))
121"""


# TODO -- move this into the while loop
# get all the images from imgur
os.makedirs('./imgurEVE', exist_ok=True)  # make sure the output dir exists before writing into it
for url in newUrls.values():
    if url in previousRunData['allUrls']:  # checks for key existence in the dict
        print('already downloaded: ' + str(url))
        continue  # go to the next url, skip the parsing
    # no else needed after the continue
    urlFixed = url
    imageName = None
    if '/a/' in url:
        print('ignored album: ' + url)
        albumUrls[url] = url
        # TODO -- get the api list of all the album images
        continue
    else:
        imageName = cleanUrl(url)
        if '.' in imageName:  # has an extension already, can be downloaded properly, mostly
            imageName = imageName[:11]  # 7-char imgur id plus '.ext'
        else:
            imageName = imageName[:7]  # bare 7-char imgur id
            imageName += '.gif'  # a static 'gif' is still served as a png/jpg anyway...safest type to guess
            urlFixed = urlFixed.replace('/gallery/', '/').replace('/new', '')  # 'gallery' in the url breaks it too; keep a slash between host and id
            urlFixed += '.gif'
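            # e.g. 'https://imgur.com/aBcDeFg' -> imageName 'aBcDeFg.gif' and
            # urlFixed 'https://imgur.com/aBcDeFg.gif' (made-up id for illustration)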
    print('retrieving: ' + imageName)
    try:
        path = os.path.join('./imgurEVE', imageName)
        with open(path, mode='wb') as f:
            f.write(request.urlopen(urlFixed).read())
    except Exception as e:
        print('\n[!!] exception, usually a 404')
        print('imageName, urlFixed, url:')
        print(imageName)
        print(urlFixed)
        print(url)
        errorUrls.append(url)
        print(e)
        print()

# note: urls skipped as already downloaded still count toward the newUrls total
print('\ndone saving images')
print('    new image urls total: ' + str(len(newUrls)))
print('  the num albums skipped: ' + str(len(albumUrls)))
print(' individual images saved: ' + str(len(newUrls) - len(albumUrls) - len(errorUrls)))

# merge the old urls back in so the already-downloaded check keeps working across runs,
# then overwrite prevRun.txt with this run's data
jsonBackup['allUrls'].update(previousRunData['allUrls'])
print('writing this run to prevRun.txt')
with open("prevRun.txt", mode='w', encoding='utf-8') as f:
    f.write(json.dumps(jsonBackup))
print('done')
print('manually save these albums:')
for url in albumUrls:
    print(url)
print('\nthese urls broke and should be looked into and downloaded manually:')
for url in errorUrls:
    print(url)
print('done for real')