import requests  # has built-in json!
from urllib import request  # query imgur
import time  # for rate-limiting
import os.path  # write files locally
import json  # wanted to parse this by hand instead of importing it, but the ':' in urls made it too much work

# hastily written by: https://github.com/jonobrien
# doing this now because:
# I thought of it
# reddit api queries limit a subreddit post 'listing' to a max size of 1000
# no real interest in learning advanced reddit searches
# not sure if reddit hardcodes a limit on how much previously posted data you can
# reach via scrolling/retrieving searched posts

# I limited this to just imgur searches,
# but it could/SHOULD be further extended/properly implemented for usability as needed
# obviously only the subreddit url requested for gets searched

# you can turn any reddit listing into json by adding '.json' to the search query (see below)
# https://github.com/reddit/reddit/wiki/JSON#listing
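# e.g. https://www.reddit.com/r/Eve/new/ becomes https://www.reddit.com/r/Eve/new.json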

# get the newest posts to the subreddit; the newest listing has a NULL 'before',
# so newer posts can be verified with the 'after' key/val
# the 'before' key is irrelevant here since we get the newest posts first anyway
baseQuery = 'https://www.reddit.com/r/Eve/search.json?q=site%3Aimgur.com&restrict_sr=on&sort=new&t=all'  # newest
# query for older posts by appending the 'after' parameter to the same request
olderPosts = 'https://www.reddit.com/r/Eve/search.json?q=site%3Aimgur.com&restrict_sr=on&sort=new&t=all'  # append afterParam + ['after'] for more posts
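# a listing response looks roughly like this (most keys trimmed):
# {'kind': 'Listing', 'data': {'children': [...], 'after': 't3_xxxxxx', 'before': None}}
# feeding data['after'] back in via '&after=' pages through progressively older results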
# reddit api needs a custom user-agent header
redHdr = {
    'User-Agent': 'reddit-imgur-parser 1.0',
    'Connection': 'close'
}
# register an application with imgur and get the client ID and secret:
imgurID = 'registered id here'
imgurSecret = 'registered secret here'
# TODO -- query imgur properly and download the images, maybe folders by date...imgur albums and gallery pics
# should make a new session to get all imgur links dynamically from the reddit posts
imgHdr = {  # must be a dict (not a set) for requests to send it as a header
    'Authorization': 'Client-ID ' + imgurID
}
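# untested sketch for the album TODOs below, assuming the imgur v3 api:
# GET https://api.imgur.com/3/album/{albumHash}/images with the Client-ID header
# should return {'data': [{'link': ...}, ...]} -- double-check against the imgur docs
def albumImageLinks(albumHash):
    r = requests.get('https://api.imgur.com/3/album/' + albumHash + '/images', headers=imgHdr)
    r.raise_for_status()  # surface 4xx/5xx instead of parsing an error body
    return [img['link'] for img in r.json()['data']]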
# TODO -- clean up all these variables
jsonBackup = {}  # all the data retrieved, save it, why not
jsonBackup['allUrls'] = {}  # a separated dictionary for ease of use, why not
newUrls = {}  # all the urls for debugging and comparison
errorUrls = []  # these urls caused 404 errors or could not be saved
albumUrls = {}  # TODO -- use the imgur api to get all album images (see the albumImageLinks sketch above)
sess = requests.Session()  # session used to query reddit for json
pageCount = 1  # the reddit page being parsed
afterParam = '&after='  # query string for paging
beforeTs = None  # previous post
afterTs = None  # next post
resp = None  # the json from that reddit query


# cleaned up all the replacements for reuse
def cleanUrl(url):
    return (url.replace('http://', '').replace('https://', '')
               .replace('i.imgur.com/', '').replace('imgur.com/', '').replace('m.imgur.com/', '')
               .replace('gallery/', '').replace('?', '').replace('/new', ''))
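# e.g. cleanUrl('https://i.imgur.com/aBcDeFg.jpg') -> 'aBcDeFg.jpg' (made-up id)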


# load data from any previous run so already-saved links can be skipped
print('parsing any previous data')
previousRunData = None
with open("prevRun.txt", mode='a+', encoding='utf-8') as f:
    f.seek(0)
    try:  # has been run before
        previousRunData = json.loads(f.read())
        print('has previous data')
    except json.decoder.JSONDecodeError:  # set up the obj on first use
        previousRunData = {}
        previousRunData['allUrls'] = {}
        print('created initial data for first use')
print('done')
#### print(str(previousRunData))


print('\ngetting recent /r/eve imgur posts\n')
# GET the last pages on /r/eve that have imgur links
# TODO -- add input args, eliminate all the hardcoding
while pageCount <= 300:  # upper bound only; reddit caps search listings well before this (~1000 results)
    time.sleep(1)  # reddit rate-limits to ~60 req/min; 1s between requests stays safely under it
    if afterTs:  # sequential posts after the initial GET
        ##### print(beforeTs) # assuming a successful GET this works fine
        resp = sess.get(olderPosts + afterParam + afterTs, headers=redHdr).json()['data']
    else:  # initial GET for the newest posts
        resp = sess.get(baseQuery, headers=redHdr).json()['data']
    # parse all the things
    jsonBackup[str(pageCount)] = {}
    jsonBackup[str(pageCount)]['resp'] = resp
    #### print(str(resp).encode('utf-8'))
    page = resp['children']
    beforeTs = resp['before']  # don't actually need it, as previously thought
    afterTs = resp['after']
    #### print(afterTs)
    for post in page:
        ##### print(post['data']['url'])
        # TODO - refactor and get each image from imgur directly via authed api calls, e.g.
        # with open('stripped-down-id.jpg', 'wb') as f:
        #     f.write(request.urlopen('imgur-link.jpg').read())

        # ensure no duplicate links and tie url and reddit post together for sanity
        jsonBackup['allUrls'][post['data']['url']] = post['data']['title']  # {'url': 'title'}
        newUrls[post['data']['url']] = post['data']['url']
    pageCount += 1
    if afterTs is None:  # ran out of results; don't loop back around onto the base query
        break

sess.close()  # done getting that pesky info
print(str(pageCount - 1) + ' pages retrieved and stored')  # pageCount was incremented past the last page


###### print(jsonBackup['allUrls'])
print()
print('parsing urls')
114""" delete when cleaned up and verified more
115savedUrls = {}
116with open("urlsSaved.txt", encoding='utf-8') as f:
117 savedUrls['saved'] = f.read()
118# convert file str to dict with no dependencies...
119for kv in savedUrls['saved'].split(','):
120 print(kv.split(':'))
121"""


# TODO -- move this into the while loop
# get all the images from imgur
os.makedirs('./imgurEVE', exist_ok=True)  # make sure the output dir exists before writing into it
for url in newUrls.values():
    if url in previousRunData['allUrls']:  # checks for key existence in the dict
        print('already downloaded: ' + str(url))
        continue  # go to the next url, skip the parsing
    # no else needed after the continue
    urlFixed = url
    imageName = None
    if '/a/' in url:
        print('ignored album: ' + url)
        albumUrls[url] = url
        # TODO -- get the api list of all the album images
        continue
    else:
        imageName = cleanUrl(url)
        if '.' in imageName:  # has an extension already, can be downloaded properly, mostly
            imageName = imageName[:11]  # 7-char imgur id plus '.ext'
        else:
            imageName = imageName[:7]  # bare 7-char imgur id
            imageName += '.gif'  # a static 'gif' is still served as a png/jpg anyway...safest type to guess
            urlFixed = urlFixed.replace('/gallery/', '/').replace('/new', '')  # 'gallery' in the url breaks it too; keep a slash between host and id
            urlFixed += '.gif'
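            # e.g. 'https://imgur.com/aBcDeFg' -> imageName 'aBcDeFg.gif' and
            # urlFixed 'https://imgur.com/aBcDeFg.gif' (made-up id for illustration)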
    print('retrieving: ' + imageName)
    try:
        path = os.path.join('./imgurEVE', imageName)
        with open(path, mode='wb') as f:
            f.write(request.urlopen(urlFixed).read())
    except Exception as e:
        print('\n[!!] exception, usually a 404')
        print('imageName, urlFixed, url:')
        print(imageName)
        print(urlFixed)
        print(url)
        errorUrls.append(url)
        print(e)
        print()

# note: urls skipped as already downloaded still count toward the newUrls total
print('\ndone saving images')
print('    new image urls total: ' + str(len(newUrls)))
print('  the num albums skipped: ' + str(len(albumUrls)))
print(' individual images saved: ' + str(len(newUrls) - len(albumUrls) - len(errorUrls)))

# merge the old urls back in so the already-downloaded check keeps working across runs,
# then overwrite prevRun.txt with this run's data
jsonBackup['allUrls'].update(previousRunData['allUrls'])
print('writing this run to prevRun.txt')
with open("prevRun.txt", mode='w', encoding='utf-8') as f:
    f.write(json.dumps(jsonBackup))
print('done')
print('manually save these albums:')
for url in albumUrls:
    print(url)
print('\nthese urls broke and should be looked into and downloaded manually:')
for url in errorUrls:
    print(url)
print('done for real')