# Archived paste — Mar 29, 2020, 11:08 AM
1import os
2import re
3import csv
4import urllib.request
5import requests
6import pickle
7
def slugify(value):
    """Return *value* reduced to a filesystem-safe slug.

    Unicode is NFKD-normalized, anything that is not a word character,
    whitespace or hyphen is dropped, and runs of hyphens/whitespace are
    collapsed to single underscores.
    """
    import unicodedata
    value = unicodedata.normalize('NFKD', value)
    # Raw strings: '\w'/'\s' in a plain literal are invalid escape
    # sequences and warn (eventually error) on modern Python.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    value = re.sub(r'[-\s]+', '_', value)
    return value
14
# Build a mapping from image identifier (column 3) to the slugified
# title folder (column 2), creating rip/<folder> for each title.
titles = {}
with open('nma_images_titles.csv', newline='') as csvfile:  # newline='' per csv docs
    reader = csv.reader(csvfile)
    next(reader)  # skip the header row
    for row in reader:
        folder = slugify(row[2])
        titles[row[3]] = folder
        path = 'rip/%s' % folder
        if not os.path.exists(path):
            print('making %s' % path)
            # exist_ok=True avoids a crash if another run created the
            # directory between the exists() check and here.
            os.makedirs(path, exist_ok=True)
# Resume support: load the list of already-downloaded photo ids, if any.
# NOTE(review): pickle.load on an untrusted file executes arbitrary code;
# fine here since progress.pickle is written by this script itself.
if os.path.exists('progress.pickle'):
    # Context manager closes the handle (the original leaked an open file).
    with open('progress.pickle', 'rb') as f:
        progress = pickle.load(f)
else:
    progress = []
30
# Download every original-size image listed in nma_images.csv, resuming
# from progress.pickle so an interrupted run skips completed files.
API_KEY = '[PUT YOUR API KEY HERE]'  # Flickr API key for flickr.photos.getInfo

count = 0
with open('nma_images.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip header
    # First pass just counts data rows so the progress display has a total.
    total = sum(1 for row in reader)
    csvfile.seek(0)
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        # The last URL path segment looks like <photo_id>_<secret>_<size>.<ext>.
        identifier, secret, _ = row[-1].split('/')[-1].split('_')
        if identifier in progress:
            print('[%s/%s] skipping already downloaded image' % (count, total))
            count += 1
            continue
        # Ask the Flickr API for the original-file secret and format.
        data = requests.get(
            'https://www.flickr.com/services/rest/?method=flickr.photos.getInfo&api_key=%s&photo_id=%s&secret=%s&format=json&nojsoncallback=1'
            % (API_KEY, identifier, secret)).json()
        original = 'https://live.staticflickr.com/%s/%s_%s_o.%s' % (
            data['photo']['server'], identifier,
            data['photo']['originalsecret'], data['photo']['originalformat'])
        title = '%s.%s' % (slugify(data['photo']['title']['_content']),
                           data['photo']['originalformat'])
        folder = titles[row[3]]
        path = 'rip/%s/%s' % (folder, title)

        print('[%s/%s] downloading (%s/%s) %s' % (count, total, folder, title, original))
        # URLopener is deprecated since Python 3.3; urlretrieve does the
        # same fetch-to-file without instantiating the legacy class.
        urllib.request.urlretrieve(original, path)
        progress.append(identifier)
        count += 1
        # Persist progress after every file so a crash loses at most one
        # download; 'with' closes the handle the original left open.
        with open('progress.pickle', 'wb') as f:
            pickle.dump(progress, f)