# https://stackoverflow.com/questions/40919936/calculating-entropy-from-glcm-of-an-image
# https://www.ucalgary.ca/mhallbey/glcm1
# http://scikit-image.org/docs/dev/auto_examples/features_detection/plot_glcm.html

# https://stackoverflow.com/a/24672188 # not really sure what this is a measure of, but the code does work
print("Begin importing modules")
import os
from datetime import datetime
import shutil
import pandas
import pickle
import hashlib
import imohash
import imagesize
import skimage.measure  # the measure submodule must be imported explicitly; a bare "import skimage" does not expose it
import cv2
import numpy as np
# from scipy.stats import kurtosis, skew # https://stackoverflow.com/questions/53092196/calculating-kurtosis-from-an-image
import scipy.stats  # likewise, "import scipy" alone does not reliably expose scipy.stats
from multiprocessing import Pool
from pk2_color import *
import pk2_image_stats as pk2is

def pk2_timestamp():
    return datetime.now().replace(microsecond=0).isoformat('_').replace(":", "-")
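
# Example output of pk2_timestamp() (illustrative): '2019-05-20_02-20-00';
# colons are replaced with dashes so the string is safe in Windows filenames.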

def pk2_sha256(filepath):
    # hash the file in 256 KiB chunks so large images never need to fit in memory at once
    BLOCKSIZE = 256 * 1024
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
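
# Example usage of pk2_sha256 (illustrative; the path is hypothetical):
#   digest = pk2_sha256("somefolder/example.png")  # -> 64-character hex string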

# https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_ml/py_kmeans/py_kmeans_opencv/py_kmeans_opencv.html
def get_k_most_common_colors(img, K):
    Z = img.reshape((-1, 3))  # flatten the H x W x 3 image into an N x 3 array of BGR pixels
    Z = np.float32(Z)  # cv2.kmeans requires np.float32 input
    # define criteria, number of clusters (K), and apply kmeans()
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    ret, label, center = cv2.kmeans(Z, K, None, criteria, 10, cv2.KMEANS_PP_CENTERS)
    return center  # center is a list of K BGR lists [[B, G, R], [B, G, R], ...]
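
# Example usage (illustrative): get the 3 dominant colors of an image as
# float32 BGR triples, e.g. array([[ 12.3,  40.1, 200.7], ...], dtype=float32):
#   dominant = get_k_most_common_colors(img, 3)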

def worker(imagepath):
    imagename = imagepath.split(os.path.sep)[-1]
    imgcontainingfolder = (os.path.sep).join(imagepath.split(os.path.sep)[:-1])
    try:
        sha256_value = pk2_sha256(imagepath)
        imohash_value = imohash.hashfile(imagepath, hexdigest=True)

        filesize_in_bytes = os.stat(imagepath).st_size
        img = cv2.imread(imagepath, cv2.IMREAD_UNCHANGED)  # returns None for unreadable files, which raises below and lands in the except

        num_channels = img.shape[2] if img.ndim == 3 else 1  # grayscale images load as 2D arrays with no channel axis
        if num_channels == 3:  # normal BGR image
            pass
        elif num_channels == 4:  # has an alpha channel
            img = img[:, :, :3]  # ignore the alpha channel, keep only BGR
        else:  # something is wrong with this image; an exception will probably be thrown somewhere below
            print(MAGENTA("Not sure how to deal with this: num_channels: "), num_channels)

        blue_img, green_img, red_img = cv2.split(img)
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # produces slightly different results than cv2.imread(imagepath, cv2.IMREAD_GRAYSCALE) (very minor differences, see https://stackoverflow.com/a/37208336)

        width = gray_img.shape[1]
        height = gray_img.shape[0]

        # other metrics #
        colorfulness = pk2is.calc_colorfulness(img)

        # # cv2 #
        # color_1, color_2, color_3 = get_k_most_common_colors(img, 3)
        # prevalent_colors = [tuple(color_1), tuple(color_2), tuple(color_3)]

        # scipy stats #
        # bw_channeldata = [info_bw.minmax[0], info_bw.minmax[1], info_bw.mean, info_bw.variance, info_bw.skewness, info_bw.kurtosis]

        info_b = scipy.stats.describe(blue_img, axis=None)
        info_g = scipy.stats.describe(green_img, axis=None)
        info_r = scipy.stats.describe(red_img, axis=None)
        info_bw = scipy.stats.describe(gray_img, axis=None)
        entropy_b = skimage.measure.shannon_entropy(blue_img)
        entropy_g = skimage.measure.shannon_entropy(green_img)
        entropy_r = skimage.measure.shannon_entropy(red_img)
        entropy_bw = skimage.measure.shannon_entropy(gray_img)
        channels = [(info_b, entropy_b), (info_g, entropy_g), (info_r, entropy_r), (info_bw, entropy_bw)]

        bgrbw_channeldata = [[c[0].minmax[0], c[0].minmax[1], c[0].mean, c[0].variance, c[0].skewness, c[0].kurtosis, c[1]] for c in channels]
        bgrbw_channeldata = list(zip(*bgrbw_channeldata))  # take the transpose so you have minmax, mean, ... each in (b, g, r, bw) order
        bgrbw_channeldata = [j for i in bgrbw_channeldata for j in i]  # flatten so we have min b, g, r, bw, max b, g, r, bw, mean b, g, r, bw, etc.
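        # Illustrative shape of the two steps above (values omitted):
        #   per-channel rows: [[min_b, max_b, ...], [min_g, max_g, ...], ...]
        #   -> transpose:     [(min_b, min_g, min_r, min_bw), (max_b, ...), ...]
        #   -> flatten:       [min_b, min_g, min_r, min_bw, max_b, max_g, ...]
        # This stat-major order must match the channelnames list built in __main__.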
        # print(bgrbw_channeldata)
        # skimage #
        # mydatagray = skimage.io.imread(imagepath, as_gray = True)
        # mydatagray = skimage.color.rgb2gray(img_data)
        # entropy = skimage.measure.shannon_entropy(gray_img) # lower entropy value => more homogeneous, to an extent
        # contourvals = len(skimage.measure.find_contours(mydatagray, 0.5))
        # entropy = 500

        print(GREEN("Successfully completed image stat calcs for {} of dimensions ({}, {}):".format(imagename, width, height)))
        return [sha256_value, imohash_value, filesize_in_bytes, imgcontainingfolder, imagepath, imagename, width, height, colorfulness] + bgrbw_channeldata
    except Exception as e:
        print(RED("Something went wrong with {}!".format(imagepath)))
        print(e)
        return [None, imagepath, imagename, str(e)]  # store the message as a string so the Excel export handles it cleanly

def update_df_entry(df, index, path):
    df.at[index, 'imgpath'] = path
    df.at[index, 'imgname'] = path.split(os.path.sep)[-1]
    df.at[index, 'imgcontainingfolder'] = os.path.sep.join(path.split(os.path.sep)[:-1])
    # index values cannot be changed here because it could result in duplicates; this must be done at the end


if __name__ == '__main__':
    ###########################################################
    # get the list of pictures to perform the calculations on #
    ###########################################################
    image_paths = []  # list containing paths of all images to be analyzed
    with open('../usable_image_directories.txt', 'r') as f:  # get the list of directories to scan
        image_directories = [d for d in f.read().splitlines() if d]  # skip blank lines so a trailing newline doesn't yield an empty path
    for directory in image_directories:
        for entry in os.scandir(directory):
            if entry.name.endswith((".png", ".jpg", ".jpeg")):
                image_paths.append(entry.path)
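    # Note: endswith() is case-sensitive, so a file like "PHOTO.PNG" would be
    # skipped; compare entry.name.lower() instead if mixed-case extensions occur.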

    ####################################################
    # update the database if it already exists on disk #
    ####################################################
    print("Checking whether an existing database needs updating")
    update_existing = os.path.isfile("info/img_stat_df.pkl")
    df = None
    if update_existing:
        df = pandas.read_pickle("info/img_stat_df.pkl")

        # copy the existing database files into a new folder (the copies will remain while the originals will be overwritten)
        if not os.path.isdir("info/backups"):
            os.mkdir("info/backups")
        newdir = "info/backups/before {} update".format(pk2_timestamp())
        os.mkdir(newdir)
        for x in ["info/{}".format(i) for i in ("img_stat_df.pkl", "image_stat_df.xlsx", "exceptionlog.xlsx")]:
            shutil.copy(x, newdir)

        imgpaths_that_need_stats_calculated = []
        imghashes = []
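        # Sketch of the per-file decision logic below (four cases):
        #   1. path and imohash both match a row      -> nothing to do
        #   2. path matches but the imohash differs   -> file changed; look for its
        #      hash elsewhere in the db, else recalculate its stats
        #   3. path unknown, imohash+sha256 match     -> file was moved or renamed;
        #      update that row's path fields in place
        #   4. neither matches                        -> new image; calculate stats
        # imohash only samples parts of a file, so sha256 is used to confirm an
        # apparent match before any row is rewritten.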
        for path in image_paths:  # iterate over all image files on disk we want to have an entry for in the database
            xx = imohash.hashfile(path, hexdigest=True)
            imghashes.append(xx)
            if path in df.index:  # path is in database # https://stackoverflow.com/a/43298438
                if df.loc[path]['imohash'] == xx:  # both the path and the imohash match the database entry, so do nothing
                    pass
                else:  # the hash stored for this path doesn't match, so search for another entry whose hash does
                    sha = pk2_sha256(path)
                    for index, row in df.iterrows():
                        if xx == row['imohash'] and sha == row['sha256']:  # only update a database entry if both imohash and sha256 match # the sha comparison only matters once the imohash matches
                            update_df_entry(df, index, path)  # dataframes are mutable
                            break
                    else:  # for-else: the loop finished without a break, so the hash isn't in the database and stats must be recalculated
                        imgpaths_that_need_stats_calculated.append(path)
            else:  # path is not in database
                sha = pk2_sha256(path)
                j = df.index[(df['imohash'] == xx) & (df['sha256'] == sha)].tolist()
                if len(j) > 0:  # the hashes are already in the database under another path, so just update the filenames
                    update_df_entry(df, j[0], path)
                else:  # the hashes were not found in the database, so stats need to be calculated
                    imgpaths_that_need_stats_calculated.append(path)

        # remove entries whose hashes match nothing on disk
        removebyhash = [index for index, row in df.iterrows() if row['imohash'] not in imghashes]
        df = df.drop(removebyhash)

        df.index = df['imgpath']  # set the indices to the imgpath column since they might not be the same after updating entries earlier

        # remove entries whose paths match nothing on disk
        toremove = [path for path in df.index if path not in image_paths]
        df = df.drop(toremove)

    else:  # the database does not already exist on disk, so create it from scratch
        imgpaths_that_need_stats_calculated = image_paths

    ######################################################################
    # use multiprocessing to begin performing calculations on the images #
    ######################################################################
    if len(imgpaths_that_need_stats_calculated) >= 1:
        with Pool(processes=min(len(imgpaths_that_need_stats_calculated), os.cpu_count())) as pool:
            print(CYAN("Multiprocessing is starting now"))
            mydata = pool.map(worker, imgpaths_that_need_stats_calculated)
            print(CYAN("Multiprocessing finished!"))
    else:
        mydata = []
        print("No statistics needed to be calculated to update the database, so no multiprocessing was used.")
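    # Pool.map returns results in the same order as its input, so each entry of
    # mydata corresponds positionally to imgpaths_that_need_stats_calculated.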

    ##################################
    # write results to a spreadsheet #
    ##################################
    print("Begin creating and exporting dataframes")
    statnames = ['min', 'max', 'mean', 'variance', 'skewness', 'kurtosis', 'entropy']
    channelnames = ["{}_{}".format(j, i) for j in statnames for i in ['b', 'g', 'r', 'bw']]
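    # channelnames is stat-major, matching the flatten order used in worker():
    # e.g. ['min_b', 'min_g', 'min_r', 'min_bw', 'max_b', 'max_g', ...]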
    # prevalent_colornames = ['color_1', 'color_2', 'color_3']
    a = ['sha256', 'imohash', 'filesize_in_bytes', 'imgcontainingfolder', 'imgpath', 'imgname', 'width', 'height', 'colorfulness'] + channelnames
    b = ['imgpath', 'imgname', 'exception']
    datadict = {i: [] for i in a}
    faildict = {i: [] for i in b}
    if update_existing:
        datadict = df.to_dict(orient='list')  # keep the old records and just append to them
    for imgdata in mydata:
        if imgdata[0] is not None:  # skip entries for images that failed to be read
            for colname, value in zip(a, imgdata):
                datadict[colname].append(value)
        else:
            for colname, value in zip(b, imgdata[1:]):  # chop off the leading None failure marker
                faildict[colname].append(value)

    # create the dataframes
    df = pandas.DataFrame(data=datadict, index=datadict['imgpath'])
    df.sort_index(inplace=True)  # keep the sort order consistent, whether updating or starting from scratch
    ################################################################################################
    # openpyxl formatting stuff to create an easily viewable and sortable version
    ################################################################################################
    import openpyxl as opxl
    import pk2_xl
    from openpyxl.utils.dataframe import dataframe_to_rows
    from openpyxl.styles import PatternFill
    from openpyxl.styles import Font
    wb = opxl.Workbook()
    ws = wb.active
    ws.title = "Sheet1"

    # https://openpyxl.readthedocs.io/en/2.4/pandas.html
    for i, r in enumerate(dataframe_to_rows(df, index=True, header=True)):
        if i == 0:  # the first row (header) has to be converted to strings for openpyxl to work properly
            ws.append(list(map(str, r)))
            ws['A1'] = "index"  # without this, it just says "None"
        elif i != 1:  # with index=True, dataframe_to_rows emits an extra index-name row under the header; skip it
            ws.append(r)

    hei, wid = df.shape
    tablerange = pk2_xl.xlc(0, 0, hei, wid)  # inclusive on both ends
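    # (Assumption: pk2_xl.xlc converts zero-based (row, col) coordinates into an
    # A1-style reference, e.g. xlc(0, 0) -> "A1", and with four arguments a range
    # string such as "A1:AK123"; the module is local, so this is unverified.)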
    table = opxl.worksheet.table.Table(ref=tablerange, displayName="image_data")
    ws.add_table(table)

    # add colors (+ 1 is added to the dimensions to include the index and header)
    for col in range(0, wid + 1):
        ws[pk2_xl.xlc(0, col)].fill = PatternFill(fgColor="bdd5ef", fill_type="solid")
        colname = ws[pk2_xl.xlc(0, col)].value
        docolor = True
        if colname.endswith("_b"):  # https://htmlcolorcodes.com/
            mycolor = 'A9E4FF'
            mycolorlight = 'D2F1FF'
        elif colname.endswith("_g"):
            mycolor = 'A2FFA2'
            mycolorlight = 'D0FFD0'
        elif colname.endswith("_r"):
            mycolor = 'FDB7B7'
            mycolorlight = 'FFD0D0'
        elif colname.endswith("_bw"):
            mycolor = 'C9C9C9'
            mycolorlight = 'DEDEDE'
        else:
            docolor = False
        if docolor:
            for row in range(1, hei + 1):
                ccc = mycolor if row % 2 == 1 else mycolorlight
                ws[pk2_xl.xlc(row, col)].fill = PatternFill(fgColor=ccc, fill_type="solid")

    # hyperlink all images
    font = Font(color="0000FF", underline='single')  # typical hyperlink color with underline
    xx = df.columns.get_loc("imgpath") + 1  # get the x coordinate of imgpath in the dataframe (plus 1 to account for the index)
    for yy in range(1, hei + 1):
        ws[pk2_xl.xlc(yy, xx)].hyperlink = str(ws[pk2_xl.xlc(yy, xx)].value)  # make the imgpath link to itself
        ws[pk2_xl.xlc(yy, xx)].font = font


    wb.save("info/image_stat_df.xlsx")  # for viewing purposes only
    ################################################################################################
    # also write to a pickle file to preserve internal formats
    df.to_pickle("info/img_stat_df.pkl")
    # export the exception log
    fdf = pandas.DataFrame(data=faildict)
    writer = pandas.ExcelWriter('info/exceptionlog.xlsx')  # for viewing purposes only
    fdf.to_excel(writer, 'Sheet1')
    writer.save()




### non-multiprocessing way below ###

# allpicsdir = r'D:\Pictures\ZRUP\Good Backgrounds'

# imagetests = []
# for entry in os.scandir(allpicsdir):
#     if entry.name.endswith((".png", ".jpg", ".jpeg")):
#         imagetests.append(entry.name)


# datadict = {'imgpath':[], 'imgname':[], 'width':[], 'height':[], 'entropy':[], 'colorfulness':[], 'extrema':[], 'mean':[], 'variance':[], 'skewness':[], 'kurtosis':[]}

# for imagename in imagetests:
#     print(CYAN("Performing calculations on {}:".format(imagename)))

#     imagepath = os.path.join(allpicsdir, imagename)

#     try:
#         img = cv2.imread(imagepath)
#         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # produces slightly different results than cv2.imread(imagepath, cv2.IMREAD_GRAYSCALE) (very minor differences, see https://stackoverflow.com/a/37208336)
#         print("Dims: ({}, {})".format(gray_img.shape[1], gray_img.shape[0]))

#         # scipy stats #
#         info = scipy.stats.describe(gray_img, axis=None)

#         # skimage #
#         mydata = skimage.io.imread(imagepath)
#         mydatagray = skimage.color.rgb2gray(mydata)
#         entropy = skimage.measure.shannon_entropy(mydatagray)
#         # contourvals = len(skimage.measure.find_contours(mydatagray, 0.5))

#         # other metrics #
#         colorfulness = calc_colorfulness(img)

#         # printing all the statistics #
#         print("Entropy:\t", entropy)
#         print(MAGENTA("Colorfulness:\t"), colorfulness)
#         print("Extrema:\t", info.minmax)
#         print(YELLOW("Mean:\t\t"), info.mean)
#         print("Variance:\t", info.variance)
#         print(YELLOW("Skewness:\t"), info.skewness)
#         print("Kurtosis:\t", info.kurtosis)
#         print("-" * 60)

#         datadict['imgpath'].append(imagepath)
#         datadict['imgname'].append(imagename)
#         datadict['width'].append(gray_img.shape[1])
#         datadict['height'].append(gray_img.shape[0])
#         datadict['entropy'].append(entropy)
#         datadict['colorfulness'].append(colorfulness)
#         datadict['extrema'].append(info.minmax)
#         datadict['mean'].append(info.mean)
#         datadict['variance'].append(info.variance)
#         datadict['skewness'].append(info.skewness)
#         datadict['kurtosis'].append(info.kurtosis)

#     except:
#         print(RED("Something went wrong with {}!".format(imagepath)))

# import pandas
# df = pandas.DataFrame(data=datadict)
# writer = pandas.ExcelWriter('imagestatcache.xlsx')
# df.to_excel(writer, 'Sheet1')
# writer.save()