# https://stackoverflow.com/questions/40919936/calculating-entropy-from-glcm-of-an-image
# https://www.ucalgary.ca/mhallbey/glcm1
# http://scikit-image.org/docs/dev/auto_examples/features_detection/plot_glcm.html

# https://stackoverflow.com/a/24672188 # not really sure what this is a measure of, but the code does work
print("Begin importing modules")
import os
from datetime import datetime
import shutil
import pandas
import pickle
import hashlib
import imohash
import imagesize
import skimage.measure  # the measure submodule must be imported explicitly; a bare "import skimage" does not expose it
import cv2
import numpy as np
# from scipy.stats import kurtosis, skew # https://stackoverflow.com/questions/53092196/calculating-kurtosis-from-an-image
import scipy.stats  # likewise, "import scipy" alone does not reliably expose scipy.stats
from multiprocessing import Pool
from pk2_color import *
import pk2_image_stats as pk2is

def pk2_timestamp():
    return datetime.now().replace(microsecond=0).isoformat('_').replace(":", "-")
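
# Example output of pk2_timestamp() (illustrative): '2019-05-20_02-20-00';
# colons are replaced with dashes so the string is safe in Windows filenames.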

def pk2_sha256(filepath):
    # hash the file in 256 KiB chunks so large images never need to fit in memory at once
    BLOCKSIZE = 256 * 1024
    hasher = hashlib.sha256()
    with open(filepath, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
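
# Example usage of pk2_sha256 (illustrative; the path is hypothetical):
#   digest = pk2_sha256("somefolder/example.png")  # -> 64-character hex string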

# https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_ml/py_kmeans/py_kmeans_opencv/py_kmeans_opencv.html
def get_k_most_common_colors(img, K):
    Z = img.reshape((-1, 3))  # flatten the H x W x 3 image into an N x 3 array of BGR pixels
    Z = np.float32(Z)  # cv2.kmeans requires np.float32 input
    # define criteria, number of clusters (K), and apply kmeans()
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    ret, label, center = cv2.kmeans(Z, K, None, criteria, 10, cv2.KMEANS_PP_CENTERS)
    return center  # center is a list of K BGR lists [[B, G, R], [B, G, R], ...]
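
# Example usage (illustrative): get the 3 dominant colors of an image as
# float32 BGR triples, e.g. array([[ 12.3,  40.1, 200.7], ...], dtype=float32):
#   dominant = get_k_most_common_colors(img, 3)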

def worker(imagepath):
    imagename = imagepath.split(os.path.sep)[-1]
    imgcontainingfolder = (os.path.sep).join(imagepath.split(os.path.sep)[:-1])
    try:
        sha256_value = pk2_sha256(imagepath)
        imohash_value = imohash.hashfile(imagepath, hexdigest=True)

        filesize_in_bytes = os.stat(imagepath).st_size
        img = cv2.imread(imagepath, cv2.IMREAD_UNCHANGED)  # returns None for unreadable files, which raises below and lands in the except

        num_channels = img.shape[2] if img.ndim == 3 else 1  # grayscale images load as 2D arrays with no channel axis
        if num_channels == 3:  # normal BGR image
            pass
        elif num_channels == 4:  # has an alpha channel
            img = img[:, :, :3]  # ignore the alpha channel, keep only BGR
        else:  # something is wrong with this image; an exception will probably be thrown somewhere below
            print(MAGENTA("Not sure how to deal with this: num_channels: "), num_channels)

        blue_img, green_img, red_img = cv2.split(img)
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # produces slightly different results than cv2.imread(imagepath, cv2.IMREAD_GRAYSCALE) (very minor differences, see https://stackoverflow.com/a/37208336)

        width = gray_img.shape[1]
        height = gray_img.shape[0]

        # other metrics #
        colorfulness = pk2is.calc_colorfulness(img)

        # # cv2 #
        # color_1, color_2, color_3 = get_k_most_common_colors(img, 3)
        # prevalent_colors = [tuple(color_1), tuple(color_2), tuple(color_3)]

        # scipy stats #
        # bw_channeldata = [info_bw.minmax[0], info_bw.minmax[1], info_bw.mean, info_bw.variance, info_bw.skewness, info_bw.kurtosis]

        info_b = scipy.stats.describe(blue_img, axis=None)
        info_g = scipy.stats.describe(green_img, axis=None)
        info_r = scipy.stats.describe(red_img, axis=None)
        info_bw = scipy.stats.describe(gray_img, axis=None)
        entropy_b = skimage.measure.shannon_entropy(blue_img)
        entropy_g = skimage.measure.shannon_entropy(green_img)
        entropy_r = skimage.measure.shannon_entropy(red_img)
        entropy_bw = skimage.measure.shannon_entropy(gray_img)
        channels = [(info_b, entropy_b), (info_g, entropy_g), (info_r, entropy_r), (info_bw, entropy_bw)]

        bgrbw_channeldata = [[c[0].minmax[0], c[0].minmax[1], c[0].mean, c[0].variance, c[0].skewness, c[0].kurtosis, c[1]] for c in channels]
        bgrbw_channeldata = list(zip(*bgrbw_channeldata))  # take the transpose so you have minmax, mean, ... each in (b, g, r, bw) order
        bgrbw_channeldata = [j for i in bgrbw_channeldata for j in i]  # flatten so we have min b, g, r, bw, max b, g, r, bw, mean b, g, r, bw, etc.
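        # Illustrative shape of the two steps above (values omitted):
        #   per-channel rows: [[min_b, max_b, ...], [min_g, max_g, ...], ...]
        #   -> transpose:     [(min_b, min_g, min_r, min_bw), (max_b, ...), ...]
        #   -> flatten:       [min_b, min_g, min_r, min_bw, max_b, max_g, ...]
        # This stat-major order must match the channelnames list built in __main__.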
        # print(bgrbw_channeldata)
        # skimage #
        # mydatagray = skimage.io.imread(imagepath, as_gray = True)
        # mydatagray = skimage.color.rgb2gray(img_data)
        # entropy = skimage.measure.shannon_entropy(gray_img) # lower entropy value => more homogeneous, to an extent
        # contourvals = len(skimage.measure.find_contours(mydatagray, 0.5))
        # entropy = 500

        print(GREEN("Successfully completed image stat calcs for {} of dimensions ({}, {}):".format(imagename, width, height)))
        return [sha256_value, imohash_value, filesize_in_bytes, imgcontainingfolder, imagepath, imagename, width, height, colorfulness] + bgrbw_channeldata
    except Exception as e:
        print(RED("Something went wrong with {}!".format(imagepath)))
        print(e)
        return [None, imagepath, imagename, str(e)]  # store the message as a string so the Excel export handles it cleanly

def update_df_entry(df, index, path):
    df.at[index, 'imgpath'] = path
    df.at[index, 'imgname'] = path.split(os.path.sep)[-1]
    df.at[index, 'imgcontainingfolder'] = os.path.sep.join(path.split(os.path.sep)[:-1])
    # index values cannot be changed here because it could result in duplicates; this must be done at the end


if __name__ == '__main__':
    ###########################################################
    # get the list of pictures to perform the calculations on #
    ###########################################################
    image_paths = []  # list containing paths of all images to be analyzed
    with open('../usable_image_directories.txt', 'r') as f:  # get the list of directories to scan
        image_directories = [d for d in f.read().splitlines() if d]  # skip blank lines so a trailing newline doesn't yield an empty path
    for directory in image_directories:
        for entry in os.scandir(directory):
            if entry.name.endswith((".png", ".jpg", ".jpeg")):
                image_paths.append(entry.path)
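    # Note: endswith() is case-sensitive, so a file like "PHOTO.PNG" would be
    # skipped; compare entry.name.lower() instead if mixed-case extensions occur.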

    ####################################################
    # update the database if it already exists on disk #
    ####################################################
    print("Checking whether an existing database needs updating")
    update_existing = os.path.isfile("info/img_stat_df.pkl")
    df = None
    if update_existing:
        df = pandas.read_pickle("info/img_stat_df.pkl")

        # copy the existing database files into a new folder (the copies will remain while the originals will be overwritten)
        if not os.path.isdir("info/backups"):
            os.mkdir("info/backups")
        newdir = "info/backups/before {} update".format(pk2_timestamp())
        os.mkdir(newdir)
        for x in ["info/{}".format(i) for i in ("img_stat_df.pkl", "image_stat_df.xlsx", "exceptionlog.xlsx")]:
            shutil.copy(x, newdir)

        imgpaths_that_need_stats_calculated = []
        imghashes = []
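        # Sketch of the per-file decision logic below (four cases):
        #   1. path and imohash both match a row      -> nothing to do
        #   2. path matches but the imohash differs   -> file changed; look for its
        #      hash elsewhere in the db, else recalculate its stats
        #   3. path unknown, imohash+sha256 match     -> file was moved or renamed;
        #      update that row's path fields in place
        #   4. neither matches                        -> new image; calculate stats
        # imohash only samples parts of a file, so sha256 is used to confirm an
        # apparent match before any row is rewritten.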
        for path in image_paths:  # iterate over all image files on disk we want to have an entry for in the database
            xx = imohash.hashfile(path, hexdigest=True)
            imghashes.append(xx)
            if path in df.index:  # path is in database # https://stackoverflow.com/a/43298438
                if df.loc[path]['imohash'] == xx:  # both the path and the imohash match the database entry, so do nothing
                    pass
                else:  # the hash stored for this path doesn't match, so search for another entry whose hash does
                    sha = pk2_sha256(path)
                    for index, row in df.iterrows():
                        if xx == row['imohash'] and sha == row['sha256']:  # only update a database entry if both imohash and sha256 match # the sha comparison only matters once the imohash matches
                            update_df_entry(df, index, path)  # dataframes are mutable
                            break
                    else:  # for-else: the loop finished without a break, so the hash isn't in the database and stats must be recalculated
                        imgpaths_that_need_stats_calculated.append(path)
            else:  # path is not in database
                sha = pk2_sha256(path)
                j = df.index[(df['imohash'] == xx) & (df['sha256'] == sha)].tolist()
                if len(j) > 0:  # the hashes are already in the database under another path, so just update the filenames
                    update_df_entry(df, j[0], path)
                else:  # the hashes were not found in the database, so stats need to be calculated
                    imgpaths_that_need_stats_calculated.append(path)

        # remove entries whose hashes match nothing on disk
        removebyhash = [index for index, row in df.iterrows() if row['imohash'] not in imghashes]
        df = df.drop(removebyhash)

        df.index = df['imgpath']  # set the indices to the imgpath column since they might not be the same after updating entries earlier

        # remove entries whose paths match nothing on disk
        toremove = [path for path in df.index if path not in image_paths]
        df = df.drop(toremove)

    else:  # the database does not already exist on disk, so create it from scratch
        imgpaths_that_need_stats_calculated = image_paths

    ######################################################################
    # use multiprocessing to begin performing calculations on the images #
    ######################################################################
    if len(imgpaths_that_need_stats_calculated) >= 1:
        with Pool(processes=min(len(imgpaths_that_need_stats_calculated), os.cpu_count())) as pool:
            print(CYAN("Multiprocessing is starting now"))
            mydata = pool.map(worker, imgpaths_that_need_stats_calculated)
            print(CYAN("Multiprocessing finished!"))
    else:
        mydata = []
        print("No statistics needed to be calculated to update the database, so no multiprocessing was used.")
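    # Pool.map returns results in the same order as its input, so each entry of
    # mydata corresponds positionally to imgpaths_that_need_stats_calculated.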

    ##################################
    # write results to a spreadsheet #
    ##################################
    print("Begin creating and exporting dataframes")
    statnames = ['min', 'max', 'mean', 'variance', 'skewness', 'kurtosis', 'entropy']
    channelnames = ["{}_{}".format(j, i) for j in statnames for i in ['b', 'g', 'r', 'bw']]
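    # channelnames is stat-major, matching the flatten order used in worker():
    # e.g. ['min_b', 'min_g', 'min_r', 'min_bw', 'max_b', 'max_g', ...]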
    # prevalent_colornames = ['color_1', 'color_2', 'color_3']
    a = ['sha256', 'imohash', 'filesize_in_bytes', 'imgcontainingfolder', 'imgpath', 'imgname', 'width', 'height', 'colorfulness'] + channelnames
    b = ['imgpath', 'imgname', 'exception']
    datadict = {i: [] for i in a}
    faildict = {i: [] for i in b}
    if update_existing:
        datadict = df.to_dict(orient='list')  # keep the old records and just append to them
    for imgdata in mydata:
        if imgdata[0] is not None:  # skip entries for images that failed to be read
            for colname, value in zip(a, imgdata):
                datadict[colname].append(value)
        else:
            for colname, value in zip(b, imgdata[1:]):  # chop off the leading None failure marker
                faildict[colname].append(value)

    # create the dataframes
    df = pandas.DataFrame(data=datadict, index=datadict['imgpath'])
    df.sort_index(inplace=True)  # keep the sort order consistent, whether updating or starting from scratch
    ################################################################################################
    # openpyxl formatting stuff to create an easily viewable and sortable version
    ################################################################################################
    import openpyxl as opxl
    import pk2_xl
    from openpyxl.utils.dataframe import dataframe_to_rows
    from openpyxl.styles import PatternFill
    from openpyxl.styles import Font
    wb = opxl.Workbook()
    ws = wb.active
    ws.title = "Sheet1"

    # https://openpyxl.readthedocs.io/en/2.4/pandas.html
    for i, r in enumerate(dataframe_to_rows(df, index=True, header=True)):
        if i == 0:  # the first row (header) has to be converted to strings for openpyxl to work properly
            ws.append(list(map(str, r)))
            ws['A1'] = "index"  # without this, it just says "None"
        elif i != 1:  # with index=True, dataframe_to_rows emits an extra index-name row under the header; skip it
            ws.append(r)

    hei, wid = df.shape
    tablerange = pk2_xl.xlc(0, 0, hei, wid)  # inclusive on both ends
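    # (Assumption: pk2_xl.xlc converts zero-based (row, col) coordinates into an
    # A1-style reference, e.g. xlc(0, 0) -> "A1", and with four arguments a range
    # string such as "A1:AK123"; the module is local, so this is unverified.)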
    table = opxl.worksheet.table.Table(ref=tablerange, displayName="image_data")
    ws.add_table(table)

    # add colors (+ 1 is added to the dimensions to include the index and header)
    for col in range(0, wid + 1):
        ws[pk2_xl.xlc(0, col)].fill = PatternFill(fgColor="bdd5ef", fill_type="solid")
        colname = ws[pk2_xl.xlc(0, col)].value
        docolor = True
        if colname.endswith("_b"):  # https://htmlcolorcodes.com/
            mycolor = 'A9E4FF'
            mycolorlight = 'D2F1FF'
        elif colname.endswith("_g"):
            mycolor = 'A2FFA2'
            mycolorlight = 'D0FFD0'
        elif colname.endswith("_r"):
            mycolor = 'FDB7B7'
            mycolorlight = 'FFD0D0'
        elif colname.endswith("_bw"):
            mycolor = 'C9C9C9'
            mycolorlight = 'DEDEDE'
        else:
            docolor = False
        if docolor:
            for row in range(1, hei + 1):
                ccc = mycolor if row % 2 == 1 else mycolorlight
                ws[pk2_xl.xlc(row, col)].fill = PatternFill(fgColor=ccc, fill_type="solid")

    # hyperlink all images
    font = Font(color="0000FF", underline='single')  # typical hyperlink color with underline
    xx = df.columns.get_loc("imgpath") + 1  # get the x coordinate of imgpath in the dataframe (plus 1 to account for the index)
    for yy in range(1, hei + 1):
        ws[pk2_xl.xlc(yy, xx)].hyperlink = str(ws[pk2_xl.xlc(yy, xx)].value)  # make the imgpath link to itself
        ws[pk2_xl.xlc(yy, xx)].font = font


    wb.save("info/image_stat_df.xlsx")  # for viewing purposes only
    ################################################################################################
    # also write to a pickle file to preserve internal formats
    df.to_pickle("info/img_stat_df.pkl")
    # export the exception log
    fdf = pandas.DataFrame(data=faildict)
    writer = pandas.ExcelWriter('info/exceptionlog.xlsx')  # for viewing purposes only
    fdf.to_excel(writer, 'Sheet1')
    writer.save()




### non-multiprocessing way below ###

# allpicsdir = r'D:\Pictures\ZRUP\Good Backgrounds'

# imagetests = []
# for entry in os.scandir(allpicsdir):
#     if entry.name.endswith((".png", ".jpg", ".jpeg")):
#         imagetests.append(entry.name)


# datadict = {'imgpath':[], 'imgname':[], 'width':[], 'height':[], 'entropy':[], 'colorfulness':[], 'extrema':[], 'mean':[], 'variance':[], 'skewness':[], 'kurtosis':[]}

# for imagename in imagetests:
#     print(CYAN("Performing calculations on {}:".format(imagename)))

#     imagepath = os.path.join(allpicsdir, imagename)

#     try:
#         img = cv2.imread(imagepath)
#         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # produces slightly different results than cv2.imread(imagepath, cv2.IMREAD_GRAYSCALE) (very minor differences, see https://stackoverflow.com/a/37208336)
#         print("Dims: ({}, {})".format(gray_img.shape[1], gray_img.shape[0]))

#         # scipy stats #
#         info = scipy.stats.describe(gray_img, axis=None)

#         # skimage #
#         mydata = skimage.io.imread(imagepath)
#         mydatagray = skimage.color.rgb2gray(mydata)
#         entropy = skimage.measure.shannon_entropy(mydatagray)
#         # contourvals = len(skimage.measure.find_contours(mydatagray, 0.5))

#         # other metrics #
#         colorfulness = calc_colorfulness(img)

#         # printing all the statistics #
#         print("Entropy:\t", entropy)
#         print(MAGENTA("Colorfulness:\t"), colorfulness)
#         print("Extrema:\t", info.minmax)
#         print(YELLOW("Mean:\t\t"), info.mean)
#         print("Variance:\t", info.variance)
#         print(YELLOW("Skewness:\t"), info.skewness)
#         print("Kurtosis:\t", info.kurtosis)
#         print("-" * 60)

#         datadict['imgpath'].append(imagepath)
#         datadict['imgname'].append(imagename)
#         datadict['width'].append(gray_img.shape[1])
#         datadict['height'].append(gray_img.shape[0])
#         datadict['entropy'].append(entropy)
#         datadict['colorfulness'].append(colorfulness)
#         datadict['extrema'].append(info.minmax)
#         datadict['mean'].append(info.mean)
#         datadict['variance'].append(info.variance)
#         datadict['skewness'].append(info.skewness)
#         datadict['kurtosis'].append(info.kurtosis)

#     except:
#         print(RED("Something went wrong with {}!".format(imagepath)))

# import pandas
# df = pandas.DataFrame(data=datadict)
# writer = pandas.ExcelWriter('imagestatcache.xlsx')
# df.to_excel(writer, 'Sheet1')
# writer.save()