|
| 1 | +import skimage.measure |
| 2 | +import matplotlib.pyplot as plt |
| 3 | +import numpy as np |
| 4 | +import cv2 |
| 5 | +import os |
| 6 | +import imghdr |
| 7 | + |
| 8 | +""" |
| 9 | +Duplicate Image Finder (DIF): function that searches a given directory for images and finds duplicate/similar images among them. |
| 10 | +Outputs the number of found duplicate/similar image pairs with a list of the filenames having lower resolution. |
| 11 | +""" |
| 12 | + |
def compare_images(directory, show_imgs=True, similarity="high", compression=50):
    """
    Search a folder for duplicate/similar images and report the smaller file of each pair.

    directory (str).........folder to search for duplicate/similar images
                            (with or without a trailing path separator)
    show_imgs (bool)........True = shows the duplicate/similar images found in output
                            False = doesn't show found images
    similarity (str)........"high" = searches for duplicate images, more precise (MSE threshold 200)
                            "low" = finds similar images (MSE threshold 1000)
                            any other value behaves like "high"
    compression (int).......recommended not to change default value
                            edge length in px (height x width) the images are resized to before being compared
                            the higher the value, the more computational resources and time required

    Returns a set with, for every matching pair, the filename picked by
    check_img_quality (the file with the smaller size on disk).
    """
    # The helper functions build file paths as ``directory + filename``, so make
    # sure the separator is present; "folder" and "folder/" then behave the same.
    directory = os.path.join(directory, "")

    # lists where the found duplicate/similar images are stored
    duplicates = []
    lower_res = []

    # all images resized to compression x compression and stacked along axis 0;
    # this call also (re)populates the module-level ``image_files`` list
    imgs_matrix = create_imgs_matrix(directory, compression)

    # MSE threshold: looser for "similar" images, stricter for 1:1 duplicates
    ref = 1000 if similarity == "low" else 200

    nrows = ncols = compression
    n_images = len(image_files)

    for main_img in range(n_images - 1):
        imgA = imgs_matrix[main_img * nrows:(main_img + 1) * nrows,  # rows
                           0:ncols]                                  # columns
        for compared_img in range(main_img + 1, n_images):
            imgB = imgs_matrix[compared_img * nrows:(compared_img + 1) * nrows,  # rows
                               0:ncols]                                          # columns

            # compare imgA against imgB in all four 90-degree orientations
            matched = False
            for rotation in range(4):
                if rotation != 0:
                    imgB = rotate_img(imgB)
                err = mse(imgA, imgB)
                if err < ref:
                    matched = True
                    break
            if not matched:
                continue

            if show_imgs:
                show_img_figs(imgA, imgB, err)
                show_file_info(compared_img, main_img)
            add_to_list(image_files[main_img], duplicates)
            check_img_quality(directory, image_files[main_img], image_files[compared_img], lower_res)
            # each main image is reported at most once, so stop at its first match
            break

    msg = "\n***\n DONE: found " + str(len(duplicates)) + " duplicate image pairs in " + str(len(image_files)) + " total images.\n The following files have lower resolution:"
    print(msg)
    return set(lower_res)
| 79 | + |
| 80 | +# Function that searches the folder for image files, converts them to a matrix |
def create_imgs_matrix(directory, compression):
    """
    Load every image file found directly in *directory* into one stacked matrix.

    Each decodable image is reduced to three channels, resized to
    compression x compression pixels and stacked along axis 0, so the result
    has shape (n_images * compression, compression, 3). The filenames are
    stored, in the same order, in the module-level list ``image_files``.

    directory (str).........folder to search for image files (sub-folders are skipped)
    compression (int).......edge length in px the images are resized to

    Returns the stacked matrix; when no image is found an empty matrix of
    shape (0, compression, 3) is returned.
    """
    global image_files
    image_files = []
    resized_imgs = []

    for filename in os.listdir(directory):
        # os.path.join works whether or not directory ends with a separator
        path = os.path.join(directory, filename)
        if os.path.isdir(path) or not imghdr.what(path):
            continue
        # np.fromfile + imdecode instead of cv2.imread, as in the original code
        img = cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
        if not isinstance(img, np.ndarray):
            # imdecode could not decode the file
            continue
        # IMREAD_UNCHANGED keeps the stored channel layout: grayscale decodes to a
        # 2-D array and may come with an alpha channel. Normalise to 3 channels so
        # all images can be stacked and compared.
        if img.ndim == 2:
            img = img[..., np.newaxis]
        if img.shape[2] < 3:
            img = np.repeat(img[..., :1], 3, axis=2)
        else:
            img = img[..., 0:3]
        img = cv2.resize(img, dsize=(compression, compression), interpolation=cv2.INTER_CUBIC)
        resized_imgs.append(img)
        image_files.append(filename)

    if not resized_imgs:
        return np.empty((0, compression, 3), dtype=np.uint8)
    # a single concatenate avoids re-copying the whole matrix for every image
    return np.concatenate(resized_imgs)
| 103 | + |
| 104 | +# Function that calculates the mean squared error (mse) between two image matrices |
def mse(imageA, imageB):
    """
    Return the squared error between two image matrices, summed over all
    elements and divided by the number of pixels (rows x columns) of imageA.
    """
    # convert to float first so the subtraction cannot wrap around on unsigned ints
    difference = imageA.astype("float") - imageB.astype("float")
    n_pixels = float(imageA.shape[0] * imageA.shape[1])
    return np.sum(difference ** 2) / n_pixels
| 109 | + |
| 110 | +# Function that plots two compared image files and their mse |
def show_img_figs(imageA, imageB, err):
    """Plot the two compared images side by side, with their mse as the figure title."""
    fig = plt.figure()
    plt.suptitle("MSE: %.2f" % (err))
    # first image goes in the left subplot, second image in the right one
    for position, image in enumerate((imageA, imageB), start=1):
        fig.add_subplot(1, 2, position)
        plt.imshow(image, cmap=plt.cm.gray)
        plt.axis("off")
    # show the images
    plt.show()
| 124 | + |
| 125 | +#Function for rotating an image matrix by a 90 degree angle |
def rotate_img(image):
    """Return the image matrix rotated once by 90 degrees in the plane of its first two axes."""
    # np.rot90 defaults to k=1 and axes=(0, 1)
    return np.rot90(image)
| 129 | + |
| 130 | +# Function for printing filename info of plotted image files |
def show_file_info(compared_img, main_img):
    """
    Print the filenames of a matching image pair.

    Both arguments are indexes into the module-level ``image_files`` list.
    """
    pair = (image_files[main_img], image_files[compared_img])
    print("Duplicate file: " + " and ".join(pair))
| 133 | + |
| 134 | +# Function for appending items to a list |
def add_to_list(filename, list):
    """Append *filename* to the end of *list*, modifying it in place."""
    # NOTE: the parameter name shadows the built-in ``list``; it is kept so
    # that existing keyword callers continue to work.
    list.append(filename)
| 137 | + |
| 138 | +# Function for checking the quality of compared images, appends the lower quality image to the list |
def check_img_quality(directory, imageA, imageB, list):
    """
    Append the lower-quality file of a pair to *list*, using file size on disk as the measure.

    directory (str).........folder containing both files (with or without trailing separator)
    imageA (str)............filename of the first image
    imageB (str)............filename of the second image
    list (list).............list the filename of the smaller file is appended to;
                            when both files have the same size, imageA is appended
    """
    # os.path.join inserts the path separator only when directory lacks one
    size_imgA = os.stat(os.path.join(directory, imageA)).st_size
    size_imgB = os.stat(os.path.join(directory, imageB)).st_size
    if size_imgA > size_imgB:
        list.append(imageB)
    else:
        list.append(imageA)
0 commit comments