In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from keras.models import Model
from keras.preprocessing import image
from scipy.spatial import distance_matrix
from keras.layers import Dense, GlobalAveragePooling2D
from keras.applications.inception_v3 import InceptionV3

In [9]:
# Set a working path for images
working_path = '/fitzpatrick17k/images/all/'
save_path = '/fitzpatrick17k/images/processed/'
os.chdir(working_path)

# List of image filenames
imgs = [fn for fn in os.listdir() if fn.upper().endswith('.JPG')]

# Iterate each filename
for fn in imgs: 

 # Set working path
 os.chdir(working_path)

 # Open the image, normalize size and color
 temp = Image.open(fn, 'r')
 temp = temp.resize((299,299), Image.ANTIALIAS)
 temp = temp.convert('L')
 temp = temp.convert('1') 

 # Save in a new directory
 os.chdir(save_path)
 temp.save(fn, temp.format)
 temp.close()

Number of images: 16577


In [None]:
# Set path
os.chdir(save_path)

# Create base model
base_model = InceptionV3(weights = 'imagenet', include_top = False, input_shape = (299, 299, 3))

# Adjust layers
for layer in base_model.layers:
 layer.trainable = False

# Set features
pool_2d = GlobalAveragePooling2D(name = 'pool_2d')(base_model.output)
dense = Dense(1024, name = 'dense', activation = 'relu')(pool_2d)
predictions = Dense(1000, name = 'pred', activation = 'relu')(dense)

# Reformat model
model = Model(inputs = base_model.input, outputs = predictions)

# Adjust inputs
use_images = [image.load_img(c, target_size = (299,299)) for c in imgs]
use_tensor = np.array([image.img_to_array(img) for img in use_images])

# Output similarity matrix and save
model_output = pd.DataFrame(model.predict(use_tensor, batch_size = 32, verbose = 0), index = imgs)
df = pd.DataFrame(distance_matrix(model_output.values, model_output.values),index = model_output.index, columns = model_output.index)
df.to_csv('/fitzpatrick17k/similarity_matrix.csv', index = False)

In [4]:
# Read similarity matrix
df = pd.read_csv('/fitzpatrick17k/similarity_matrix.csv')

# Create a map of img --> duplicate image
duplicates = {}
count = 0
cols = df.columns
for i, val in df.iterrows():
 arr = np.where(df.loc[i, :] < 15)
 arr = np.delete(arr, count)
 if len(arr) > 0:
 duplicates[i] = [cols[j] for j in list(arr)]
 
duplicates

{'5f567374f688120109c2bab349ff21cf.jpg': ['22f1d783dd6821defafcc915a8146c41.jpg'],
 '6de74d3051ceafe10cf3f3e8c342bad8.jpg': ['be030bdb35c3c8cbf9145dfcebc0c0d2.jpg'],
 '19a1b36184861141cd3b1cd43e74b5b6.jpg': ['dbee4a80595e78f281e1a0938f9857be.jpg'],
 'a7ef35e99387ff1227baced72467dc1f.jpg': ['33068b46548282d53e5fee2f15792728.jpg'],
 '6cd57e29acb9071a6c5e5aa23aeaf0ee.jpg': ['11e1eeb117aafe575e95f9c2ee9de3f3.jpg'],
 'adb5b7253c21d274f9b1a793b01b84d2.jpg': ['3554761709cc4906ab9db13e5e46aa25.jpg'],
 'ea7c258aade6d510197d02b8d5012ba5.jpg': ['e6b375b8b1b7e27706623977dace3721.jpg'],
 'dbee4a80595e78f281e1a0938f9857be.jpg': ['dbee4a80595e78f281e1a0938f9857be.jpg'],
 '38682083d6f7539a88c17d57559dcbd6.jpg': ['2b16c87c36fa56721ab867e53560ec9b.jpg'],
 '1cc5a0ffcaf2f370cd65f10eeeb7fddd.jpg': ['9b82bbff48d88f3bea9d30cfd96606dc.jpg'],
 '8633a0075168ec7587c636bde73fcb21.jpg': ['09d46db9589ff45436cda87c4abc946b.jpg'],
 '771fff032a6c7854eb8509f5849e929e.jpg': ['bf77bafaa320f244f2331ca466b96f50.jpg'],
 '95