{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from PIL import Image\n",
    "from keras.models import Model\n",
    "from keras.preprocessing import image\n",
    "from scipy.spatial import distance_matrix\n",
    "from keras.layers import Dense, GlobalAveragePooling2D\n",
    "from keras.applications.inception_v3 import InceptionV3"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "# Directories holding the raw images and the normalized copies\n",
    "working_path = '/fitzpatrick17k/images/all/'\n",
    "save_path = '/fitzpatrick17k/images/processed/'\n",
    "os.makedirs(save_path, exist_ok = True)\n",
    "\n",
    "# Width and height every image is resized to (also used as the model input size)\n",
    "IMG_SIZE = (299, 299)\n",
    "\n",
    "# List of image filenames, sorted so the row/column order of the matrix is reproducible\n",
    "imgs = sorted(fn for fn in os.listdir(working_path) if fn.upper().endswith('.JPG'))\n",
    "print(f'Number of images: {len(imgs)}')\n",
    "\n",
    "# Normalize size and color of each image and save the result in save_path\n",
    "for fn in imgs:\n",
    "\n",
    "    # The context manager closes the source file even if processing fails\n",
    "    with Image.open(os.path.join(working_path, fn)) as source_img:\n",
    "        # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS\n",
    "        processed = source_img.resize(IMG_SIZE, Image.LANCZOS)\n",
    "        processed = processed.convert('L')\n",
    "        processed = processed.convert('1')\n",
    "\n",
    "    # Save under the same filename; the format is inferred from the extension\n",
    "    processed.save(os.path.join(save_path, fn))\n",
    "    processed.close()"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Number of images: 16577\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Number of images converted to arrays and embedded at a time,\n",
    "# so the full image tensor never has to be held in memory at once\n",
    "CHUNK_SIZE = 512\n",
    "\n",
    "# Create base model\n",
    "base_model = InceptionV3(weights = 'imagenet', include_top = False, input_shape = IMG_SIZE + (3,))\n",
    "\n",
    "# Freeze the pretrained layers\n",
    "for layer in base_model.layers:\n",
    "    layer.trainable = False\n",
    "\n",
    "# Set features\n",
    "# NOTE(review): the two Dense layers below are never trained and no seed is set, so their\n",
    "# weights presumably differ between kernel sessions; confirm the distance threshold used\n",
    "# for duplicate detection is still meaningful after a restart.\n",
    "pool_2d = GlobalAveragePooling2D(name = 'pool_2d')(base_model.output)\n",
    "dense = Dense(1024, name = 'dense', activation = 'relu')(pool_2d)\n",
    "predictions = Dense(1000, name = 'pred', activation = 'relu')(dense)\n",
    "\n",
    "# Reformat model\n",
    "model = Model(inputs = base_model.input, outputs = predictions)\n",
    "\n",
    "# Embed the processed images chunk by chunk\n",
    "# NOTE(review): pixel values are fed as produced by img_to_array, without\n",
    "# keras.applications.inception_v3.preprocess_input; confirm this is intended.\n",
    "features = []\n",
    "for start in range(0, len(imgs), CHUNK_SIZE):\n",
    "    chunk = imgs[start:start + CHUNK_SIZE]\n",
    "    chunk_tensor = np.array([\n",
    "        image.img_to_array(image.load_img(os.path.join(save_path, fn), target_size = IMG_SIZE))\n",
    "        for fn in chunk\n",
    "    ])\n",
    "    features.append(model.predict(chunk_tensor, batch_size = 32, verbose = 0))\n",
    "\n",
    "# Output similarity matrix and save\n",
    "model_output = pd.DataFrame(np.concatenate(features), index = imgs)\n",
    "df = pd.DataFrame(distance_matrix(model_output.values, model_output.values), index = model_output.index, columns = model_output.index)\n",
    "assert df.shape == (len(imgs), len(imgs))\n",
    "\n",
    "# Keep the index so the rows are still labelled by filename when the file is read back\n",
    "df.to_csv('/fitzpatrick17k/similarity_matrix.csv')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "# Distance below which two images are treated as duplicates\n",
    "DUPLICATE_THRESHOLD = 15\n",
    "\n",
    "# Read similarity matrix; the first column holds the row filenames\n",
    "# (a file written without its index must be regenerated by the previous cell)\n",
    "df = pd.read_csv('/fitzpatrick17k/similarity_matrix.csv', index_col = 0)\n",
    "\n",
    "# Flag every pair closer than the threshold. The diagonal is cleared because it\n",
    "# compares each image with itself, which must never be reported as a duplicate.\n",
    "is_close = df.values < DUPLICATE_THRESHOLD\n",
    "np.fill_diagonal(is_close, False)\n",
    "\n",
    "# Create a map of img --> duplicate images\n",
    "rows = df.index\n",
    "cols = df.columns\n",
    "duplicates = {\n",
    "    rows[i]: [cols[j] for j in np.flatnonzero(is_close[i])]\n",
    "    for i in range(len(rows))\n",
    "    if is_close[i].any()\n",
    "}\n",
    "\n",
    "duplicates"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'5f567374f688120109c2bab349ff21cf.jpg': ['22f1d783dd6821defafcc915a8146c41.jpg'],\n",
       " '6de74d3051ceafe10cf3f3e8c342bad8.jpg': ['be030bdb35c3c8cbf9145dfcebc0c0d2.jpg'],\n",
       " '19a1b36184861141cd3b1cd43e74b5b6.jpg': ['dbee4a80595e78f281e1a0938f9857be.jpg'],\n",
       " 'a7ef35e99387ff1227baced72467dc1f.jpg': ['33068b46548282d53e5fee2f15792728.jpg'],\n",
       " '6cd57e29acb9071a6c5e5aa23aeaf0ee.jpg': ['11e1eeb117aafe575e95f9c2ee9de3f3.jpg'],\n",
       " 'adb5b7253c21d274f9b1a793b01b84d2.jpg': ['3554761709cc4906ab9db13e5e46aa25.jpg'],\n",
       " 'ea7c258aade6d510197d02b8d5012ba5.jpg': ['e6b375b8b1b7e27706623977dace3721.jpg'],\n",
       " 'dbee4a80595e78f281e1a0938f9857be.jpg': ['dbee4a80595e78f281e1a0938f9857be.jpg'],\n",
       " '38682083d6f7539a88c17d57559dcbd6.jpg': ['2b16c87c36fa56721ab867e53560ec9b.jpg'],\n",
       " '1cc5a0ffcaf2f370cd65f10eeeb7fddd.jpg': ['9b82bbff48d88f3bea9d30cfd96606dc.jpg'],\n",
       " '8633a0075168ec7587c636bde73fcb21.jpg': ['09d46db9589ff45436cda87c4abc946b.jpg'],\n",
       " '771fff032a6c7854eb8509f5849e929e.jpg': ['bf77bafaa320f244f2331ca466b96f50.jpg'],\n",
       " '95727ec19e5e87adf63dc241cb5e4af8.jpg': ['9829c11b6a2ea0a47031a865d761a670.jpg'],\n",
       " 'a027f129d0ed0d55f8c0c455d9de0035.jpg': ['ba8e7927c71912e42cde00184b691376.jpg'],\n",
       " 'fd5dd82519cfa82bd6f23d9f83b421aa.jpg': ['34b5d983d901815b931e28ee357ddf74.jpg'],\n",
       " 'bb7eaaccb79a069d59db68a3cae983cb.jpg': ['d519b2976ab7e3da9c334540018cad79.jpg'],\n",
       " '22f1d783dd6821defafcc915a8146c41.jpg': ['22f1d783dd6821defafcc915a8146c41.jpg'],\n",
       " '34b5d983d901815b931e28ee357ddf74.jpg': ['34b5d983d901815b931e28ee357ddf74.jpg'],\n",
       " '33068b46548282d53e5fee2f15792728.jpg': ['33068b46548282d53e5fee2f15792728.jpg'],\n",
       " 'f7a234c59d01011f64c9633409a30086.jpg': ['a36d079aeee1bd073859a3af6041c4f4.jpg'],\n",
       " 'be030bdb35c3c8cbf9145dfcebc0c0d2.jpg': ['be030bdb35c3c8cbf9145dfcebc0c0d2.jpg'],\n",
       " 'a874b1654043ff9c19375a4b04be79a3.jpg': ['8e4fcec9d635f8e9c152a23aad631eec.jpg'],\n",
       " 'fcb25afb92552dceba953b8254544a19.jpg': ['0455b31fb640b89ee7375711168f318b.jpg'],\n",
       " 'a36d079aeee1bd073859a3af6041c4f4.jpg': ['a36d079aeee1bd073859a3af6041c4f4.jpg'],\n",
       " '3554761709cc4906ab9db13e5e46aa25.jpg': ['3554761709cc4906ab9db13e5e46aa25.jpg'],\n",
       " 'bf77bafaa320f244f2331ca466b96f50.jpg': ['bf77bafaa320f244f2331ca466b96f50.jpg'],\n",
       " 'ba8e7927c71912e42cde00184b691376.jpg': ['ba8e7927c71912e42cde00184b691376.jpg'],\n",
       " '9829c11b6a2ea0a47031a865d761a670.jpg': ['9829c11b6a2ea0a47031a865d761a670.jpg'],\n",
       " '0455b31fb640b89ee7375711168f318b.jpg': ['0455b31fb640b89ee7375711168f318b.jpg'],\n",
       " '09d46db9589ff45436cda87c4abc946b.jpg': ['09d46db9589ff45436cda87c4abc946b.jpg'],\n",
       " '5041af68a1e046acff0fc93da11de68f.jpg': ['0e2a24d28767bea1a4a37d1c6a4d4f31.jpg'],\n",
       " '2b16c87c36fa56721ab867e53560ec9b.jpg': ['2b16c87c36fa56721ab867e53560ec9b.jpg'],\n",
       " 'bf0403884214daf1e41bdb522df8c8a1.jpg': ['1efcd7e523f9cdf487d1be5ec88e546a.jpg'],\n",
       " '59c60187f6e65c32cbbadbdfc8b8df1d.jpg': ['8e8674abd53e4d087da3798f478edb8c.jpg'],\n",
       " 'd519b2976ab7e3da9c334540018cad79.jpg': ['d519b2976ab7e3da9c334540018cad79.jpg'],\n",
       " '81ccde6038a8c7bf6860774ebaf4bad8.jpg': ['5eebe4328896867cce5c841d7d15d765.jpg'],\n",
       " 'e2e1cf7d3e85b5bcbb00223931f84487.jpg': ['d56d52145880b5280cc2d18c9b4a89f7.jpg'],\n",
       " 'd56d52145880b5280cc2d18c9b4a89f7.jpg': ['d56d52145880b5280cc2d18c9b4a89f7.jpg'],\n",
       " '1efcd7e523f9cdf487d1be5ec88e546a.jpg': ['1efcd7e523f9cdf487d1be5ec88e546a.jpg'],\n",
       " '5eebe4328896867cce5c841d7d15d765.jpg': ['5eebe4328896867cce5c841d7d15d765.jpg'],\n",
       " '8e8674abd53e4d087da3798f478edb8c.jpg': ['8e8674abd53e4d087da3798f478edb8c.jpg'],\n",
       " '0e2a24d28767bea1a4a37d1c6a4d4f31.jpg': ['0e2a24d28767bea1a4a37d1c6a4d4f31.jpg'],\n",
       " '9b82bbff48d88f3bea9d30cfd96606dc.jpg': ['9b82bbff48d88f3bea9d30cfd96606dc.jpg'],\n",
       " '11e1eeb117aafe575e95f9c2ee9de3f3.jpg': ['11e1eeb117aafe575e95f9c2ee9de3f3.jpg'],\n",
       " '8e4fcec9d635f8e9c152a23aad631eec.jpg': ['8e4fcec9d635f8e9c152a23aad631eec.jpg'],\n",
       " 'e6b375b8b1b7e27706623977dace3721.jpg': ['e6b375b8b1b7e27706623977dace3721.jpg']}"
      ]
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.10",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.10 64-bit"
  },
  "interpreter": {
   "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}