{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from PIL import Image\n",
    "from keras.models import Model\n",
    "from keras.preprocessing import image\n",
    "from scipy.spatial import distance_matrix\n",
    "from keras.layers import Dense, GlobalAveragePooling2D\n",
    "from keras.applications.inception_v3 import InceptionV3"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "# Source directory of raw images and destination for the normalized copies\n",
    "working_path = '/fitzpatrick17k/images/all/'\n",
    "save_path = '/fitzpatrick17k/images/processed/'\n",
    "\n",
    "# Make sure the destination exists so the first save cannot fail on a fresh setup\n",
    "os.makedirs(save_path, exist_ok = True)\n",
    "\n",
    "# Bare filenames (no directory part) of every JPG in the source directory\n",
    "imgs = [fn for fn in os.listdir(working_path) if fn.upper().endswith('.JPG')]\n",
    "print('Number of images: ', len(imgs))\n",
    "\n",
    "# Normalize every image: 299x299, then grayscale, then 1-bit black and white\n",
    "for fn in imgs:\n",
    "\n",
    "    # Explicit paths instead of os.chdir, so the loop does not depend on the current directory;\n",
    "    # the context manager closes the source file even if resizing raises\n",
    "    with Image.open(os.path.join(working_path, fn), 'r') as raw:\n",
    "\n",
    "        # LANCZOS is the filter formerly exposed as Image.ANTIALIAS (alias removed in Pillow 10)\n",
    "        temp = raw.resize((299,299), Image.LANCZOS)\n",
    "\n",
    "    temp = temp.convert('L')\n",
    "    temp = temp.convert('1')\n",
    "\n",
    "    # Save under the same filename; the output format is inferred from the extension\n",
    "    # (images derived via resize/convert carry no format of their own)\n",
    "    temp.save(os.path.join(save_path, fn))\n",
    "    temp.close()"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Number of images:  16577\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Work from the processed-image directory so the bare filenames in imgs resolve\n",
    "os.chdir(save_path)\n",
    "\n",
    "# ImageNet-pretrained InceptionV3 without its classification head\n",
    "base_model = InceptionV3(weights = 'imagenet', include_top = False, input_shape = (299, 299, 3))\n",
    "\n",
    "# Freeze the backbone layers\n",
    "for layer in base_model.layers:\n",
    "    layer.trainable = False\n",
    "\n",
    "# Feature head: global average pooling followed by two dense layers\n",
    "# NOTE(review): no training or weight loading for these dense layers is visible in this cell,\n",
    "# so embeddings (and the distance scale) may differ between runs unless a seed is set -- TODO confirm\n",
    "pool_2d = GlobalAveragePooling2D(name = 'pool_2d')(base_model.output)\n",
    "dense = Dense(1024, name = 'dense', activation = 'relu')(pool_2d)\n",
    "predictions = Dense(1000, name = 'pred', activation = 'relu')(dense)\n",
    "\n",
    "# Full model: image in, 1000-dimensional embedding out\n",
    "model = Model(inputs = base_model.input, outputs = predictions)\n",
    "\n",
    "# Embed the images chunk by chunk so only one chunk of pixel data is in memory at a time\n",
    "# (the chunk size is a multiple of the predict batch size, so batch composition is unchanged)\n",
    "# NOTE(review): pixels are passed as loaded, without inception_v3.preprocess_input -- confirm this is intended\n",
    "chunk_size = 1024\n",
    "feature_chunks = []\n",
    "for start in range(0, len(imgs), chunk_size):\n",
    "    chunk_names = imgs[start:start + chunk_size]\n",
    "    use_tensor = np.array([image.img_to_array(image.load_img(c, target_size = (299,299))) for c in chunk_names])\n",
    "    feature_chunks.append(model.predict(use_tensor, batch_size = 32, verbose = 0))\n",
    "\n",
    "# One embedding row per image, labeled by filename\n",
    "model_output = pd.DataFrame(np.concatenate(feature_chunks), index = imgs)\n",
    "\n",
    "# Pairwise distances between embeddings (smaller means more similar)\n",
    "df = pd.DataFrame(distance_matrix(model_output.values, model_output.values),index = model_output.index, columns = model_output.index)\n",
    "\n",
    "# index = False: only the column header carries the filenames; row k corresponds to column k\n",
    "df.to_csv('/fitzpatrick17k/similarity_matrix.csv', index = False)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "# Read similarity matrix\n",
    "df = pd.read_csv('/fitzpatrick17k/similarity_matrix.csv')\n",
    "\n",
    "# The CSV carries filenames only in its header, so restore them as row labels\n",
    "# (row k pairs with column k); a non-square file raises here instead of mislabeling rows\n",
    "df.index = df.columns\n",
    "\n",
    "# Distances below this value are treated as duplicates\n",
    "threshold = 15\n",
    "\n",
    "# Boolean mask of close pairs, with the diagonal cleared so an image never matches itself\n",
    "cols = df.columns\n",
    "close = df.values < threshold\n",
    "np.fill_diagonal(close, False)\n",
    "\n",
    "# Create a map of img --> duplicate images\n",
    "duplicates = {}\n",
    "for i, row in enumerate(close):\n",
    "    arr = np.flatnonzero(row)\n",
    "    if len(arr) > 0:\n",
    "        duplicates[cols[i]] = [cols[j] for j in arr]\n",
    "\n",
    "duplicates"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'5f567374f688120109c2bab349ff21cf.jpg': ['22f1d783dd6821defafcc915a8146c41.jpg'],\n",
       " '6de74d3051ceafe10cf3f3e8c342bad8.jpg': ['be030bdb35c3c8cbf9145dfcebc0c0d2.jpg'],\n",
       " '19a1b36184861141cd3b1cd43e74b5b6.jpg': ['dbee4a80595e78f281e1a0938f9857be.jpg'],\n",
       " 'a7ef35e99387ff1227baced72467dc1f.jpg': ['33068b46548282d53e5fee2f15792728.jpg'],\n",
       " '6cd57e29acb9071a6c5e5aa23aeaf0ee.jpg': ['11e1eeb117aafe575e95f9c2ee9de3f3.jpg'],\n",
       " 'adb5b7253c21d274f9b1a793b01b84d2.jpg': ['3554761709cc4906ab9db13e5e46aa25.jpg'],\n",
       " 'ea7c258aade6d510197d02b8d5012ba5.jpg': ['e6b375b8b1b7e27706623977dace3721.jpg'],\n",
       " 'dbee4a80595e78f281e1a0938f9857be.jpg': ['dbee4a80595e78f281e1a0938f9857be.jpg'],\n",
       " '38682083d6f7539a88c17d57559dcbd6.jpg': ['2b16c87c36fa56721ab867e53560ec9b.jpg'],\n",
       " '1cc5a0ffcaf2f370cd65f10eeeb7fddd.jpg': ['9b82bbff48d88f3bea9d30cfd96606dc.jpg'],\n",
       " '8633a0075168ec7587c636bde73fcb21.jpg': ['09d46db9589ff45436cda87c4abc946b.jpg'],\n",
       " '771fff032a6c7854eb8509f5849e929e.jpg': ['bf77bafaa320f244f2331ca466b96f50.jpg'],\n",
       " '95727ec19e5e87adf63dc241cb5e4af8.jpg': ['9829c11b6a2ea0a47031a865d761a670.jpg'],\n",
       " 'a027f129d0ed0d55f8c0c455d9de0035.jpg': ['ba8e7927c71912e42cde00184b691376.jpg'],\n",
       " 'fd5dd82519cfa82bd6f23d9f83b421aa.jpg': ['34b5d983d901815b931e28ee357ddf74.jpg'],\n",
       " 'bb7eaaccb79a069d59db68a3cae983cb.jpg': ['d519b2976ab7e3da9c334540018cad79.jpg'],\n",
       " '22f1d783dd6821defafcc915a8146c41.jpg': ['22f1d783dd6821defafcc915a8146c41.jpg'],\n",
       " '34b5d983d901815b931e28ee357ddf74.jpg': ['34b5d983d901815b931e28ee357ddf74.jpg'],\n",
       " '33068b46548282d53e5fee2f15792728.jpg': ['33068b46548282d53e5fee2f15792728.jpg'],\n",
       " 'f7a234c59d01011f64c9633409a30086.jpg': ['a36d079aeee1bd073859a3af6041c4f4.jpg'],\n",
       " 'be030bdb35c3c8cbf9145dfcebc0c0d2.jpg': ['be030bdb35c3c8cbf9145dfcebc0c0d2.jpg'],\n",
       " 'a874b1654043ff9c19375a4b04be79a3.jpg': ['8e4fcec9d635f8e9c152a23aad631eec.jpg'],\n",
       " 'fcb25afb92552dceba953b8254544a19.jpg': ['0455b31fb640b89ee7375711168f318b.jpg'],\n",
       " 'a36d079aeee1bd073859a3af6041c4f4.jpg': ['a36d079aeee1bd073859a3af6041c4f4.jpg'],\n",
       " '3554761709cc4906ab9db13e5e46aa25.jpg': ['3554761709cc4906ab9db13e5e46aa25.jpg'],\n",
       " 'bf77bafaa320f244f2331ca466b96f50.jpg': ['bf77bafaa320f244f2331ca466b96f50.jpg'],\n",
       " 'ba8e7927c71912e42cde00184b691376.jpg': ['ba8e7927c71912e42cde00184b691376.jpg'],\n",
       " '9829c11b6a2ea0a47031a865d761a670.jpg': ['9829c11b6a2ea0a47031a865d761a670.jpg'],\n",
       " '0455b31fb640b89ee7375711168f318b.jpg': ['0455b31fb640b89ee7375711168f318b.jpg'],\n",
       " '09d46db9589ff45436cda87c4abc946b.jpg': ['09d46db9589ff45436cda87c4abc946b.jpg'],\n",
       " '5041af68a1e046acff0fc93da11de68f.jpg': ['0e2a24d28767bea1a4a37d1c6a4d4f31.jpg'],\n",
       " '2b16c87c36fa56721ab867e53560ec9b.jpg': ['2b16c87c36fa56721ab867e53560ec9b.jpg'],\n",
       " 'bf0403884214daf1e41bdb522df8c8a1.jpg': ['1efcd7e523f9cdf487d1be5ec88e546a.jpg'],\n",
       " '59c60187f6e65c32cbbadbdfc8b8df1d.jpg': ['8e8674abd53e4d087da3798f478edb8c.jpg'],\n",
       " 'd519b2976ab7e3da9c334540018cad79.jpg': ['d519b2976ab7e3da9c334540018cad79.jpg'],\n",
       " '81ccde6038a8c7bf6860774ebaf4bad8.jpg': ['5eebe4328896867cce5c841d7d15d765.jpg'],\n",
       " 'e2e1cf7d3e85b5bcbb00223931f84487.jpg': ['d56d52145880b5280cc2d18c9b4a89f7.jpg'],\n",
       " 'd56d52145880b5280cc2d18c9b4a89f7.jpg': ['d56d52145880b5280cc2d18c9b4a89f7.jpg'],\n",
       " '1efcd7e523f9cdf487d1be5ec88e546a.jpg': ['1efcd7e523f9cdf487d1be5ec88e546a.jpg'],\n",
       " '5eebe4328896867cce5c841d7d15d765.jpg': ['5eebe4328896867cce5c841d7d15d765.jpg'],\n",
       " '8e8674abd53e4d087da3798f478edb8c.jpg': ['8e8674abd53e4d087da3798f478edb8c.jpg'],\n",
       " '0e2a24d28767bea1a4a37d1c6a4d4f31.jpg': ['0e2a24d28767bea1a4a37d1c6a4d4f31.jpg'],\n",
       " '9b82bbff48d88f3bea9d30cfd96606dc.jpg': ['9b82bbff48d88f3bea9d30cfd96606dc.jpg'],\n",
       " '11e1eeb117aafe575e95f9c2ee9de3f3.jpg': ['11e1eeb117aafe575e95f9c2ee9de3f3.jpg'],\n",
       " '8e4fcec9d635f8e9c152a23aad631eec.jpg': ['8e4fcec9d635f8e9c152a23aad631eec.jpg'],\n",
       " 'e6b375b8b1b7e27706623977dace3721.jpg': ['e6b375b8b1b7e27706623977dace3721.jpg']}"
      ]
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.10",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.10 64-bit"
  },
  "interpreter": {
   "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}