diff --git a/kmodes/tests/test_kmodes.py b/kmodes/tests/test_kmodes.py index 2811159..6aecc0a 100644 --- a/kmodes/tests/test_kmodes.py +++ b/kmodes/tests/test_kmodes.py @@ -9,7 +9,7 @@ from sklearn.utils.testing import assert_equal from kmodes.kmodes import KModes -from kmodes.util.dissim import ng_dissim +from kmodes.util.dissim import ng_dissim, jaccard_dissim_binary, jaccard_dissim_label SOYBEAN = np.array([ @@ -124,6 +124,102 @@ # Drop target column SOYBEAN2 = SOYBEAN2[:, :35] +# test data with categorical variables that have been label encoded +TEST_DATA = np.array([ + [2, 22, 14, 45, 2, 0, 1, 2, 5], + [2, 13, 13, 19, 2, 0, 1, 2, 5], + [3, 25, 4, 3, 0, 1, 2, 0, 4], + [2, 13, 15, 18, 0, 1, 2, 2, 3], + [3, 10, 4, 42, 0, 2, 1, 1, 2], + [2, 16, 21, 14, 0, 1, 2, 2, 2], + [2, 16, 19, 37, 0, 2, 1, 2, 2], + [2, 20, 9, 34, 0, 1, 2, 3, 5], + [2, 14, 21, 44, 0, 1, 2, 3, 2], + [2, 26, 5, 30, 0, 1, 2, 3, 3], + [3, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 20, 1, 27, 3, 3, 3, 2, 0], + [3, 6, 8, 19, 0, 1, 2, 1, 2], + [2, 13, 8, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 19, 42, 0, 1, 2, 2, 5], + [7, 7, 5, 43, 0, 2, 1, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [3, 3, 5, 12, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [7, 15, 19, 17, 0, 1, 2, 2, 2], + [1, 1, 15, 24, 0, 1, 2, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 5, 7, 9, 0, 1, 2, 3, 5], + [2, 24, 6, 10, 0, 2, 1, 2, 2], + [2, 13, 16, 29, 0, 2, 1, 2, 2], + [3, 6, 8, 1, 0, 1, 2, 2, 5], + [2, 16, 15, 34, 0, 1, 2, 2, 1], + [0, 24, 14, 12, 3, 3, 3, 2, 0], + [3, 8, 21, 13, 3, 3, 3, 2, 0], + [2, 17, 15, 42, 3, 3, 3, 2, 0], + [2, 25, 18, 16, 3, 3, 3, 2, 0], + [2, 3, 15, 42, 3, 3, 3, 2, 0], + [6, 13, 15, 22, 3, 3, 3, 2, 0], + [3, 8, 18, 24, 1, 0, 2, 2, 5], + [7, 20, 15, 26, 1, 0, 2, 2, 1], + [2, 20, 7, 35, 0, 1, 2, 2, 5], + [2, 16, 12, 28, 0, 1, 2, 2, 5], + [2, 16, 5, 39, 0, 1, 2, 2, 2], + [3, 6, 11, 8, 0, 1, 2, 2, 2], + [7, 6, 15, 44, 1, 0, 2, 2, 4], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 
3, 3, 2, 0], + [2, 16, 7, 6, 3, 3, 3, 2, 0], + [1, 13, 2, 46, 3, 3, 3, 2, 0], + [0, 14, 5, 41, 3, 3, 3, 2, 0], + [2, 24, 19, 0, 3, 3, 3, 2, 0], + [2, 14, 3, 35, 3, 3, 3, 2, 0], + [6, 19, 7, 5, 0, 2, 1, 2, 2], + [5, 6, 11, 44, 3, 3, 3, 2, 0], + [7, 16, 21, 21, 3, 3, 3, 2, 0], + [2, 19, 7, 44, 3, 3, 3, 2, 0], + [2, 24, 18, 33, 1, 0, 2, 1, 4], + [2, 16, 8, 44, 0, 2, 1, 2, 1], + [3, 2, 5, 15, 0, 1, 2, 2, 2], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 4, 15, 47, 0, 1, 2, 2, 2], + [7, 13, 15, 25, 0, 1, 2, 2, 1], + [1, 19, 10, 15, 3, 3, 3, 2, 0], + [2, 13, 5, 44, 0, 1, 2, 1, 2], + [5, 11, 18, 20, 3, 3, 3, 2, 0], + [7, 9, 5, 40, 0, 1, 2, 1, 4], + [3, 6, 16, 38, 3, 3, 3, 2, 0], + [2, 24, 22, 12, 0, 1, 2, 2, 3], + [5, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 16, 15, 23, 0, 1, 2, 2, 5], + [2, 13, 0, 25, 1, 0, 2, 2, 2], + [2, 23, 15, 36, 3, 3, 3, 2, 0], + [2, 25, 10, 2, 1, 0, 2, 2, 5], + [2, 21, 7, 4, 1, 0, 2, 2, 1], + [1, 18, 17, 41, 3, 3, 3, 2, 0], + [2, 18, 17, 41, 3, 3, 3, 2, 0], + [6, 9, 1, 0, 3, 3, 3, 2, 0], + [1, 7, 20, 47, 3, 3, 3, 2, 0], + [2, 25, 10, 7, 0, 1, 2, 2, 2], + [7, 0, 4, 32, 1, 2, 0, 2, 5], + [1, 12, 12, 15, 0, 1, 2, 3, 3], + [2, 26, 15, 25, 0, 1, 2, 0, 5], + [2, 20, 15, 19, 0, 1, 2, 2, 1], + [4, 6, 9, 11, 2, 0, 1, 1, 4], + [2, 13, 15, 42, 0, 2, 1, 2, 2], + [3, 5, 21, 31, 0, 1, 2, 3, 5], + [2, 13, 19, 33, 0, 2, 1, 2, 2], + [1, 11, 10, 0, 0, 2, 1, 0, 2] +]) + +TEST_DATA_PREDICT = np.array([ + [2, 22, 14, 45, 2, 0, 1, 2, 5], + [7, 13, 13, 19, 2, 0, 1, 2, 5], + [5, 18, 19, 33, 0, 2, 1, 2, 2], + [1, 11, 10, 0, 0, 2, 1, 0, 2] +]) + def assert_cluster_splits_equal(array1, array2): @@ -334,6 +430,79 @@ def test_kmodes_nunique_nclusters_ng(self): np.array([[0, 2], [0, 1]])) + def test_kmodes_huang_soybean_jaccard_dissim_binary(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_binary, random_state=42) + # binary encoded variables are required + bin_variables = 
SOYBEAN.astype(bool).astype(int) + result = kmodes_huang.fit_predict(bin_variables) + expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, + 3, 3, 1, 1, 3, 1, 3, 1, 1]) + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_cao_soybean_jaccard_dissim_binary(self): + kmodes_Cao = KModes(n_clusters=4, n_init=2, init='Cao', verbose=2, + cat_dissim=jaccard_dissim_binary, random_state=42) + # binary encoded variables are required + bin_variables = SOYBEAN.astype(bool).astype(int) + result = kmodes_Cao.fit_predict(bin_variables) + expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]) + + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_predict_soybean_jaccard_dissim_binary(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_binary, random_state=42) + # binary encoded variables are required + bin_variables = SOYBEAN.astype(bool).astype(int) + kmodes_huang = kmodes_huang.fit(bin_variables) + # binary encoded variables required for prediction as well + bin_variables_pred = SOYBEAN2.astype(bool).astype(int) + result = kmodes_huang.fit_predict(bin_variables_pred) + expected = np.array([0, 1, 2, 3]) + assert_cluster_splits_equal(result, expected) + self.assertTrue(result.dtype == np.dtype(np.uint16)) + + def test_kmodes_huang_soybean_jaccard_dissim_label(self): + kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2, + cat_dissim=jaccard_dissim_label, random_state=42) + result = kmodes_huang.fit_predict(TEST_DATA) + expected = np.array([3, 3, 2, 1, 1, 3, 3, 3, 3, 3, 0, 2, 2, 0, 0, 3, 3, 0, 0, + 0, 2, 2, 0, 3, 2, 3, 2, 2, 0, 1, 1, 0, 1, 1, 0, 2, 3, 3, + 3, 2, 2, 0, 0, 2, 1, 0, 
def jaccard_dissim_binary(a, b, **__):
    """Jaccard dissimilarity function for binary encoded variables.

    Parameters
    ----------
    a : array-like, 2-D
        One binary (0/1) vector per row.
    b : array-like
        Binary (0/1) values that broadcast against the rows of ``a``.

    Returns
    -------
    numpy.ndarray
        ``1 - |a AND b| / |a OR b|`` for every row of ``a``.

    Raises
    ------
    ValueError
        If either input holds a value other than 0 or 1 (this includes
        NaN), or if the union is empty for any row.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if not (((a == 0) | (a == 1)).all() and ((b == 0) | (b == 1)).all()):
        raise ValueError("Missing or non Binary values detected in Binary columns.")

    # Work on boolean masks: bitwise ufuncs reject float dtypes, even when
    # every value is exactly 0.0 or 1.0 and has passed the check above.
    a_mask = a.astype(bool)
    b_mask = b.astype(bool)
    intersect_len = np.sum(a_mask & b_mask, axis=1)
    union_len = np.sum(a_mask | b_mask, axis=1)
    if (union_len == 0).any():
        raise ValueError("Insufficient Number of data since union is 0")
    # Explicit float denominator so the ratio is never an integer division.
    return 1 - intersect_len / union_len.astype(float)


def jaccard_dissim_label(a, b, **__):
    """Jaccard dissimilarity function for label encoded variables.

    Every row of ``a`` and the whole of ``b`` are treated as sets of labels.

    Parameters
    ----------
    a : array-like
        One collection of labels per row.
    b : array-like
        Labels to compare every row of ``a`` against.

    Returns
    -------
    numpy.ndarray
        ``1 - |row INTERSECT b| / |row UNION b|`` for every row of ``a``.

    Raises
    ------
    ValueError
        If either input contains NaN, or if the union is empty for any row.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if np.isnan(a.astype('float64')).any() or np.isnan(b.astype('float64')).any():
        raise ValueError("Missing values detected in Numeric columns.")

    # The distinct labels of b do not depend on the row: compute them once.
    b_labels = np.unique(b)
    intersect_len = np.empty(len(a), dtype=int)
    union_len = np.empty(len(a), dtype=int)
    for i, row in enumerate(a):
        row_labels = np.unique(row)
        intersect_len[i] = len(
            np.intersect1d(row_labels, b_labels, assume_unique=True))
        # |A UNION B| = |A| + |B| - |A INTERSECT B|
        union_len[i] = len(row_labels) + len(b_labels) - intersect_len[i]
    if (union_len == 0).any():
        raise ValueError("Insufficient Number of data since union is 0")
    # Explicit float denominator so the ratio is never an integer division.
    return 1 - intersect_len / union_len.astype(float)
self.assertRaises(ValueError): + jaccard_dissim_binary(a, b) + + # test for dissimilarity = 0 both sets are same + a = np.array([[1, 1, 0, 1, 1, 0]]) + b = np.array([[1, 1, 0, 1, 1, 0]]) + assert_equal(0, jaccard_dissim_binary(a, b)) + + # test for dissimilarity = 0 sets are different + a = np.array([[0, 0, 1, 0, 0, 1]]) + b = np.array([[1, 1, 0, 1, 1, 0]]) + assert_equal(1, jaccard_dissim_binary(a, b)) + + def test_jaccard_dissim_label(self): + a = np.array([[0, 1, 2, 0, 1, 2]]) + b = np.array([[0, 1, 2, 0, 3, 0]]) + assert_equal(0.25, jaccard_dissim_label(a, b)) + + a = np.array([[np.NaN, 1, 2, 0, 1, 2]]) + b = np.array([[0, 1, 2, 0, 1, 0]]) + with self.assertRaises(ValueError): + jaccard_dissim_label(a, b) + + # test for dissimilarity = 0 Both sets are same + a = np.array([[1, 2, 0, 3, 1, 0]]) + b = np.array([[1, 2, 0, 3, 1, 0]]) + assert_equal(0, jaccard_dissim_label(a, b)) + + # test for dissimilarity = 1 Both sets are different + a = np.array([[1, 2, 0, 3, 1, 0]]) + b = np.array([[5, 4, 6, 7, 8, 9]]) + assert_equal(1, jaccard_dissim_label(a, b)) + + def test_euclidian_dissim(self): a = np.array([[0., 1., 2., 0., 1., 2.]]) b = np.array([[3., 1., 3., 0., 1., 0.]])