
Commit

simplified the optional task
tenosel committed Sep 14, 2024
1 parent 8a9e554 commit 281724a
Showing 3 changed files with 25 additions and 76 deletions.
9 changes: 4 additions & 5 deletions README.md
@@ -48,10 +48,9 @@ Now, we will implement our own random forest for classification that will be abl

1. Implement the ``entropy()`` function.
2. Now use your ``entropy()`` function to implement the ``information_gain()`` function.
3. Next, use the implemented ``split()`` and ``best_split()`` functions to find the best split, and implement the function ``build_tree()`` to build a decision tree. Hint: you can use recursion for that. This function should return the resulting root node.
4. Look at the class ``RandomForest`` and implement the ``fit()`` function, including bootstrapping and random feature selection.
5. Finally, implement the ``predict()`` function, which predicts on all of the resulting trees and returns a majority vote.
6. You can now compare your results to the ``sklearn`` implementation of the Random Forest algorithm.
7. If you now uncomment the commented part in the ``main()`` function, you can experiment with missing values.
3. Look at the class ``RandomForest`` and use the function ``build_tree()`` to implement the ``fit()`` function, including bootstrapping and random feature selection.
4. Finally, implement the ``predict()`` function, which predicts on all of the resulting trees and returns a majority vote.
5. You can now compare your results to the ``sklearn`` implementation of the Random Forest algorithm.
6. If you now uncomment the commented part in the ``main()`` function, you can experiment with missing values.
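
For orientation, here is a minimal sketch of steps 1 and 2, assuming ``entropy()`` takes a 1-D label array and ``information_gain()`` takes the parent labels plus the left/right child labels (the signatures the tests below use); the actual scaffolding in ``src/ex3_my_forest.py`` may differ:

import numpy as np

def entropy(y):
    """Shannon entropy of a label array (sketch)."""
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def information_gain(y, y_left, y_right):
    """Entropy reduction achieved by splitting y into y_left and y_right (sketch)."""
    n = len(y)
    child_entropy = 0.0
    for part in (y_left, y_right):
        if len(part) > 0:
            child_entropy += (len(part) / n) * entropy(part)
    return entropy(y) - child_entropy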


26 changes: 20 additions & 6 deletions src/ex3_my_forest.py
@@ -123,8 +123,22 @@ def build_tree(X, y, features, depth=0, max_depth=None):
Returns:
DecisionNode or Leaf: Root node of the decision tree.
"""
# 3) TODO: Implement me
return None
if len(set(y)) == 1:  # If all labels are the same
    return Leaf(y)
if max_depth is not None and depth >= max_depth:  # If maximum depth is reached
    return Leaf(y)
split_result = best_split(X, y, features)
if split_result is None:  # If no valid split is found
    return Leaf(y)
feature_index, threshold = split_result
X_left, X_right, y_left, y_right = split(X, y, feature_index, threshold)
left = build_tree(X_left, y_left, features, depth + 1, max_depth)  # Recursively build the left subtree
right = build_tree(X_right, y_right, features, depth + 1, max_depth)  # Recursively build the right subtree
return DecisionNode(feature_index, threshold, left, right)
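
``build_tree()`` returns instances of the ``Leaf`` and ``DecisionNode`` classes defined earlier in ``src/ex3_my_forest.py``. As a rough sketch of what they could look like, with attribute names inferred from how ``build_tree()`` and the removed tests use them (``.feature_index``, ``.threshold``, ``.left``, ``.right``, and a ``Counter`` in ``.predictions``):

from collections import Counter

class Leaf:
    """Terminal node holding the class counts of the samples that reached it (sketch)."""
    def __init__(self, y):
        self.predictions = Counter(y)

class DecisionNode:
    """Internal node holding a split feature, a threshold, and the two subtrees (sketch)."""
    def __init__(self, feature_index, threshold, left, right):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right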

def predict_sample(node, sample):
"""
@@ -185,7 +199,7 @@ def fit(self, X, y):
X (array-like): Feature matrix.
y (array-like): Labels.
"""
# 4) TODO: Implement me
# 3) TODO: Implement me
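
A hedged sketch of what ``fit()`` could do, assuming the constructor stores ``self.n_trees``, ``self.max_depth`` and ``self.n_features`` and the fitted trees are collected in ``self.trees`` (names taken from the removed tests); not necessarily the intended solution:

def fit(self, X, y):
    X, y = np.asarray(X), np.asarray(y)
    self.trees = []
    n_samples, n_total_features = X.shape
    for _ in range(self.n_trees):
        # Bootstrapping: draw row indices with replacement
        idx = np.random.choice(n_samples, size=n_samples, replace=True)
        # Random feature selection: restrict the candidate split features for this tree
        features = np.random.choice(n_total_features, size=self.n_features, replace=False)
        self.trees.append(build_tree(X[idx], y[idx], features, max_depth=self.max_depth))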

def predict(self, X):
"""
@@ -197,7 +211,7 @@ def predict(self, X):
Returns:
array: Predicted class labels.
"""
# 5) TODO: Implement me
# 4) TODO: Implement me
return None
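
A hedged sketch of a majority-vote ``predict()``, assuming ``predict_sample(tree, sample)`` (defined above) returns a single class label and that ``Counter`` is imported from ``collections``:

def predict(self, X):
    X = np.asarray(X)
    # One row of predictions per tree: shape (n_trees, n_samples)
    votes = np.array([[predict_sample(tree, sample) for sample in X] for tree in self.trees])
    # Majority vote over the trees for every sample
    return np.array([Counter(column).most_common(1)[0][0] for column in votes.T])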

if __name__ == "__main__":
@@ -206,7 +220,7 @@ def predict(self, X):
# Load wine dataset
dataset = load_wine()

# 7) TODO: Uncomment the following lines to introduce missing values
# 6) TODO: Uncomment the following lines to introduce missing values
# missing_rate=0.1
# mask = np.random.rand(*dataset.data.shape) < missing_rate
# dataset.data[mask] = np.nan
@@ -236,5 +250,5 @@ def predict(self, X):
accuracy = np.mean(predictions == ytest)
print(f'Accuracy_own: {accuracy}')

# 6) TODO: Compare your results to sklearn implementation
# 5) TODO: Compare your results to sklearn implementation
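
A hedged sketch of that comparison, assuming the split variables in ``main()`` are named ``xtrain``, ``xtest``, ``ytrain``, ``ytest`` as in the tests; note that, depending on your scikit-learn version, ``RandomForestClassifier`` may reject NaN inputs, so the missing-value experiment can require imputation first:

from sklearn.ensemble import RandomForestClassifier

sk_forest = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
sk_forest.fit(xtrain, ytrain)
sk_predictions = sk_forest.predict(xtest)
print(f'Accuracy_sklearn: {np.mean(sk_predictions == ytest)}')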

66 changes: 1 addition & 65 deletions tests/test_ex3_my_forest.py
@@ -1,34 +1,8 @@
import pytest
import numpy as np
from collections import Counter
from scipy.stats import mode
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

from src.ex3_my_forest import entropy, information_gain, build_tree, RandomForest, DecisionNode, Leaf
from src.ex3_my_forest import entropy, information_gain

@pytest.fixture
def dataset():
# Create a fixture for the dataset
dataset = load_wine()
missing_rate = 0.05
mask = np.random.rand(*dataset.data.shape) < missing_rate
dataset.data[mask] = np.nan
return dataset

@pytest.fixture
def train_test_data(dataset):
# Create a fixture for train-test split data
xtrain, xtest, ytrain, ytest = train_test_split(dataset.data, dataset.target, train_size=0.75, random_state=29)
return xtrain, xtest, ytrain, ytest

@pytest.fixture
def random_forest(train_test_data):
# Create a fixture for RandomForest instance
xtrain, _, ytrain, _ = train_test_data
rf = RandomForest(n_trees=10, max_depth=5, n_features=4)
rf.fit(xtrain, ytrain)
return rf

def test_entropy():
y = np.array([0, 0, 1, 1, 1, 1])
@@ -43,41 +17,3 @@ def test_information_gain():
calculated_gain = information_gain(y, y_left, y_right)
expected_gain = 0.9182958340544896 # Entropy(y) - 0*Entropy(y_left) - 1*Entropy(y_right)
assert np.isclose(calculated_gain, expected_gain, atol=1e-5)
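
For reference, the expected value 0.9182958340544896 is the entropy of a 1/3 vs. 2/3 class split, which can be checked by hand:

import numpy as np

p = np.array([1/3, 2/3])        # class proportions, e.g. for y = [0, 0, 1, 1, 1, 1]
print(-np.sum(p * np.log2(p)))  # ≈ 0.9182958340544896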


def test_build_tree():
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = np.array([0, 1, 0, 1, 0])
features = [0, 1]
tree = build_tree(X, y, features, max_depth=2)

assert isinstance(tree, DecisionNode)
assert isinstance(tree.left, Leaf)
assert isinstance(tree.right, DecisionNode)
assert isinstance(tree.right.left, Leaf)
assert isinstance(tree.right.right, Leaf)

# Check the numerical threshold in the root decision node
expected_threshold_root = 1
assert np.isclose(tree.threshold, expected_threshold_root)

# Check the numerical threshold in the left decision node
expected_threshold_right = 3
assert np.isclose(tree.right.threshold, expected_threshold_right)

# Verify the predictions in the leaf nodes
assert tree.right.left.predictions == Counter([1])



def test_random_forest_fit(random_forest):
assert len(random_forest.trees) == random_forest.n_trees

def test_random_forest_predict(random_forest, train_test_data):
_, xtest, _, ytest = train_test_data
predictions = random_forest.predict(xtest)
accuracy = np.mean(predictions == ytest)

# Test accuracy is within a reasonable range
assert accuracy >= 0.85
