
Commit

simplified the optional task
tenosel committed Sep 14, 2024
1 parent 8a9e554 commit 281724a
Showing 3 changed files with 25 additions and 76 deletions.
9 changes: 4 additions & 5 deletions README.md
@@ -48,10 +48,9 @@ Now, we will implement our own random forest for classification that will be abl

1. Implement the ``entropy()`` function.
2. Now use your ``entropy()`` function to implement the ``information_gain()`` function.
3. Next, use the implemented ``split()`` and ``best_split()`` functions to find the best split, and implement the function ``build_tree()`` to build a decision tree. Hint: you can use recursion for that. This function should return the resulting root node.
4. Look at the class ``RandomForest`` and implement the ``fit()`` function, including bootstrapping and random feature selection.
5. Finally, implement the ``predict()`` function, which predicts on all of the resulting trees and returns a majority vote.
6. You can now compare your results to the ``sklearn`` implementation of the Random Forest algorithm.
7. If you now uncomment the commented part in the ``main()`` function, you can experiment with missing values.
3. Look at the class ``RandomForest`` and use the function ``build_tree()`` to implement the ``fit()`` function, including bootstrapping and random feature selection.
4. Finally, implement the ``predict()`` function, which predicts on all of the resulting trees and returns a majority vote.
5. You can now compare your results to the ``sklearn`` implementation of the Random Forest algorithm.
6. If you now uncomment the commented part in the ``main()`` function, you can experiment with missing values.
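
For orientation, here is a minimal sketch of steps 1 and 2, assuming ``entropy()`` takes a 1-D label array and ``information_gain()`` takes the parent labels plus the left/right child labels (the signatures the tests below use); the actual scaffolding in ``src/ex3_my_forest.py`` may differ:

import numpy as np

def entropy(y):
    """Shannon entropy of a label array (sketch)."""
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def information_gain(y, y_left, y_right):
    """Entropy reduction achieved by splitting y into y_left and y_right (sketch)."""
    n = len(y)
    child_entropy = 0.0
    for part in (y_left, y_right):
        if len(part) > 0:
            child_entropy += (len(part) / n) * entropy(part)
    return entropy(y) - child_entropy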


26 changes: 20 additions & 6 deletions src/ex3_my_forest.py
@@ -123,8 +123,22 @@ def build_tree(X, y, features, depth=0, max_depth=None):
Returns:
DecisionNode or Leaf: Root node of the decision tree.
"""
# 3) TODO: Implement me
return None
if len(set(y)) == 1:  # If all labels are the same
    return Leaf(y)
if max_depth is not None and depth >= max_depth:  # If maximum depth is reached
    return Leaf(y)
split_result = best_split(X, y, features)
if split_result is None:  # If no valid split is found
    return Leaf(y)
feature_index, threshold = split_result
X_left, X_right, y_left, y_right = split(X, y, feature_index, threshold)
left = build_tree(X_left, y_left, features, depth + 1, max_depth)  # Recursively build the left subtree
right = build_tree(X_right, y_right, features, depth + 1, max_depth)  # Recursively build the right subtree
return DecisionNode(feature_index, threshold, left, right)
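
``build_tree()`` returns instances of the ``Leaf`` and ``DecisionNode`` classes defined earlier in ``src/ex3_my_forest.py``. As a rough sketch of what they could look like, with attribute names inferred from how ``build_tree()`` and the removed tests use them (``.feature_index``, ``.threshold``, ``.left``, ``.right``, and a ``Counter`` in ``.predictions``):

from collections import Counter

class Leaf:
    """Terminal node holding the class counts of the samples that reached it (sketch)."""
    def __init__(self, y):
        self.predictions = Counter(y)

class DecisionNode:
    """Internal node holding a split feature, a threshold, and the two subtrees (sketch)."""
    def __init__(self, feature_index, threshold, left, right):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right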

def predict_sample(node, sample):
"""
@@ -185,7 +199,7 @@ def fit(self, X, y):
X (array-like): Feature matrix.
y (array-like): Labels.
"""
# 4) TODO: Implement me
# 3) TODO: Implement me
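
A hedged sketch of what ``fit()`` could do, assuming the constructor stores ``self.n_trees``, ``self.max_depth`` and ``self.n_features`` and the fitted trees are collected in ``self.trees`` (names taken from the removed tests); not necessarily the intended solution:

def fit(self, X, y):
    X, y = np.asarray(X), np.asarray(y)
    self.trees = []
    n_samples, n_total_features = X.shape
    for _ in range(self.n_trees):
        # Bootstrapping: draw row indices with replacement
        idx = np.random.choice(n_samples, size=n_samples, replace=True)
        # Random feature selection: restrict the candidate split features for this tree
        features = np.random.choice(n_total_features, size=self.n_features, replace=False)
        self.trees.append(build_tree(X[idx], y[idx], features, max_depth=self.max_depth))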

def predict(self, X):
"""
@@ -197,7 +211,7 @@ def predict(self, X):
Returns:
array: Predicted class labels.
"""
# 5) TODO: Implement me
# 4) TODO: Implement me
return None
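
A hedged sketch of a majority-vote ``predict()``, assuming ``predict_sample(tree, sample)`` (defined above) returns a single class label and that ``Counter`` is imported from ``collections``:

def predict(self, X):
    X = np.asarray(X)
    # One row of predictions per tree: shape (n_trees, n_samples)
    votes = np.array([[predict_sample(tree, sample) for sample in X] for tree in self.trees])
    # Majority vote over the trees for every sample
    return np.array([Counter(column).most_common(1)[0][0] for column in votes.T])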

if __name__ == "__main__":
@@ -206,7 +220,7 @@ def predict(self, X):
# Load wine dataset
dataset = load_wine()

# 7) TODO: Uncomment the following lines to introduce missing values
# 6) TODO: Uncomment the following lines to introduce missing values
# missing_rate=0.1
# mask = np.random.rand(*dataset.data.shape) < missing_rate
# dataset.data[mask] = np.nan
@@ -236,5 +250,5 @@ def predict(self, X):
accuracy = np.mean(predictions == ytest)
print(f'Accuracy_own: {accuracy}')

# 6) TODO: Compare your results to sklearn implementation
# 5) TODO: Compare your results to sklearn implementation
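
A hedged sketch of that comparison, assuming the split variables in ``main()`` are named ``xtrain``, ``xtest``, ``ytrain``, ``ytest`` as in the tests; note that, depending on your scikit-learn version, ``RandomForestClassifier`` may reject NaN inputs, so the missing-value experiment can require imputation first:

from sklearn.ensemble import RandomForestClassifier

sk_forest = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
sk_forest.fit(xtrain, ytrain)
sk_predictions = sk_forest.predict(xtest)
print(f'Accuracy_sklearn: {np.mean(sk_predictions == ytest)}')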

66 changes: 1 addition & 65 deletions tests/test_ex3_my_forest.py
@@ -1,34 +1,8 @@
import pytest
import numpy as np
from collections import Counter
from scipy.stats import mode
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

from src.ex3_my_forest import entropy, information_gain, build_tree, RandomForest, DecisionNode, Leaf
from src.ex3_my_forest import entropy, information_gain

@pytest.fixture
def dataset():
# Create a fixture for the dataset
dataset = load_wine()
missing_rate = 0.05
mask = np.random.rand(*dataset.data.shape) < missing_rate
dataset.data[mask] = np.nan
return dataset

@pytest.fixture
def train_test_data(dataset):
# Create a fixture for train-test split data
xtrain, xtest, ytrain, ytest = train_test_split(dataset.data, dataset.target, train_size=0.75, random_state=29)
return xtrain, xtest, ytrain, ytest

@pytest.fixture
def random_forest(train_test_data):
# Create a fixture for RandomForest instance
xtrain, _, ytrain, _ = train_test_data
rf = RandomForest(n_trees=10, max_depth=5, n_features=4)
rf.fit(xtrain, ytrain)
return rf

def test_entropy():
y = np.array([0, 0, 1, 1, 1, 1])
@@ -43,41 +17,3 @@ def test_information_gain():
calculated_gain = information_gain(y, y_left, y_right)
expected_gain = 0.9182958340544896 # Entropy(y) - 0*Entropy(y_left) - 1*Entropy(y_right)
assert np.isclose(calculated_gain, expected_gain, atol=1e-5)
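
For reference, the expected value 0.9182958340544896 is the entropy of a 1/3 vs. 2/3 class split, which can be checked by hand:

import numpy as np

p = np.array([1/3, 2/3])        # class proportions, e.g. for y = [0, 0, 1, 1, 1, 1]
print(-np.sum(p * np.log2(p)))  # ≈ 0.9182958340544896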


def test_build_tree():
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = np.array([0, 1, 0, 1, 0])
features = [0, 1]
tree = build_tree(X, y, features, max_depth=2)

assert isinstance(tree, DecisionNode)
assert isinstance(tree.left, Leaf)
assert isinstance(tree.right, DecisionNode)
assert isinstance(tree.right.left, Leaf)
assert isinstance(tree.right.right, Leaf)

# Check the numerical threshold in the root decision node
expected_threshold_root = 1
assert np.isclose(tree.threshold, expected_threshold_root)

# Check the numerical threshold in the left decision node
expected_threshold_right = 3
assert np.isclose(tree.right.threshold, expected_threshold_right)

# Verify the predictions in the leaf nodes
assert tree.right.left.predictions == Counter([1])



def test_random_forest_fit(random_forest):
assert len(random_forest.trees) == random_forest.n_trees

def test_random_forest_predict(random_forest, train_test_data):
_, xtest, _, ytest = train_test_data
predictions = random_forest.predict(xtest)
accuracy = np.mean(predictions == ytest)

# Test accuracy is within a reasonable range
assert accuracy >= 0.85
