Skip to content

Commit

Permalink
Merge pull request #101 from jinlow/feature/validate-y-weight
Browse files Browse the repository at this point in the history
Feature/validate y weight
  • Loading branch information
jinlow authored Mar 23, 2024
2 parents 3e2ebe5 + ee87d36 commit 9a3f195
Show file tree
Hide file tree
Showing 12 changed files with 89 additions and 50 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
run: pip install forust --no-index --find-links dist --no-deps --force-reinstall
- name: Run Package Tests
run: |
pip install pytest pytest-cov black ruff setuptools --upgrade
pip install pytest pytest-cov 'black>=24.0.0,<25.0.0' ruff setuptools --upgrade
cd py-forust
ruff check .
black --check .
Expand Down Expand Up @@ -92,7 +92,7 @@ jobs:
run: pip install forust --no-index --find-links dist --no-deps --force-reinstall
- name: Run Package Tests
run: |
pip install pytest pytest-cov black ruff setuptools --upgrade
pip install pytest pytest-cov 'black>=24.0.0,<25.0.0' ruff setuptools --upgrade
cd py-forust
ruff check .
black --check .
Expand Down Expand Up @@ -149,7 +149,7 @@ jobs:
run: pip install forust --no-index --find-links dist --no-deps --force-reinstall
- name: Run Package Tests
run: |
pip install pytest pytest-cov black ruff setuptools --upgrade
pip install pytest pytest-cov 'black>=24.0.0,<25.0.0' ruff setuptools --upgrade
cd py-forust
ruff check .
black --check .
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: "22.6.0"
rev: "24.1.1"
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "forust-ml"
version = "0.4.5"
version = "0.4.6"
edition = "2021"
authors = ["James Inlow <[email protected]>"]
homepage = "https://github.com/jinlow/forust"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install forust

To use in a rust project add the following to your Cargo.toml file.
```toml
forust-ml = "0.4.5"
forust-ml = "0.4.6"
```

## Usage
Expand Down
4 changes: 2 additions & 2 deletions py-forust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-forust"
version = "0.4.5"
version = "0.4.6"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -10,7 +10,7 @@ crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.20.0", features = ["extension-module"] }
forust-ml = { version = "0.4.5", path = "../" }
forust-ml = { version = "0.4.6", path = "../" }
numpy = "0.20.0"
ndarray = "0.15.1"
serde_plain = { version = "1.0" }
Expand Down
13 changes: 7 additions & 6 deletions py-forust/forust/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ def fit(
cols: int,
y: np.ndarray,
sample_weight: np.ndarray,
evaluation_data: None
| list[tuple[FrameLike, ArrayLike, None | ArrayLike]] = None,
evaluation_data: (
None | list[tuple[FrameLike, ArrayLike, None | ArrayLike]]
) = None,
parallel: bool = True,
):
"""Fit method"""
Expand Down Expand Up @@ -536,10 +537,10 @@ def fit(
X: FrameLike,
y: ArrayLike,
sample_weight: Union[ArrayLike, None] = None,
evaluation_data: None
| list[
tuple[FrameLike, ArrayLike, ArrayLike] | tuple[FrameLike, ArrayLike]
] = None,
evaluation_data: (
None
| list[tuple[FrameLike, ArrayLike, ArrayLike] | tuple[FrameLike, ArrayLike]]
) = None,
) -> GradientBooster:
"""Fit the gradient booster on a provided dataset.
Expand Down
25 changes: 25 additions & 0 deletions py-forust/tests/test_booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,31 @@ def X_y() -> Tuple[pd.DataFrame, pd.Series]:
return X, y


def test_booster_y_sample_validation():
X = pd.DataFrame({"a": [1, 2, 3], "b": [-1, 2, 4], "c": [np.nan, 3, 4]})
y = pd.Series([1, 0, 1])
y_miss = pd.Series([1, 0, np.nan])

model = GradientBooster()
with pytest.raises(ValueError, match="Missing values found in y"):
model.fit(X, y_miss)
with pytest.raises(ValueError, match="Negative values found in sample_weight"):
model.fit(X, y, sample_weight=X["b"])
with pytest.raises(ValueError, match="Missing values found in sample_weight"):
model.fit(X, y, sample_weight=X["c"])
# Eval sets
with pytest.raises(ValueError, match="Missing values found in eval set 0 y"):
model.fit(X, y, evaluation_data=[(X, y_miss)])
with pytest.raises(
ValueError, match="Negative values found in eval set 0 sample_weight"
):
model.fit(X, y, evaluation_data=[(X, y, X["b"])])
with pytest.raises(
ValueError, match="Missing values found in eval set 0 sample_weight"
):
model.fit(X, y, evaluation_data=[(X, y, X["c"])])


def test_booster_no_variance(X_y):
X, y = X_y
X.iloc[:, 3] = 1
Expand Down
2 changes: 1 addition & 1 deletion rs-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
To run this example, add the following code to your `Cargo.toml` file.
```toml
[dependencies]
forust-ml = "0.4.5"
forust-ml = "0.4.6"
polars = "0.28"
reqwest = { version = "0.11", features = ["blocking"] }
```
Expand Down
4 changes: 4 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,8 @@ pub enum ForustError {
/// First value is the name of the parameter, second is expected, third is what was passed.
#[error("Invalid parameter value passed for {0}, expected {1} but {2} provided.")]
InvalidParameter(String, String, String),
#[error("Missing values found in {0}")]
MissingValuesFound(String),
#[error("Negative values found in {0}")]
NegativeValuesFound(String),
}
18 changes: 17 additions & 1 deletion src/gradientbooster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ use crate::sampler::{GossSampler, RandomSampler, SampleMethod, Sampler};
use crate::shapley::predict_contributions_row_shapley;
use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter};
use crate::tree::Tree;
use crate::utils::{fmt_vec_output, odds, validate_positive_float_field};
use crate::utils::{
fmt_vec_output, odds, validate_not_nan_vec, validate_positive_float_field,
validate_positive_not_nan_vec,
};
use log::info;
use rand::rngs::StdRng;
use rand::seq::IteratorRandom;
Expand Down Expand Up @@ -437,6 +440,19 @@ impl GradientBooster {
sample_weight: &[f64],
evaluation_data: Option<Vec<EvaluationData>>,
) -> Result<(), ForustError> {
// Validate inputs
validate_not_nan_vec(y, "y".to_string())?;
validate_positive_not_nan_vec(sample_weight, "sample_weight".to_string())?;
if let Some(eval_data) = &evaluation_data {
for (i, (_, eval_y, eval_sample_weight)) in eval_data.iter().enumerate() {
validate_not_nan_vec(eval_y, format!("eval set {} y", i).to_string())?;
validate_positive_not_nan_vec(
eval_sample_weight,
format!("eval set {} sample_weight", i).to_string(),
)?;
}
}

let constraints_map = self
.monotone_constraints
.as_ref()
Expand Down
34 changes: 0 additions & 34 deletions src/objective.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,6 @@ impl ObjectiveFunction for LogLoss {
.unzip()
}

// #[inline]
// fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
// y.iter()
// .zip(yhat)
// .zip(sample_weight)
// .map(|((y_, yhat_), w_)| {
// let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp());
// ((yhat_ - *y_) * *w_) as f32
// })
// .collect()
// }
// #[inline]
// fn calc_hess(_: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
// yhat.iter()
// .zip(sample_weight)
// .map(|(yhat_, w_)| {
// let yhat_ = f64::ONE / (f64::ONE + (-*yhat_).exp());
// (yhat_ * (f64::ONE - yhat_) * *w_) as f32
// })
// .collect()
// }
fn default_metric() -> Metric {
Metric::LogLoss
}
Expand Down Expand Up @@ -124,19 +103,6 @@ impl ObjectiveFunction for SquaredLoss {
ytot / ntot
}

// #[inline]
// fn calc_grad(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> Vec<f32> {
// y.iter()
// .zip(yhat)
// .zip(sample_weight)
// .map(|((y_, yhat_), w_)| ((*yhat_ - *y_) * *w_) as f32)
// .collect()
// }

// #[inline]
// fn calc_hess(_: &[f64], _: &[f64], sample_weight: &[f64]) -> Vec<f32> {
// sample_weight.iter().map(|v| *v as f32).collect()
// }
#[inline]
fn calc_grad_hess(y: &[f64], yhat: &[f64], sample_weight: &[f64]) -> (Vec<f32>, Vec<f32>) {
y.iter()
Expand Down
27 changes: 27 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,33 @@ pub fn validate_float_parameter<T: FloatData<T>>(
}
}

pub fn validate_not_nan_vec(v: &[f64], name: String) -> Result<(), ForustError> {
if v.iter().any(|i| i.is_nan()) {
Err(ForustError::MissingValuesFound(name))
} else {
Ok(())
}
}

pub fn validate_positive_vec(v: &[f64], name: String) -> Result<(), ForustError> {
if v.iter().any(|i| i < &0.) {
Err(ForustError::NegativeValuesFound(name))
} else {
Ok(())
}
}

pub fn validate_positive_not_nan_vec(v: &[f64], name: String) -> Result<(), ForustError> {
let remainder = v
.iter()
.filter(|i| i.is_nan() || i < &&0.)
.copied()
.collect::<Vec<f64>>();
validate_positive_vec(&remainder, name.clone())?;
validate_not_nan_vec(&remainder, name)?;
Ok(())
}

macro_rules! validate_positive_float_field {
($var: expr) => {
let var_name = stringify!($var).split(".").nth(1).unwrap();
Expand Down

0 comments on commit 9a3f195

Please sign in to comment.