Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c-api][python-package][R-package] expose feature num bin #5048

Merged
merged 12 commits into from
Mar 15, 2022
11 changes: 11 additions & 0 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle,
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumFeature(DatasetHandle handle,
int* out);

/*!
 * \brief Get number of bins for feature.
 * \note The implementation writes 0 to ``out`` when the feature has no inner index
 *       in the dataset (e.g. a feature that ended up unused).
 * \param handle Handle of dataset
 * \param feature Index of the feature
 * \param[out] out The address to hold number of bins
 * \return 0 when succeed, -1 when failure happens
 */
LIGHTGBM_C_EXPORT int LGBM_DatasetGetFeatureNumBin(DatasetHandle handle,
int feature,
int* out);

/*!
* \brief Add features from ``source`` to ``target``.
* \param target The handle of the dataset to add features to
Expand Down
23 changes: 23 additions & 0 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2371,6 +2371,29 @@ def num_feature(self):
else:
raise LightGBMError("Cannot get num_feature before construct dataset")

def feature_num_bin(self, feature_idx: int) -> int:
    """Return how many bins were built for one feature of this Dataset.

    Parameters
    ----------
    feature_idx : int
        Index of the feature to query.

    Returns
    -------
    number_of_bins : int
        The bin count reported by the native library for that feature.

    Raises
    ------
    LightGBMError
        If the Dataset has no handle yet, i.e. it has not been constructed.
    """
    # Guard first: without a handle there is nothing to query.
    if self.handle is None:
        raise LightGBMError("Cannot get feature_num_bin before construct dataset")

    # The native call writes its result through a pointer to this C int.
    num_bin = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(
        self.handle,
        ctypes.c_int(feature_idx),
        ctypes.byref(num_bin),
    ))
    return num_bin.value

def get_ref_chain(self, ref_limit=100):
"""Get a chain of Dataset objects.

Expand Down
14 changes: 14 additions & 0 deletions src/c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1550,6 +1550,20 @@ int LGBM_DatasetGetNumFeature(DatasetHandle handle,
API_END();
}

// Writes the number of bins of the given feature into *out.
// A feature whose inner index is negative yields 0.
// NOTE(review): `feature` is handed to InnerFeatureIndex() without a range
// check in this function -- confirm that out-of-range indices are rejected
// elsewhere.
int LGBM_DatasetGetFeatureNumBin(DatasetHandle handle,
                                 int feature,
                                 int* out) {
  API_BEGIN();
  auto data = reinterpret_cast<Dataset*>(handle);
  // Translate the caller-facing feature index into the dataset's inner index.
  int mapped_idx = data->InnerFeatureIndex(feature);
  *out = (mapped_idx >= 0) ? data->FeatureNumBin(mapped_idx) : 0;
  API_END();
}

int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
DatasetHandle source) {
API_BEGIN();
Expand Down
20 changes: 20 additions & 0 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,3 +621,23 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype):
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
assert built_data.dtype == dtype
assert np.shares_memory(X, built_data)


def test_feature_num_bin():
    """Check Dataset.feature_num_bin against known bin counts for five columns."""
    columns = [
        np.random.rand(100),
        np.array([1, 2] * 50),
        np.array([0, 1, 2] * 33 + [0]),
        np.array([1, 2] * 49 + 2 * [np.nan]),
        np.zeros(100),
    ]
    # One row per sample, one column per feature.
    features = np.vstack(columns).T
    dataset = lgb.Dataset(features).construct()

    bins_expected = np.array([
        35,  # ceil(100[n_samples] / 3[min_data_in_bin]) = 34 + bin for zero
        3,  # 0, 1, 2
        3,  # 0, 1, 2
        4,  # 0, 1, 2 + nan
        0,  # unused
    ])

    bins_actual = []
    for col in range(features.shape[1]):
        bins_actual.append(dataset.feature_num_bin(col))

    np.testing.assert_equal(bins_actual, bins_expected)