From a5b2d6f4133d47b2f11dd54f2b1e9c203edb9c30 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 17:24:28 -0600 Subject: [PATCH 01/15] [ci] [python-package] enable ruff-format on all Python code --- python-package/lightgbm/__init__.py | 37 +- python-package/lightgbm/basic.py | 2213 +++++++++++++-------------- python-package/lightgbm/callback.py | 112 +- python-package/lightgbm/compat.py | 11 + python-package/lightgbm/dask.py | 433 +++--- python-package/lightgbm/engine.py | 274 ++-- python-package/lightgbm/libpath.py | 18 +- python-package/lightgbm/plotting.py | 262 ++-- python-package/lightgbm/sklearn.py | 384 +++-- python-package/pyproject.toml | 1 - 10 files changed, 1770 insertions(+), 1975 deletions(-) diff --git a/python-package/lightgbm/__init__.py b/python-package/lightgbm/__init__.py index 0dc5b75cfdf2..ae38a6169a22 100644 --- a/python-package/lightgbm/__init__.py +++ b/python-package/lightgbm/__init__.py @@ -23,14 +23,33 @@ pass -_version_path = Path(__file__).absolute().parent / 'VERSION.txt' +_version_path = Path(__file__).absolute().parent / "VERSION.txt" if _version_path.is_file(): - __version__ = _version_path.read_text(encoding='utf-8').strip() + __version__ = _version_path.read_text(encoding="utf-8").strip() -__all__ = ['Dataset', 'Booster', 'CVBooster', 'Sequence', - 'register_logger', - 'train', 'cv', - 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', - 'DaskLGBMRegressor', 'DaskLGBMClassifier', 'DaskLGBMRanker', - 'log_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping', 'EarlyStopException', - 'plot_importance', 'plot_split_value_histogram', 'plot_metric', 'plot_tree', 'create_tree_digraph'] +__all__ = [ + "Dataset", + "Booster", + "CVBooster", + "Sequence", + "register_logger", + "train", + "cv", + "LGBMModel", + "LGBMRegressor", + "LGBMClassifier", + "LGBMRanker", + "DaskLGBMRegressor", + "DaskLGBMClassifier", + "DaskLGBMRanker", + "log_evaluation", + "record_evaluation", + "reset_parameter", + "early_stopping", + "EarlyStopException", + "plot_importance", + "plot_split_value_histogram", + "plot_metric", + "plot_tree", + "create_tree_digraph", +] diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 93862f983c4e..30788db76829 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -48,31 +48,23 @@ __all__ = [ - 'Booster', - 'Dataset', - 'LGBMDeprecationWarning', - 'LightGBMError', - 'register_logger', - 'Sequence', + "Booster", + "Dataset", + "LGBMDeprecationWarning", + "LightGBMError", + "register_logger", + "Sequence", ] _BoosterHandle = ctypes.c_void_p _DatasetHandle = ctypes.c_void_p -_ctypes_int_ptr = Union[ - "ctypes._Pointer[ctypes.c_int32]", - "ctypes._Pointer[ctypes.c_int64]" -] +_ctypes_int_ptr = Union["ctypes._Pointer[ctypes.c_int32]", "ctypes._Pointer[ctypes.c_int64]"] _ctypes_int_array = Union[ - "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]", - "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]" -] -_ctypes_float_ptr = Union[ - "ctypes._Pointer[ctypes.c_float]", - "ctypes._Pointer[ctypes.c_double]" + "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]", "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]" ] +_ctypes_float_ptr = Union["ctypes._Pointer[ctypes.c_float]", "ctypes._Pointer[ctypes.c_double]"] _ctypes_float_array = Union[ - "ctypes.Array[ctypes._Pointer[ctypes.c_float]]", - "ctypes.Array[ctypes._Pointer[ctypes.c_double]]" + "ctypes.Array[ctypes._Pointer[ctypes.c_float]]", "ctypes.Array[ctypes._Pointer[ctypes.c_double]]" ] _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] @@ -88,10 +80,7 @@ pa_Array, pa_ChunkedArray, ] -_LGBM_PositionType = Union[ - np.ndarray, - pd_Series -] +_LGBM_PositionType = Union[np.ndarray, pd_Series] _LGBM_InitScoreType = Union[ List[float], List[List[float]], @@ -112,7 +101,7 @@ "Sequence", List["Sequence"], List[np.ndarray], - pa_Table + pa_Table, ] _LGBM_LabelType = Union[ List[float], @@ -149,18 +138,20 @@ def _is_zero(x: float) -> bool: def _get_sample_count(total_nrow: int, params: str) -> int: sample_cnt = ctypes.c_int(0) - _safe_call(_LIB.LGBM_GetSampleCount( - ctypes.c_int32(total_nrow), - _c_str(params), - ctypes.byref(sample_cnt), - )) + _safe_call( + _LIB.LGBM_GetSampleCount( + ctypes.c_int32(total_nrow), + _c_str(params), + ctypes.byref(sample_cnt), + ) + ) return sample_cnt.value class _MissingType(Enum): - NONE = 'None' - NAN = 'NaN' - ZERO = 'Zero' + NONE = "None" + NAN = "NaN" + ZERO = "Zero" class _DummyLogger: @@ -180,9 +171,7 @@ def _has_method(logger: Any, method_name: str) -> bool: return callable(getattr(logger, method_name, None)) -def register_logger( - logger: Any, info_method_name: str = "info", warning_method_name: str = "warning" -) -> None: +def register_logger(logger: Any, info_method_name: str = "info", warning_method_name: str = "warning") -> None: """Register custom logger. Parameters @@ -195,9 +184,7 @@ def register_logger( Method used to log warning messages. """ if not _has_method(logger, info_method_name) or not _has_method(logger, warning_method_name): - raise TypeError( - f"Logger must provide '{info_method_name}' and '{warning_method_name}' method" - ) + raise TypeError(f"Logger must provide '{info_method_name}' and '{warning_method_name}' method") global _LOGGER, _INFO_METHOD_NAME, _WARNING_METHOD_NAME _LOGGER = logger @@ -212,8 +199,8 @@ def _normalize_native_string(func: Callable[[str], None]) -> Callable[[str], Non @wraps(func) def wrapper(msg: str) -> None: nonlocal msg_normalized - if msg.strip() == '': - msg = ''.join(msg_normalized) + if msg.strip() == "": + msg = "".join(msg_normalized) msg_normalized = [] return func(msg) else: @@ -237,7 +224,7 @@ def _log_native(msg: str) -> None: def _log_callback(msg: bytes) -> None: """Redirect logs from native library into Python.""" - _log_native(str(msg.decode('utf-8'))) + _log_native(str(msg.decode("utf-8"))) def _load_lib() -> ctypes.CDLL: @@ -248,14 +235,15 @@ def _load_lib() -> ctypes.CDLL: callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p) lib.callback = callback(_log_callback) # type: ignore[attr-defined] if lib.LGBM_RegisterLogCallback(lib.callback) != 0: - raise LightGBMError(lib.LGBM_GetLastError().decode('utf-8')) + raise LightGBMError(lib.LGBM_GetLastError().decode("utf-8")) return lib # we don't need lib_lightgbm while building docs _LIB: ctypes.CDLL -if environ.get('LIGHTGBM_BUILD_DOC', False): +if environ.get("LIGHTGBM_BUILD_DOC", False): from unittest.mock import Mock # isort: skip + _LIB = Mock(ctypes.CDLL) # type: ignore else: _LIB = _load_lib() @@ -274,7 +262,7 @@ def _safe_call(ret: int) -> None: The return value from C API calls. """ if ret != 0: - raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) + raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8")) def _is_numeric(obj: Any) -> bool: @@ -314,39 +302,24 @@ def _is_1d_list(data: Any) -> bool: def _is_list_of_numpy_arrays(data: Any) -> "TypeGuard[List[np.ndarray]]": - return ( - isinstance(data, list) - and all(isinstance(x, np.ndarray) for x in data) - ) + return isinstance(data, list) and all(isinstance(x, np.ndarray) for x in data) def _is_list_of_sequences(data: Any) -> "TypeGuard[List[Sequence]]": - return ( - isinstance(data, list) - and all(isinstance(x, Sequence) for x in data) - ) + return isinstance(data, list) and all(isinstance(x, Sequence) for x in data) def _is_1d_collection(data: Any) -> bool: """Check whether data is a 1-D collection.""" - return ( - _is_numpy_1d_array(data) - or _is_numpy_column_array(data) - or _is_1d_list(data) - or isinstance(data, pd_Series) - ) + return _is_numpy_1d_array(data) or _is_numpy_column_array(data) or _is_1d_list(data) or isinstance(data, pd_Series) -def _list_to_1d_numpy( - data: Any, - dtype: "np.typing.DTypeLike", - name: str -) -> np.ndarray: +def _list_to_1d_numpy(data: Any, dtype: "np.typing.DTypeLike", name: str) -> np.ndarray: """Convert data to numpy 1-D array.""" if _is_numpy_1d_array(data): return _cast_numpy_array_to_dtype(data, dtype) elif _is_numpy_column_array(data): - _log_warning('Converting column-vector to 1d array') + _log_warning("Converting column-vector to 1d array") array = data.ravel() return _cast_numpy_array_to_dtype(array, dtype) elif _is_1d_list(data): @@ -355,8 +328,9 @@ def _list_to_1d_numpy( _check_for_bad_pandas_dtypes(data.to_frame().dtypes) return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well else: - raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" - "It should be list, numpy 1-D array or pandas Series") + raise TypeError( + f"Wrong type({type(data).__name__}) for {name}.\n" "It should be list, numpy 1-D array or pandas Series" + ) def _is_numpy_2d_array(data: Any) -> bool: @@ -371,11 +345,7 @@ def _is_2d_list(data: Any) -> bool: def _is_2d_collection(data: Any) -> bool: """Check whether data is a 2-D collection.""" - return ( - _is_numpy_2d_array(data) - or _is_2d_list(data) - or isinstance(data, pd_DataFrame) - ) + return _is_numpy_2d_array(data) or _is_2d_list(data) or isinstance(data, pd_DataFrame) def _is_pyarrow_array(data: Any) -> "TypeGuard[Union[pa_Array, pa_ChunkedArray]]": @@ -439,12 +409,7 @@ def _export_arrow_to_c(data: pa_Table) -> _ArrowCArray: return _ArrowCArray(len(chunks), chunks, schema) - -def _data_to_2d_numpy( - data: Any, - dtype: "np.typing.DTypeLike", - name: str -) -> np.ndarray: +def _data_to_2d_numpy(data: Any, dtype: "np.typing.DTypeLike", name: str) -> np.ndarray: """Convert data to numpy 2-D array.""" if _is_numpy_2d_array(data): return _cast_numpy_array_to_dtype(data, dtype) @@ -453,8 +418,10 @@ def _data_to_2d_numpy( if isinstance(data, pd_DataFrame): _check_for_bad_pandas_dtypes(data.dtypes) return _cast_numpy_array_to_dtype(data.values, dtype) - raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n" - "It should be list of lists, numpy 2-D array or pandas DataFrame") + raise TypeError( + f"Wrong type({type(data).__name__}) for {name}.\n" + "It should be list of lists, numpy 2-D array or pandas DataFrame" + ) def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: @@ -462,7 +429,7 @@ def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndar if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() else: - raise RuntimeError('Expected float pointer') + raise RuntimeError("Expected float pointer") def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: @@ -470,7 +437,7 @@ def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndar if isinstance(cptr, ctypes.POINTER(ctypes.c_double)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() else: - raise RuntimeError('Expected double pointer') + raise RuntimeError("Expected double pointer") def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: @@ -478,7 +445,7 @@ def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarra if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() else: - raise RuntimeError('Expected int32 pointer') + raise RuntimeError("Expected int32 pointer") def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: @@ -486,12 +453,12 @@ def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarra if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() else: - raise RuntimeError('Expected int64 pointer') + raise RuntimeError("Expected int64 pointer") def _c_str(string: str) -> ctypes.c_char_p: """Convert a Python string to C string.""" - return ctypes.c_char_p(string.encode('utf-8')) + return ctypes.c_char_p(string.encode("utf-8")) def _c_array(ctype: type, values: List[Any]) -> ctypes.Array: @@ -528,8 +495,8 @@ def _param_dict_to_str(data: Optional[Dict[str, Any]]) -> str: elif isinstance(val, (str, Path, _NUMERIC_TYPES)) or _is_numeric(val): pairs.append(f"{key}={val}") elif val is not None: - raise TypeError(f'Unknown type of parameter:{key}, got:{type(val).__name__}') - return ' '.join(pairs) + raise TypeError(f"Unknown type of parameter:{key}, got:{type(val).__name__}") + return " ".join(pairs) class _TempFile: @@ -569,22 +536,17 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_DumpParamAliases( - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) + _safe_call(_LIB.LGBM_DumpParamAliases(ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), ptr_string_buffer)) actual_len = tmp_out_len.value # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_DumpParamAliases( - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) + _safe_call( + _LIB.LGBM_DumpParamAliases(ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer) + ) return json.loads( - string_buffer.value.decode('utf-8'), - object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} + string_buffer.value.decode("utf-8"), object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} ) @classmethod @@ -693,29 +655,26 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va "weight": _C_API_DTYPE_FLOAT32, "init_score": _C_API_DTYPE_FLOAT64, "group": _C_API_DTYPE_INT32, - "position": _C_API_DTYPE_INT32 + "position": _C_API_DTYPE_INT32, } """String name to int feature importance type mapper""" -_FEATURE_IMPORTANCE_TYPE_MAPPER = { - "split": _C_API_FEATURE_IMPORTANCE_SPLIT, - "gain": _C_API_FEATURE_IMPORTANCE_GAIN -} +_FEATURE_IMPORTANCE_TYPE_MAPPER = {"split": _C_API_FEATURE_IMPORTANCE_SPLIT, "gain": _C_API_FEATURE_IMPORTANCE_GAIN} def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray: """Fix the memory of multi-dimensional sliced object.""" if isinstance(data, np.ndarray) and isinstance(data.base, np.ndarray): if not data.flags.c_contiguous: - _log_warning("Usage of np.ndarray subset (sliced data) is not recommended " - "due to it will double the peak memory cost in LightGBM.") + _log_warning( + "Usage of np.ndarray subset (sliced data) is not recommended " + "due to it will double the peak memory cost in LightGBM." + ) return np.copy(data) return data -def _c_float_array( - data: np.ndarray -) -> Tuple[_ctypes_float_ptr, int, np.ndarray]: +def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray]: """Get pointer of float numpy array / list.""" if _is_1d_list(data): data = np.array(data, copy=False) @@ -736,9 +695,7 @@ def _c_float_array( return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed -def _c_int_array( - data: np.ndarray -) -> Tuple[_ctypes_int_ptr, int, np.ndarray]: +def _c_int_array(data: np.ndarray) -> Tuple[_ctypes_int_ptr, int, np.ndarray]: """Get pointer of int numpy array / list.""" if _is_1d_list(data): data = np.array(data, copy=False) @@ -760,28 +717,24 @@ def _c_int_array( def _is_allowed_numpy_dtype(dtype: type) -> bool: - float128 = getattr(np, 'float128', type(None)) - return ( - issubclass(dtype, (np.integer, np.floating, np.bool_)) - and not issubclass(dtype, (np.timedelta64, float128)) - ) + float128 = getattr(np, "float128", type(None)) + return issubclass(dtype, (np.integer, np.floating, np.bool_)) and not issubclass(dtype, (np.timedelta64, float128)) def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: bad_pandas_dtypes = [ - f'{column_name}: {pandas_dtype}' + f"{column_name}: {pandas_dtype}" for column_name, pandas_dtype in pandas_dtypes_series.items() if not _is_allowed_numpy_dtype(pandas_dtype.type) ] if bad_pandas_dtypes: - raise ValueError('pandas dtypes must be int, float or bool.\n' - f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}') + raise ValueError( + 'pandas dtypes must be int, float or bool.\n' + f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}' + ) -def _pandas_to_numpy( - data: pd_DataFrame, - target_dtype: "np.typing.DTypeLike" -) -> np.ndarray: +def _pandas_to_numpy(data: pd_DataFrame, target_dtype: "np.typing.DTypeLike") -> np.ndarray: _check_for_bad_pandas_dtypes(data.dtypes) try: # most common case (no nullable dtypes) @@ -799,17 +752,17 @@ def _data_from_pandas( data: pd_DataFrame, feature_name: _LGBM_FeatureNameConfiguration, categorical_feature: _LGBM_CategoricalFeatureConfiguration, - pandas_categorical: Optional[List[List]] + pandas_categorical: Optional[List[List]], ) -> Tuple[np.ndarray, List[str], Union[List[str], List[int]], List[List]]: if len(data.shape) != 2 or data.shape[0] < 1: - raise ValueError('Input data must be 2 dimensional and non empty.') + raise ValueError("Input data must be 2 dimensional and non empty.") # take shallow copy in case we modify categorical columns # whole column modifications don't change the original df data = data.copy(deep=False) # determine feature names - if feature_name == 'auto': + if feature_name == "auto": feature_name = [str(col) for col in data.columns] # determine categorical features @@ -819,7 +772,7 @@ def _data_from_pandas( pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] else: if len(cat_cols) != len(pandas_categorical): - raise ValueError('train and valid dataset categorical_feature do not match.') + raise ValueError("train and valid dataset categorical_feature do not match.") for col, category in zip(cat_cols, pandas_categorical): if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) @@ -827,7 +780,7 @@ def _data_from_pandas( data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) # use cat cols from DataFrame - if categorical_feature == 'auto': + if categorical_feature == "auto": categorical_feature = cat_cols_not_ordered df_dtypes = [dtype.type for dtype in data.dtypes] @@ -835,35 +788,28 @@ def _data_from_pandas( df_dtypes.append(np.float32) target_dtype = np.result_type(*df_dtypes) - return ( - _pandas_to_numpy(data, target_dtype=target_dtype), - feature_name, - categorical_feature, - pandas_categorical - ) + return (_pandas_to_numpy(data, target_dtype=target_dtype), feature_name, categorical_feature, pandas_categorical) def _dump_pandas_categorical( - pandas_categorical: Optional[List[List]], - file_name: Optional[Union[str, Path]] = None + pandas_categorical: Optional[List[List]], file_name: Optional[Union[str, Path]] = None ) -> str: categorical_json = json.dumps(pandas_categorical, default=_json_default_with_numpy) - pandas_str = f'\npandas_categorical:{categorical_json}\n' + pandas_str = f"\npandas_categorical:{categorical_json}\n" if file_name is not None: - with open(file_name, 'a') as f: + with open(file_name, "a") as f: f.write(pandas_str) return pandas_str def _load_pandas_categorical( - file_name: Optional[Union[str, Path]] = None, - model_str: Optional[str] = None + file_name: Optional[Union[str, Path]] = None, model_str: Optional[str] = None ) -> Optional[List[List]]: - pandas_key = 'pandas_categorical:' + pandas_key = "pandas_categorical:" offset = -len(pandas_key) if file_name is not None: max_offset = -getsize(file_name) - with open(file_name, 'rb') as f: + with open(file_name, "rb") as f: while True: if offset < max_offset: offset = max_offset @@ -872,14 +818,14 @@ def _load_pandas_categorical( if len(lines) >= 2: break offset *= 2 - last_line = lines[-1].decode('utf-8').strip() + last_line = lines[-1].decode("utf-8").strip() if not last_line.startswith(pandas_key): - last_line = lines[-2].decode('utf-8').strip() + last_line = lines[-2].decode("utf-8").strip() elif model_str is not None: - idx = model_str.rfind('\n', 0, offset) + idx = model_str.rfind("\n", 0, offset) last_line = model_str[idx:].strip() if last_line.startswith(pandas_key): - return json.loads(last_line[len(pandas_key):]) + return json.loads(last_line[len(pandas_key) :]) else: return None @@ -966,7 +912,7 @@ def __init__( booster_handle: _BoosterHandle, pandas_categorical: Optional[List[List]], pred_parameter: Dict[str, Any], - manage_handle: bool + manage_handle: bool, ): """Initialize the _InnerPredictor. @@ -988,20 +934,11 @@ def __init__( self.pred_parameter = _param_dict_to_str(pred_parameter) out_num_class = ctypes.c_int(0) - _safe_call( - _LIB.LGBM_BoosterGetNumClasses( - self._handle, - ctypes.byref(out_num_class) - ) - ) + _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) self.num_class = out_num_class.value @classmethod - def from_booster( - cls, - booster: "Booster", - pred_parameter: Dict[str, Any] - ) -> "_InnerPredictor": + def from_booster(cls, booster: "Booster", pred_parameter: Dict[str, Any]) -> "_InnerPredictor": """Initialize an ``_InnerPredictor`` from a ``Booster``. Parameters @@ -1012,25 +949,16 @@ def from_booster( Other parameters for the prediction. """ out_cur_iter = ctypes.c_int(0) - _safe_call( - _LIB.LGBM_BoosterGetCurrentIteration( - booster._handle, - ctypes.byref(out_cur_iter) - ) - ) + _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(booster._handle, ctypes.byref(out_cur_iter))) return cls( booster_handle=booster._handle, pandas_categorical=booster.pandas_categorical, pred_parameter=pred_parameter, - manage_handle=False + manage_handle=False, ) @classmethod - def from_model_file( - cls, - model_file: Union[str, Path], - pred_parameter: Dict[str, Any] - ) -> "_InnerPredictor": + def from_model_file(cls, model_file: Union[str, Path], pred_parameter: Dict[str, Any]) -> "_InnerPredictor": """Initialize an ``_InnerPredictor`` from a text file containing a LightGBM model. Parameters @@ -1044,16 +972,14 @@ def from_model_file( out_num_iterations = ctypes.c_int(0) _safe_call( _LIB.LGBM_BoosterCreateFromModelfile( - _c_str(str(model_file)), - ctypes.byref(out_num_iterations), - ctypes.byref(booster_handle) + _c_str(str(model_file)), ctypes.byref(out_num_iterations), ctypes.byref(booster_handle) ) ) return cls( booster_handle=booster_handle, pandas_categorical=_load_pandas_categorical(file_name=model_file), pred_parameter=pred_parameter, - manage_handle=True + manage_handle=True, ) def __del__(self) -> None: @@ -1065,8 +991,8 @@ def __del__(self) -> None: def __getstate__(self) -> Dict[str, Any]: this = self.__dict__.copy() - this.pop('handle', None) - this.pop('_handle', None) + this.pop("handle", None) + this.pop("_handle", None) return this def predict( @@ -1078,7 +1004,7 @@ def predict( pred_leaf: bool = False, pred_contrib: bool = False, data_has_header: bool = False, - validate_features: bool = False + validate_features: bool = False, ) -> Union[np.ndarray, scipy.sparse.spmatrix, List[scipy.sparse.spmatrix]]: """Predict logic. @@ -1117,7 +1043,7 @@ def predict( elif isinstance(data, pd_DataFrame) and validate_features: data_names = [str(x) for x in data.columns] ptr_names = (ctypes.c_char_p * len(data_names))() - ptr_names[:] = [x.encode('utf-8') for x in data_names] + ptr_names[:] = [x.encode("utf-8") for x in data_names] _safe_call( _LIB.LGBM_BoosterValidateFeatureNames( self._handle, @@ -1128,10 +1054,7 @@ def predict( if isinstance(data, pd_DataFrame): data = _data_from_pandas( - data=data, - feature_name="auto", - categorical_feature="auto", - pandas_categorical=self.pandas_categorical + data=data, feature_name="auto", categorical_feature="auto", pandas_categorical=self.pandas_categorical )[0] predict_type = _C_API_PREDICT_NORMAL @@ -1144,74 +1067,59 @@ def predict( if isinstance(data, (str, Path)): with _TempFile() as f: - _safe_call(_LIB.LGBM_BoosterPredictForFile( - self._handle, - _c_str(str(data)), - ctypes.c_int(data_has_header), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - _c_str(f.name))) + _safe_call( + _LIB.LGBM_BoosterPredictForFile( + self._handle, + _c_str(str(data)), + ctypes.c_int(data_has_header), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + _c_str(f.name), + ) + ) preds = np.loadtxt(f.name, dtype=np.float64) nrow = preds.shape[0] elif isinstance(data, scipy.sparse.csr_matrix): preds, nrow = self.__pred_for_csr( - csr=data, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + csr=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) elif isinstance(data, scipy.sparse.csc_matrix): preds, nrow = self.__pred_for_csc( - csc=data, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + csc=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) elif isinstance(data, np.ndarray): preds, nrow = self.__pred_for_np2d( - mat=data, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + mat=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) elif _is_pyarrow_table(data): preds, nrow = self.__pred_for_pyarrow_table( - table=data, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + table=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) elif isinstance(data, list): try: data = np.array(data) except BaseException as err: - raise ValueError('Cannot convert data list to numpy array.') from err + raise ValueError("Cannot convert data list to numpy array.") from err preds, nrow = self.__pred_for_np2d( - mat=data, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + mat=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) elif isinstance(data, dt_DataTable): preds, nrow = self.__pred_for_np2d( mat=data.to_numpy(), start_iteration=start_iteration, num_iteration=num_iteration, - predict_type=predict_type + predict_type=predict_type, ) else: try: - _log_warning('Converting data to scipy sparse matrix.') + _log_warning("Converting data to scipy sparse matrix.") csr = scipy.sparse.csr_matrix(data) except BaseException as err: - raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err + raise TypeError(f"Cannot predict data for type {type(data).__name__}") from err preds, nrow = self.__pred_for_csr( - csr=csr, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + csr=csr, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) if pred_leaf: preds = preds.astype(np.int32) @@ -1220,39 +1128,33 @@ def predict( if preds.size % nrow == 0: preds = preds.reshape(nrow, -1) else: - raise ValueError(f'Length of predict result ({preds.size}) cannot be divide nrow ({nrow})') + raise ValueError(f"Length of predict result ({preds.size}) cannot be divide nrow ({nrow})") return preds - def __get_num_preds( - self, - start_iteration: int, - num_iteration: int, - nrow: int, - predict_type: int - ) -> int: + def __get_num_preds(self, start_iteration: int, num_iteration: int, nrow: int, predict_type: int) -> int: """Get size of prediction result.""" if nrow > _MAX_INT32: - raise LightGBMError('LightGBM cannot perform prediction for data ' - f'with number of rows greater than MAX_INT32 ({_MAX_INT32}).\n' - 'You can split your data into chunks ' - 'and then concatenate predictions for them') + raise LightGBMError( + "LightGBM cannot perform prediction for data " + f"with number of rows greater than MAX_INT32 ({_MAX_INT32}).\n" + "You can split your data into chunks " + "and then concatenate predictions for them" + ) n_preds = ctypes.c_int64(0) - _safe_call(_LIB.LGBM_BoosterCalcNumPredict( - self._handle, - ctypes.c_int(nrow), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - ctypes.byref(n_preds))) + _safe_call( + _LIB.LGBM_BoosterCalcNumPredict( + self._handle, + ctypes.c_int(nrow), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.byref(n_preds), + ) + ) return n_preds.value def __inner_predict_np2d( - self, - mat: np.ndarray, - start_iteration: int, - num_iteration: int, - predict_type: int, - preds: Optional[np.ndarray] + self, mat: np.ndarray, start_iteration: int, num_iteration: int, predict_type: int, preds: Optional[np.ndarray] ) -> Tuple[np.ndarray, int]: if mat.dtype == np.float32 or mat.dtype == np.float64: data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) @@ -1260,60 +1162,60 @@ def __inner_predict_np2d( data = np.array(mat.reshape(mat.size), dtype=np.float32) ptr_data, type_ptr_data, _ = _c_float_array(data) n_preds = self.__get_num_preds( - start_iteration=start_iteration, - num_iteration=num_iteration, - nrow=mat.shape[0], - predict_type=predict_type + start_iteration=start_iteration, num_iteration=num_iteration, nrow=mat.shape[0], predict_type=predict_type ) if preds is None: preds = np.empty(n_preds, dtype=np.float64) elif len(preds.shape) != 1 or len(preds) != n_preds: raise ValueError("Wrong length of pre-allocated predict array") out_num_preds = ctypes.c_int64(0) - _safe_call(_LIB.LGBM_BoosterPredictForMat( - self._handle, - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int32(mat.shape[0]), - ctypes.c_int32(mat.shape[1]), - ctypes.c_int(_C_API_IS_ROW_MAJOR), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - ctypes.byref(out_num_preds), - preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + _safe_call( + _LIB.LGBM_BoosterPredictForMat( + self._handle, + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int32(mat.shape[0]), + ctypes.c_int32(mat.shape[1]), + ctypes.c_int(_C_API_IS_ROW_MAJOR), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), + ) + ) if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, mat.shape[0] def __pred_for_np2d( - self, - mat: np.ndarray, - start_iteration: int, - num_iteration: int, - predict_type: int + self, mat: np.ndarray, start_iteration: int, num_iteration: int, predict_type: int ) -> Tuple[np.ndarray, int]: """Predict for a 2-D numpy matrix.""" if len(mat.shape) != 2: - raise ValueError('Input numpy.ndarray or list must be 2 dimensional') + raise ValueError("Input numpy.ndarray or list must be 2 dimensional") nrow = mat.shape[0] if nrow > _MAX_INT32: sections = np.arange(start=_MAX_INT32, stop=nrow, step=_MAX_INT32) # __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal - n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])] + n_preds = [ + self.__get_num_preds(start_iteration, num_iteration, i, predict_type) + for i in np.diff([0] + list(sections) + [nrow]) + ] n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() preds = np.empty(sum(n_preds), dtype=np.float64) - for chunk, (start_idx_pred, end_idx_pred) in zip(np.array_split(mat, sections), - zip(n_preds_sections, n_preds_sections[1:])): + for chunk, (start_idx_pred, end_idx_pred) in zip( + np.array_split(mat, sections), zip(n_preds_sections, n_preds_sections[1:]) + ): # avoid memory consumption by arrays concatenation operations self.__inner_predict_np2d( mat=chunk, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type, - preds=preds[start_idx_pred:end_idx_pred] + preds=preds[start_idx_pred:end_idx_pred], ) return preds, nrow else: @@ -1322,7 +1224,7 @@ def __pred_for_np2d( start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type, - preds=None + preds=None, ) def __create_sparse_native( @@ -1334,7 +1236,7 @@ def __create_sparse_native( out_ptr_data: "ctypes._Pointer", indptr_type: int, data_type: int, - is_csr: bool + is_csr: bool, ) -> Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]]: # create numpy array from output arrays data_indices_len = out_shape[0] @@ -1363,8 +1265,8 @@ def __create_sparse_native( offset = 0 for cs_indptr in out_indptr_arrays: matrix_indptr_len = cs_indptr[cs_indptr.shape[0] - 1] - cs_indices = out_indices[offset + cs_indptr[0]:offset + matrix_indptr_len] - cs_data = out_data[offset + cs_indptr[0]:offset + matrix_indptr_len] + cs_indices = out_indices[offset + cs_indptr[0] : offset + matrix_indptr_len] + cs_data = out_data[offset + cs_indptr[0] : offset + matrix_indptr_len] offset += matrix_indptr_len # same shape as input csr or csc matrix except extra column for expected value cs_shape = [cs.shape[0], cs.shape[1] + 1] @@ -1374,8 +1276,11 @@ def __create_sparse_native( else: cs_output_matrices.append(scipy.sparse.csc_matrix((cs_data, cs_indices, cs_indptr), cs_shape)) # free the temporary native indptr, indices, and data - _safe_call(_LIB.LGBM_BoosterFreePredictSparse(out_ptr_indptr, out_ptr_indices, out_ptr_data, - ctypes.c_int(indptr_type), ctypes.c_int(data_type))) + _safe_call( + _LIB.LGBM_BoosterFreePredictSparse( + out_ptr_indptr, out_ptr_indices, out_ptr_data, ctypes.c_int(indptr_type), ctypes.c_int(data_type) + ) + ) if len(cs_output_matrices) == 1: return cs_output_matrices[0] return cs_output_matrices @@ -1386,14 +1291,11 @@ def __inner_predict_csr( start_iteration: int, num_iteration: int, predict_type: int, - preds: Optional[np.ndarray] + preds: Optional[np.ndarray], ) -> Tuple[np.ndarray, int]: nrow = len(csr.indptr) - 1 n_preds = self.__get_num_preds( - start_iteration=start_iteration, - num_iteration=num_iteration, - nrow=nrow, - predict_type=predict_type + start_iteration=start_iteration, num_iteration=num_iteration, nrow=nrow, predict_type=predict_type ) if preds is None: preds = np.empty(n_preds, dtype=np.float64) @@ -1407,32 +1309,31 @@ def __inner_predict_csr( assert csr.shape[1] <= _MAX_INT32 csr_indices = csr.indices.astype(np.int32, copy=False) - _safe_call(_LIB.LGBM_BoosterPredictForCSR( - self._handle, - ptr_indptr, - ctypes.c_int(type_ptr_indptr), - csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int64(len(csr.indptr)), - ctypes.c_int64(len(csr.data)), - ctypes.c_int64(csr.shape[1]), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - ctypes.byref(out_num_preds), - preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + _safe_call( + _LIB.LGBM_BoosterPredictForCSR( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csr.indptr)), + ctypes.c_int64(len(csr.data)), + ctypes.c_int64(csr.shape[1]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), + ) + ) if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, nrow def __inner_predict_csr_sparse( - self, - csr: scipy.sparse.csr_matrix, - start_iteration: int, - num_iteration: int, - predict_type: int + self, csr: scipy.sparse.csr_matrix, start_iteration: int, num_iteration: int, predict_type: int ) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]: ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) ptr_data, type_ptr_data, _ = _c_float_array(csr.data) @@ -1450,25 +1351,28 @@ def __inner_predict_csr_sparse( else: out_ptr_data = ctypes.POINTER(ctypes.c_double)() out_shape = np.empty(2, dtype=np.int64) - _safe_call(_LIB.LGBM_BoosterPredictSparseOutput( - self._handle, - ptr_indptr, - ctypes.c_int(type_ptr_indptr), - csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int64(len(csr.indptr)), - ctypes.c_int64(len(csr.data)), - ctypes.c_int64(csr.shape[1]), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - ctypes.c_int(matrix_type), - out_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), - ctypes.byref(out_ptr_indptr), - ctypes.byref(out_ptr_indices), - ctypes.byref(out_ptr_data))) + _safe_call( + _LIB.LGBM_BoosterPredictSparseOutput( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csr.indptr)), + ctypes.c_int64(len(csr.data)), + ctypes.c_int64(csr.shape[1]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.c_int(matrix_type), + out_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), + ctypes.byref(out_ptr_indptr), + ctypes.byref(out_ptr_indices), + ctypes.byref(out_ptr_data), + ) + ) matrices = self.__create_sparse_native( cs=csr, out_shape=out_shape, @@ -1477,25 +1381,18 @@ def __inner_predict_csr_sparse( out_ptr_data=out_ptr_data, indptr_type=type_ptr_indptr, data_type=type_ptr_data, - is_csr=True + is_csr=True, ) nrow = len(csr.indptr) - 1 return matrices, nrow def __pred_for_csr( - self, - csr: scipy.sparse.csr_matrix, - start_iteration: int, - num_iteration: int, - predict_type: int + self, csr: scipy.sparse.csr_matrix, start_iteration: int, num_iteration: int, predict_type: int ) -> Tuple[np.ndarray, int]: """Predict for a CSR data.""" if predict_type == _C_API_PREDICT_CONTRIB: return self.__inner_predict_csr_sparse( - csr=csr, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + csr=csr, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) nrow = len(csr.indptr) - 1 if nrow > _MAX_INT32: @@ -1504,15 +1401,16 @@ def __pred_for_csr( n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)] n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum() preds = np.empty(sum(n_preds), dtype=np.float64) - for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(zip(sections, sections[1:]), - zip(n_preds_sections, n_preds_sections[1:])): + for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip( + zip(sections, sections[1:]), zip(n_preds_sections, n_preds_sections[1:]) + ): # avoid memory consumption by arrays concatenation operations self.__inner_predict_csr( csr=csr[start_idx:end_idx], start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type, - preds=preds[start_idx_pred:end_idx_pred] + preds=preds[start_idx_pred:end_idx_pred], ) return preds, nrow else: @@ -1521,15 +1419,11 @@ def __pred_for_csr( start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type, - preds=None + preds=None, ) def __inner_predict_sparse_csc( - self, - csc: scipy.sparse.csc_matrix, - start_iteration: int, - num_iteration: int, - predict_type: int + self, csc: scipy.sparse.csc_matrix, start_iteration: int, num_iteration: int, predict_type: int ): ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) ptr_data, type_ptr_data, _ = _c_float_array(csc.data) @@ -1547,25 +1441,28 @@ def __inner_predict_sparse_csc( else: out_ptr_data = ctypes.POINTER(ctypes.c_double)() out_shape = np.empty(2, dtype=np.int64) - _safe_call(_LIB.LGBM_BoosterPredictSparseOutput( - self._handle, - ptr_indptr, - ctypes.c_int(type_ptr_indptr), - csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int64(len(csc.indptr)), - ctypes.c_int64(len(csc.data)), - ctypes.c_int64(csc.shape[0]), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - ctypes.c_int(matrix_type), - out_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), - ctypes.byref(out_ptr_indptr), - ctypes.byref(out_ptr_indices), - ctypes.byref(out_ptr_data))) + _safe_call( + _LIB.LGBM_BoosterPredictSparseOutput( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csc.indptr)), + ctypes.c_int64(len(csc.data)), + ctypes.c_int64(csc.shape[0]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.c_int(matrix_type), + out_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), + ctypes.byref(out_ptr_indptr), + ctypes.byref(out_ptr_indices), + ctypes.byref(out_ptr_data), + ) + ) matrices = self.__create_sparse_native( cs=csc, out_shape=out_shape, @@ -1574,39 +1471,26 @@ def __inner_predict_sparse_csc( out_ptr_data=out_ptr_data, indptr_type=type_ptr_indptr, data_type=type_ptr_data, - is_csr=False + is_csr=False, ) nrow = csc.shape[0] return matrices, nrow def __pred_for_csc( - self, - csc: scipy.sparse.csc_matrix, - start_iteration: int, - num_iteration: int, - predict_type: int + self, csc: scipy.sparse.csc_matrix, start_iteration: int, num_iteration: int, predict_type: int ) -> Tuple[np.ndarray, int]: """Predict for a CSC data.""" nrow = csc.shape[0] if nrow > _MAX_INT32: return self.__pred_for_csr( - csr=csc.tocsr(), - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + csr=csc.tocsr(), start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) if predict_type == _C_API_PREDICT_CONTRIB: return self.__inner_predict_sparse_csc( - csc=csc, - start_iteration=start_iteration, - num_iteration=num_iteration, - predict_type=predict_type + csc=csc, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type ) n_preds = self.__get_num_preds( - start_iteration=start_iteration, - num_iteration=num_iteration, - nrow=nrow, - predict_type=predict_type + start_iteration=start_iteration, num_iteration=num_iteration, nrow=nrow, predict_type=predict_type ) preds = np.empty(n_preds, dtype=np.float64) out_num_preds = ctypes.c_int64(0) @@ -1617,32 +1501,31 @@ def __pred_for_csc( assert csc.shape[0] <= _MAX_INT32 csc_indices = csc.indices.astype(np.int32, copy=False) - _safe_call(_LIB.LGBM_BoosterPredictForCSC( - self._handle, - ptr_indptr, - ctypes.c_int(type_ptr_indptr), - csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int64(len(csc.indptr)), - ctypes.c_int64(len(csc.data)), - ctypes.c_int64(csc.shape[0]), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - ctypes.byref(out_num_preds), - preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + _safe_call( + _LIB.LGBM_BoosterPredictForCSC( + self._handle, + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csc.indptr)), + ctypes.c_int64(len(csc.data)), + ctypes.c_int64(csc.shape[0]), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), + ) + ) if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, nrow - + def __pred_for_pyarrow_table( - self, - table: pa_Table, - start_iteration: int, - num_iteration: int, - predict_type: int + self, table: pa_Table, start_iteration: int, num_iteration: int, predict_type: int ) -> Tuple[np.ndarray, int]: """Predict for a PyArrow table.""" if not PYARROW_INSTALLED: @@ -1654,27 +1537,27 @@ def __pred_for_pyarrow_table( # Prepare prediction output array n_preds = self.__get_num_preds( - start_iteration=start_iteration, - num_iteration=num_iteration, - nrow=table.num_rows, - predict_type=predict_type + start_iteration=start_iteration, num_iteration=num_iteration, nrow=table.num_rows, predict_type=predict_type ) preds = np.empty(n_preds, dtype=np.float64) out_num_preds = ctypes.c_int64(0) # Export Arrow table to C and run prediction c_array = _export_arrow_to_c(table) - _safe_call(_LIB.LGBM_BoosterPredictForArrow( - self._handle, - ctypes.c_int64(c_array.n_chunks), - ctypes.c_void_p(c_array.chunks_ptr), - ctypes.c_void_p(c_array.schema_ptr), - ctypes.c_int(predict_type), - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - _c_str(self.pred_parameter), - ctypes.byref(out_num_preds), - preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + _safe_call( + _LIB.LGBM_BoosterPredictForArrow( + self._handle, + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), + ) + ) if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, table.num_rows @@ -1688,9 +1571,7 @@ def current_iteration(self) -> int: The index of the current iteration. """ out_cur_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetCurrentIteration( - self._handle, - ctypes.byref(out_cur_iter))) + _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(self._handle, ctypes.byref(out_cur_iter))) return out_cur_iter.value @@ -1705,8 +1586,8 @@ def __init__( weight: Optional[_LGBM_WeightType] = None, group: Optional[_LGBM_GroupType] = None, init_score: Optional[_LGBM_InitScoreType] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", params: Optional[Dict[str, Any]] = None, free_raw_data: bool = True, position: Optional[_LGBM_PositionType] = None, @@ -1801,20 +1682,18 @@ def _create_sample_indices(self, total_nrow: int) -> np.ndarray: ptr_data, _, _ = _c_int_array(indices) actual_sample_cnt = ctypes.c_int32(0) - _safe_call(_LIB.LGBM_SampleIndices( - ctypes.c_int32(total_nrow), - _c_str(param_str), - ptr_data, - ctypes.byref(actual_sample_cnt), - )) + _safe_call( + _LIB.LGBM_SampleIndices( + ctypes.c_int32(total_nrow), + _c_str(param_str), + ptr_data, + ctypes.byref(actual_sample_cnt), + ) + ) assert sample_cnt == actual_sample_cnt.value return indices - def _init_from_ref_dataset( - self, - total_nrow: int, - ref_dataset: _DatasetHandle - ) -> 'Dataset': + def _init_from_ref_dataset(self, total_nrow: int, ref_dataset: _DatasetHandle) -> "Dataset": """Create dataset from a reference dataset. Parameters @@ -1830,11 +1709,13 @@ def _init_from_ref_dataset( Constructed Dataset object. """ self._handle = ctypes.c_void_p() - _safe_call(_LIB.LGBM_DatasetCreateByReference( - ref_dataset, - ctypes.c_int64(total_nrow), - ctypes.byref(self._handle), - )) + _safe_call( + _LIB.LGBM_DatasetCreateByReference( + ref_dataset, + ctypes.c_int64(total_nrow), + ctypes.byref(self._handle), + ) + ) return self def _init_from_sample( @@ -1886,20 +1767,22 @@ def _init_from_sample( self._handle = ctypes.c_void_p() params_str = _param_dict_to_str(self.get_params()) - _safe_call(_LIB.LGBM_DatasetCreateFromSampledColumn( - ctypes.cast(sample_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), - ctypes.cast(indices_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_int32))), - ctypes.c_int32(ncol), - num_per_col_ptr, - ctypes.c_int32(sample_cnt), - ctypes.c_int32(total_nrow), - ctypes.c_int64(total_nrow), - _c_str(params_str), - ctypes.byref(self._handle), - )) + _safe_call( + _LIB.LGBM_DatasetCreateFromSampledColumn( + ctypes.cast(sample_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), + ctypes.cast(indices_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_int32))), + ctypes.c_int32(ncol), + num_per_col_ptr, + ctypes.c_int32(sample_cnt), + ctypes.c_int32(total_nrow), + ctypes.c_int64(total_nrow), + _c_str(params_str), + ctypes.byref(self._handle), + ) + ) return self - def _push_rows(self, data: np.ndarray) -> 'Dataset': + def _push_rows(self, data: np.ndarray) -> "Dataset": """Add rows to Dataset. Parameters @@ -1916,14 +1799,16 @@ def _push_rows(self, data: np.ndarray) -> 'Dataset': data = data.reshape(data.size) data_ptr, data_type, _ = _c_float_array(data) - _safe_call(_LIB.LGBM_DatasetPushRows( - self._handle, - data_ptr, - data_type, - ctypes.c_int32(nrow), - ctypes.c_int32(ncol), - ctypes.c_int32(self._start_row), - )) + _safe_call( + _LIB.LGBM_DatasetPushRows( + self._handle, + data_ptr, + data_type, + ctypes.c_int32(nrow), + ctypes.c_int32(ncol), + ctypes.c_int32(self._start_row), + ) + ) self._start_row += nrow return self @@ -1937,27 +1822,29 @@ def get_params(self) -> Dict[str, Any]: """ if self.params is not None: # no min_data, nthreads and verbose in this function - dataset_params = _ConfigAliases.get("bin_construct_sample_cnt", - "categorical_feature", - "data_random_seed", - "enable_bundle", - "feature_pre_filter", - "forcedbins_filename", - "group_column", - "header", - "ignore_column", - "is_enable_sparse", - "label_column", - "linear_tree", - "max_bin", - "max_bin_by_feature", - "min_data_in_bin", - "pre_partition", - "precise_float_parser", - "two_round", - "use_missing", - "weight_column", - "zero_as_missing") + dataset_params = _ConfigAliases.get( + "bin_construct_sample_cnt", + "categorical_feature", + "data_random_seed", + "enable_bundle", + "feature_pre_filter", + "forcedbins_filename", + "group_column", + "header", + "ignore_column", + "is_enable_sparse", + "label_column", + "linear_tree", + "max_bin", + "max_bin_by_feature", + "min_data_in_bin", + "pre_partition", + "precise_float_parser", + "two_round", + "use_missing", + "weight_column", + "zero_as_missing", + ) return {k: v for k, v in self.params.items() if k in dataset_params} else: return {} @@ -1975,7 +1862,7 @@ def _set_init_score_by_predictor( self, predictor: Optional[_InnerPredictor], data: _LGBM_TrainDataType, - used_indices: Optional[Union[List[int], np.ndarray]] + used_indices: Optional[Union[List[int], np.ndarray]], ) -> "Dataset": data_has_header = False if isinstance(data, (str, Path)) and self.params is not None: @@ -1984,9 +1871,7 @@ def _set_init_score_by_predictor( num_data = self.num_data() if predictor is not None: init_score: Union[np.ndarray, scipy.sparse.spmatrix] = predictor.predict( - data=data, - raw_score=True, - data_has_header=data_has_header + data=data, raw_score=True, data_has_header=data_has_header ) init_score = init_score.ravel() if used_indices is not None: @@ -1996,7 +1881,9 @@ def _set_init_score_by_predictor( assert num_data == len(used_indices) for i in range(len(used_indices)): for j in range(predictor.num_class): - sub_init_score[i * predictor.num_class + j] = init_score[used_indices[i] * predictor.num_class + j] + sub_init_score[i * predictor.num_class + j] = init_score[ + used_indices[i] * predictor.num_class + j + ] init_score = sub_init_score if predictor.num_class > 1: # need to regroup init_score @@ -2024,7 +1911,7 @@ def _lazy_init( feature_name: _LGBM_FeatureNameConfiguration, categorical_feature: _LGBM_CategoricalFeatureConfiguration, params: Optional[Dict[str, Any]], - position: Optional[_LGBM_PositionType] + position: Optional[_LGBM_PositionType], ) -> "Dataset": if data is None: self._handle = None @@ -2037,7 +1924,7 @@ def _lazy_init( data=data, feature_name=feature_name, categorical_feature=categorical_feature, - pandas_categorical=self.pandas_categorical + pandas_categorical=self.pandas_categorical, ) # process for args @@ -2045,8 +1932,10 @@ def _lazy_init( args_names = inspect.signature(self.__class__._lazy_init).parameters.keys() for key in params.keys(): if key in args_names: - _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' - f'Please use {key} argument of the Dataset constructor to pass this parameter.') + _log_warning( + f"{key} keyword has been found in `params` and will be ignored.\n" + f"Please use {key} argument of the Dataset constructor to pass this parameter." + ) # get categorical features if isinstance(categorical_feature, list): categorical_indices = set() @@ -2065,9 +1954,9 @@ def _lazy_init( if cat_alias in params: # If the params[cat_alias] is equal to categorical_indices, do not report the warning. if not (isinstance(params[cat_alias], list) and set(params[cat_alias]) == categorical_indices): - _log_warning(f'{cat_alias} in param dict is overridden.') + _log_warning(f"{cat_alias} in param dict is overridden.") params.pop(cat_alias, None) - params['categorical_column'] = sorted(categorical_indices) + params["categorical_column"] = sorted(categorical_indices) params_str = _param_dict_to_str(params) self.params = params @@ -2076,15 +1965,15 @@ def _lazy_init( if isinstance(reference, Dataset): ref_dataset = reference.construct()._handle elif reference is not None: - raise TypeError('Reference dataset should be None or dataset instance') + raise TypeError("Reference dataset should be None or dataset instance") # start construct data if isinstance(data, (str, Path)): self._handle = ctypes.c_void_p() - _safe_call(_LIB.LGBM_DatasetCreateFromFile( - _c_str(str(data)), - _c_str(params_str), - ref_dataset, - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetCreateFromFile( + _c_str(str(data)), _c_str(params_str), ref_dataset, ctypes.byref(self._handle) + ) + ) elif isinstance(data, scipy.sparse.csr_matrix): self.__init_from_csr(data, params_str, ref_dataset) elif isinstance(data, scipy.sparse.csc_matrix): @@ -2100,7 +1989,7 @@ def _lazy_init( elif _is_list_of_sequences(data): self.__init_from_seqs(data, ref_dataset) else: - raise TypeError('Data list can only be of ndarray or Sequence') + raise TypeError("Data list can only be of ndarray or Sequence") elif isinstance(data, Sequence): self.__init_from_seqs([data], ref_dataset) elif isinstance(data, dt_DataTable): @@ -2110,7 +1999,7 @@ def _lazy_init( csr = scipy.sparse.csr_matrix(data) self.__init_from_csr(csr, params_str, ref_dataset) except BaseException as err: - raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err + raise TypeError(f"Cannot initialize Dataset from {type(data).__name__}") from err if label is not None: self.set_label(label) if self.get_label() is None: @@ -2124,15 +2013,11 @@ def _lazy_init( if isinstance(predictor, _InnerPredictor): if self._predictor is None and init_score is not None: _log_warning("The init_score will be overridden by the prediction of init_model.") - self._set_init_score_by_predictor( - predictor=predictor, - data=data, - used_indices=None - ) + self._set_init_score_by_predictor(predictor=predictor, data=data, used_indices=None) elif init_score is not None: self.set_init_score(init_score) elif predictor is not None: - raise TypeError(f'Wrong predictor type {type(predictor).__name__}') + raise TypeError(f"Wrong predictor type {type(predictor).__name__}") # set feature names return self.set_feature_name(feature_name) @@ -2149,7 +2034,7 @@ def _yield_row_from_seqlist(seqs: List[Sequence], indices: Iterable[int]): seq = seqs[seq_id] id_in_seq = row_id - offset row = seq[id_in_seq] - yield row if row.flags['OWNDATA'] else row.copy() + yield row if row.flags["OWNDATA"] else row.copy() def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarray], List[np.ndarray]]: """Sample data from seqs. @@ -2179,11 +2064,7 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr return filtered, filtered_idx - def __init_from_seqs( - self, - seqs: List[Sequence], - ref_dataset: Optional[_DatasetHandle] - ) -> "Dataset": + def __init_from_seqs(self, seqs: List[Sequence], ref_dataset: Optional[_DatasetHandle]) -> "Dataset": """ Initialize data from list of Sequence objects. @@ -2206,21 +2087,16 @@ def __init_from_seqs( for seq in seqs: nrow = len(seq) - batch_size = getattr(seq, 'batch_size', None) or Sequence.batch_size + batch_size = getattr(seq, "batch_size", None) or Sequence.batch_size for start in range(0, nrow, batch_size): end = min(start + batch_size, nrow) self._push_rows(seq[start:end]) return self - def __init_from_np2d( - self, - mat: np.ndarray, - params_str: str, - ref_dataset: Optional[_DatasetHandle] - ) -> "Dataset": + def __init_from_np2d(self, mat: np.ndarray, params_str: str, ref_dataset: Optional[_DatasetHandle]) -> "Dataset": """Initialize data from a 2-D numpy matrix.""" if len(mat.shape) != 2: - raise ValueError('Input numpy.ndarray must be 2 dimensional') + raise ValueError("Input numpy.ndarray must be 2 dimensional") self._handle = ctypes.c_void_p() if mat.dtype == np.float32 or mat.dtype == np.float64: @@ -2229,22 +2105,22 @@ def __init_from_np2d( data = np.array(mat.reshape(mat.size), dtype=np.float32) ptr_data, type_ptr_data, _ = _c_float_array(data) - _safe_call(_LIB.LGBM_DatasetCreateFromMat( - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int32(mat.shape[0]), - ctypes.c_int32(mat.shape[1]), - ctypes.c_int(_C_API_IS_ROW_MAJOR), - _c_str(params_str), - ref_dataset, - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetCreateFromMat( + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int32(mat.shape[0]), + ctypes.c_int32(mat.shape[1]), + ctypes.c_int(_C_API_IS_ROW_MAJOR), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle), + ) + ) return self def __init_from_list_np2d( - self, - mats: List[np.ndarray], - params_str: str, - ref_dataset: Optional[_DatasetHandle] + self, mats: List[np.ndarray], params_str: str, ref_dataset: Optional[_DatasetHandle] ) -> "Dataset": """Initialize data from a list of 2-D numpy matrices.""" ncol = mats[0].shape[1] @@ -2260,10 +2136,10 @@ def __init_from_list_np2d( for i, mat in enumerate(mats): if len(mat.shape) != 2: - raise ValueError('Input numpy.ndarray must be 2 dimensional') + raise ValueError("Input numpy.ndarray must be 2 dimensional") if mat.shape[1] != ncol: - raise ValueError('Input arrays must have same number of columns') + raise ValueError("Input arrays must have same number of columns") nrow[i] = mat.shape[0] @@ -2274,33 +2150,33 @@ def __init_from_list_np2d( chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i]) if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data: - raise ValueError('Input chunks must have same type') + raise ValueError("Input chunks must have same type") ptr_data[i] = chunk_ptr_data type_ptr_data = chunk_type_ptr_data holders.append(holder) self._handle = ctypes.c_void_p() - _safe_call(_LIB.LGBM_DatasetCreateFromMats( - ctypes.c_int32(len(mats)), - ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), - ctypes.c_int(type_ptr_data), - nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ctypes.c_int32(ncol), - ctypes.c_int(_C_API_IS_ROW_MAJOR), - _c_str(params_str), - ref_dataset, - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetCreateFromMats( + ctypes.c_int32(len(mats)), + ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), + ctypes.c_int(type_ptr_data), + nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ctypes.c_int32(ncol), + ctypes.c_int(_C_API_IS_ROW_MAJOR), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle), + ) + ) return self def __init_from_csr( - self, - csr: scipy.sparse.csr_matrix, - params_str: str, - ref_dataset: Optional[_DatasetHandle] + self, csr: scipy.sparse.csr_matrix, params_str: str, ref_dataset: Optional[_DatasetHandle] ) -> "Dataset": """Initialize data from a CSR matrix.""" if len(csr.indices) != len(csr.data): - raise ValueError(f'Length mismatch: {len(csr.indices)} vs {len(csr.data)}') + raise ValueError(f"Length mismatch: {len(csr.indices)} vs {len(csr.data)}") self._handle = ctypes.c_void_p() ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) @@ -2309,29 +2185,29 @@ def __init_from_csr( assert csr.shape[1] <= _MAX_INT32 csr_indices = csr.indices.astype(np.int32, copy=False) - _safe_call(_LIB.LGBM_DatasetCreateFromCSR( - ptr_indptr, - ctypes.c_int(type_ptr_indptr), - csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int64(len(csr.indptr)), - ctypes.c_int64(len(csr.data)), - ctypes.c_int64(csr.shape[1]), - _c_str(params_str), - ref_dataset, - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetCreateFromCSR( + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csr.indptr)), + ctypes.c_int64(len(csr.data)), + ctypes.c_int64(csr.shape[1]), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle), + ) + ) return self def __init_from_csc( - self, - csc: scipy.sparse.csc_matrix, - params_str: str, - ref_dataset: Optional[_DatasetHandle] + self, csc: scipy.sparse.csc_matrix, params_str: str, ref_dataset: Optional[_DatasetHandle] ) -> "Dataset": """Initialize data from a CSC matrix.""" if len(csc.indices) != len(csc.data): - raise ValueError(f'Length mismatch: {len(csc.indices)} vs {len(csc.data)}') + raise ValueError(f"Length mismatch: {len(csc.indices)} vs {len(csc.data)}") self._handle = ctypes.c_void_p() ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) @@ -2340,25 +2216,25 @@ def __init_from_csc( assert csc.shape[0] <= _MAX_INT32 csc_indices = csc.indices.astype(np.int32, copy=False) - _safe_call(_LIB.LGBM_DatasetCreateFromCSC( - ptr_indptr, - ctypes.c_int(type_ptr_indptr), - csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ptr_data, - ctypes.c_int(type_ptr_data), - ctypes.c_int64(len(csc.indptr)), - ctypes.c_int64(len(csc.data)), - ctypes.c_int64(csc.shape[0]), - _c_str(params_str), - ref_dataset, - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetCreateFromCSC( + ptr_indptr, + ctypes.c_int(type_ptr_indptr), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_data, + ctypes.c_int(type_ptr_data), + ctypes.c_int64(len(csc.indptr)), + ctypes.c_int64(len(csc.data)), + ctypes.c_int64(csc.shape[0]), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle), + ) + ) return self def __init_from_pyarrow_table( - self, - table: pa_Table, - params_str: str, - ref_dataset: Optional[_DatasetHandle] + self, table: pa_Table, params_str: str, ref_dataset: Optional[_DatasetHandle] ) -> "Dataset": """Initialize data from a PyArrow table.""" if not PYARROW_INSTALLED: @@ -2371,20 +2247,21 @@ def __init_from_pyarrow_table( # Export Arrow table to C c_array = _export_arrow_to_c(table) self._handle = ctypes.c_void_p() - _safe_call(_LIB.LGBM_DatasetCreateFromArrow( - ctypes.c_int64(c_array.n_chunks), - ctypes.c_void_p(c_array.chunks_ptr), - ctypes.c_void_p(c_array.schema_ptr), - _c_str(params_str), - ref_dataset, - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetCreateFromArrow( + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle), + ) + ) return self @staticmethod def _compare_params_for_warning( - params: Dict[str, Any], - other_params: Dict[str, Any], - ignore_keys: Set[str] + params: Dict[str, Any], other_params: Dict[str, Any], ignore_keys: Set[str] ) -> bool: """Compare two dictionaries with params ignoring some keys. @@ -2430,32 +2307,45 @@ def construct(self) -> "Dataset": if not self._compare_params_for_warning( params=params, other_params=reference_params, - ignore_keys=_ConfigAliases.get("categorical_feature") + ignore_keys=_ConfigAliases.get("categorical_feature"), ): - _log_warning('Overriding the parameters from Reference Dataset.') + _log_warning("Overriding the parameters from Reference Dataset.") self._update_params(reference_params) if self.used_indices is None: # create valid - self._lazy_init(data=self.data, label=self.label, reference=self.reference, - weight=self.weight, group=self.group, position=self.position, - init_score=self.init_score, predictor=self._predictor, - feature_name=self.feature_name, categorical_feature='auto', params=self.params) + self._lazy_init( + data=self.data, + label=self.label, + reference=self.reference, + weight=self.weight, + group=self.group, + position=self.position, + init_score=self.init_score, + predictor=self._predictor, + feature_name=self.feature_name, + categorical_feature="auto", + params=self.params, + ) else: # construct subset - used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name='used_indices') + used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name="used_indices") assert used_indices.flags.c_contiguous if self.reference.group is not None: group_info = np.array(self.reference.group).astype(np.int32, copy=False) - _, self.group = np.unique(np.repeat(range(len(group_info)), repeats=group_info)[self.used_indices], - return_counts=True) + _, self.group = np.unique( + np.repeat(range(len(group_info)), repeats=group_info)[self.used_indices], return_counts=True + ) self._handle = ctypes.c_void_p() params_str = _param_dict_to_str(self.params) - _safe_call(_LIB.LGBM_DatasetGetSubset( - self.reference.construct()._handle, - used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ctypes.c_int32(used_indices.shape[0]), - _c_str(params_str), - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_DatasetGetSubset( + self.reference.construct()._handle, + used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ctypes.c_int32(used_indices.shape[0]), + _c_str(params_str), + ctypes.byref(self._handle), + ) + ) if not self.free_raw_data: self.get_data() if self.group is not None: @@ -2464,20 +2354,29 @@ def construct(self) -> "Dataset": self.set_position(self.position) if self.get_label() is None: raise ValueError("Label should not be None.") - if isinstance(self._predictor, _InnerPredictor) and self._predictor is not self.reference._predictor: + if ( + isinstance(self._predictor, _InnerPredictor) + and self._predictor is not self.reference._predictor + ): self.get_data() self._set_init_score_by_predictor( - predictor=self._predictor, - data=self.data, - used_indices=used_indices + predictor=self._predictor, data=self.data, used_indices=used_indices ) else: # create train - self._lazy_init(data=self.data, label=self.label, reference=None, - weight=self.weight, group=self.group, - init_score=self.init_score, predictor=self._predictor, - feature_name=self.feature_name, categorical_feature=self.categorical_feature, - params=self.params, position=self.position) + self._lazy_init( + data=self.data, + label=self.label, + reference=None, + weight=self.weight, + group=self.group, + init_score=self.init_score, + predictor=self._predictor, + feature_name=self.feature_name, + categorical_feature=self.categorical_feature, + params=self.params, + position=self.position, + ) if self.free_raw_data: self.data = None self.feature_name = self.get_feature_name() @@ -2491,7 +2390,7 @@ def create_valid( group: Optional[_LGBM_GroupType] = None, init_score: Optional[_LGBM_InitScoreType] = None, params: Optional[Dict[str, Any]] = None, - position: Optional[_LGBM_PositionType] = None + position: Optional[_LGBM_PositionType] = None, ) -> "Dataset": """Create validation data align with current Dataset. @@ -2522,18 +2421,22 @@ def create_valid( valid : Dataset Validation Dataset with reference to self. """ - ret = Dataset(data, label=label, reference=self, - weight=weight, group=group, position=position, init_score=init_score, - params=params, free_raw_data=self.free_raw_data) + ret = Dataset( + data, + label=label, + reference=self, + weight=weight, + group=group, + position=position, + init_score=init_score, + params=params, + free_raw_data=self.free_raw_data, + ) ret._predictor = self._predictor ret.pandas_categorical = self.pandas_categorical return ret - def subset( - self, - used_indices: List[int], - params: Optional[Dict[str, Any]] = None - ) -> "Dataset": + def subset(self, used_indices: List[int], params: Optional[Dict[str, Any]] = None) -> "Dataset": """Get subset of current Dataset. Parameters @@ -2550,9 +2453,14 @@ def subset( """ if params is None: params = self.params - ret = Dataset(None, reference=self, feature_name=self.feature_name, - categorical_feature=self.categorical_feature, params=params, - free_raw_data=self.free_raw_data) + ret = Dataset( + None, + reference=self, + feature_name=self.feature_name, + categorical_feature=self.categorical_feature, + params=params, + free_raw_data=self.free_raw_data, + ) ret._predictor = self._predictor ret.pandas_categorical = self.pandas_categorical ret.used_indices = sorted(used_indices) @@ -2576,9 +2484,7 @@ def save_binary(self, filename: Union[str, Path]) -> "Dataset": self : Dataset Returns self. """ - _safe_call(_LIB.LGBM_DatasetSaveBinary( - self.construct()._handle, - _c_str(str(filename)))) + _safe_call(_LIB.LGBM_DatasetSaveBinary(self.construct()._handle, _c_str(str(filename)))) return self def _update_params(self, params: Optional[Dict[str, Any]]) -> "Dataset": @@ -2597,15 +2503,15 @@ def update(): update() elif params is not None: ret = _LIB.LGBM_DatasetUpdateParamChecking( - _c_str(_param_dict_to_str(self.params)), - _c_str(_param_dict_to_str(params))) + _c_str(_param_dict_to_str(self.params)), _c_str(_param_dict_to_str(params)) + ) if ret != 0: # could be updated if data is not freed if self.data is not None: update() self._free_handle() else: - raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8')) + raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8")) return self def _reverse_update_params(self) -> "Dataset": @@ -2617,7 +2523,20 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] + data: Optional[ + Union[ + List[List[float]], + List[List[int]], + List[float], + List[int], + np.ndarray, + pd_Series, + pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, + ] + ], ) -> "Dataset": """Set property into the Dataset. @@ -2637,12 +2556,15 @@ def set_field( raise Exception(f"Cannot set {field_name} before construct dataset") if data is None: # set to None - _safe_call(_LIB.LGBM_DatasetSetField( - self._handle, - _c_str(field_name), - None, - ctypes.c_int(0), - ctypes.c_int(_FIELD_TYPE_MAPPER[field_name]))) + _safe_call( + _LIB.LGBM_DatasetSetField( + self._handle, + _c_str(field_name), + None, + ctypes.c_int(0), + ctypes.c_int(_FIELD_TYPE_MAPPER[field_name]), + ) + ) return self # If the data is a arrow data, we can just pass it to C @@ -2652,36 +2574,42 @@ def set_field( if _is_pyarrow_table(data): if field_name != "init_score": raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") - data = pa_chunked_array([ - chunk for array in data.columns for chunk in array.chunks # type: ignore - ]) + data = pa_chunked_array( + [ + chunk + for array in data.columns + for chunk in array.chunks # type: ignore + ] + ) c_array = _export_arrow_to_c(data) - _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( - self._handle, - _c_str(field_name), - ctypes.c_int64(c_array.n_chunks), - ctypes.c_void_p(c_array.chunks_ptr), - ctypes.c_void_p(c_array.schema_ptr), - )) + _safe_call( + _LIB.LGBM_DatasetSetFieldFromArrow( + self._handle, + _c_str(field_name), + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ) + ) self.version += 1 return self dtype: "np.typing.DTypeLike" - if field_name == 'init_score': + if field_name == "init_score": dtype = np.float64 if _is_1d_collection(data): data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) elif _is_2d_collection(data): data = _data_to_2d_numpy(data, dtype=dtype, name=field_name) - data = data.ravel(order='F') + data = data.ravel(order="F") else: raise TypeError( - 'init_score must be list, numpy 1-D array or pandas Series.\n' - 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' + "init_score must be list, numpy 1-D array or pandas Series.\n" + "In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame." ) else: - dtype = np.int32 if (field_name == 'group' or field_name == 'position') else np.float32 + dtype = np.int32 if (field_name == "group" or field_name == "position") else np.float32 data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr] @@ -2693,12 +2621,11 @@ def set_field( raise TypeError(f"Expected np.float32/64 or np.int32, met type({data.dtype})") if type_data != _FIELD_TYPE_MAPPER[field_name]: raise TypeError("Input type error for set_field") - _safe_call(_LIB.LGBM_DatasetSetField( - self._handle, - _c_str(field_name), - ptr_data, - ctypes.c_int(len(data)), - ctypes.c_int(type_data))) + _safe_call( + _LIB.LGBM_DatasetSetField( + self._handle, _c_str(field_name), ptr_data, ctypes.c_int(len(data)), ctypes.c_int(type_data) + ) + ) self.version += 1 return self @@ -2726,44 +2653,37 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: tmp_out_len = ctypes.c_int(0) out_type = ctypes.c_int(0) ret = ctypes.POINTER(ctypes.c_void_p)() - _safe_call(_LIB.LGBM_DatasetGetField( - self._handle, - _c_str(field_name), - ctypes.byref(tmp_out_len), - ctypes.byref(ret), - ctypes.byref(out_type))) + _safe_call( + _LIB.LGBM_DatasetGetField( + self._handle, _c_str(field_name), ctypes.byref(tmp_out_len), ctypes.byref(ret), ctypes.byref(out_type) + ) + ) if out_type.value != _FIELD_TYPE_MAPPER[field_name]: raise TypeError("Return type error for get_field") if tmp_out_len.value == 0: return None if out_type.value == _C_API_DTYPE_INT32: arr = _cint32_array_to_numpy( - cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), - length=tmp_out_len.value + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), length=tmp_out_len.value ) elif out_type.value == _C_API_DTYPE_FLOAT32: arr = _cfloat32_array_to_numpy( - cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), - length=tmp_out_len.value + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), length=tmp_out_len.value ) elif out_type.value == _C_API_DTYPE_FLOAT64: arr = _cfloat64_array_to_numpy( - cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), - length=tmp_out_len.value + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), length=tmp_out_len.value ) else: raise TypeError("Unknown type") - if field_name == 'init_score': + if field_name == "init_score": num_data = self.num_data() num_classes = arr.size // num_data if num_classes > 1: - arr = arr.reshape((num_data, num_classes), order='F') + arr = arr.reshape((num_data, num_classes), order="F") return arr - def set_categorical_feature( - self, - categorical_feature: _LGBM_CategoricalFeatureConfiguration - ) -> "Dataset": + def set_categorical_feature(self, categorical_feature: _LGBM_CategoricalFeatureConfiguration) -> "Dataset": """Set categorical features. Parameters @@ -2782,22 +2702,23 @@ def set_categorical_feature( if self.categorical_feature is None: self.categorical_feature = categorical_feature return self._free_handle() - elif categorical_feature == 'auto': + elif categorical_feature == "auto": return self else: - if self.categorical_feature != 'auto': - _log_warning('categorical_feature in Dataset is overridden.\n' - f'New categorical_feature is {list(categorical_feature)}') + if self.categorical_feature != "auto": + _log_warning( + "categorical_feature in Dataset is overridden.\n" + f"New categorical_feature is {list(categorical_feature)}" + ) self.categorical_feature = categorical_feature return self._free_handle() else: - raise LightGBMError("Cannot set categorical feature after freed raw data, " - "set free_raw_data=False when construct Dataset to avoid this.") + raise LightGBMError( + "Cannot set categorical feature after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this." + ) - def _set_predictor( - self, - predictor: Optional[_InnerPredictor] - ) -> "Dataset": + def _set_predictor(self, predictor: Optional[_InnerPredictor]) -> "Dataset": """Set predictor for continued training. It is not recommended for user to call this function. @@ -2806,27 +2727,25 @@ def _set_predictor( if predictor is None and self._predictor is None: return self elif isinstance(predictor, _InnerPredictor) and isinstance(self._predictor, _InnerPredictor): - if (predictor == self._predictor) and (predictor.current_iteration() == self._predictor.current_iteration()): + if (predictor == self._predictor) and ( + predictor.current_iteration() == self._predictor.current_iteration() + ): return self if self._handle is None: self._predictor = predictor elif self.data is not None: self._predictor = predictor - self._set_init_score_by_predictor( - predictor=self._predictor, - data=self.data, - used_indices=None - ) + self._set_init_score_by_predictor(predictor=self._predictor, data=self.data, used_indices=None) elif self.used_indices is not None and self.reference is not None and self.reference.data is not None: self._predictor = predictor self._set_init_score_by_predictor( - predictor=self._predictor, - data=self.reference.data, - used_indices=self.used_indices + predictor=self._predictor, data=self.reference.data, used_indices=self.used_indices ) else: - raise LightGBMError("Cannot set predictor after freed raw data, " - "set free_raw_data=False when construct Dataset to avoid this.") + raise LightGBMError( + "Cannot set predictor after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this." + ) return self def set_reference(self, reference: "Dataset") -> "Dataset": @@ -2842,9 +2761,9 @@ def set_reference(self, reference: "Dataset") -> "Dataset": self : Dataset Dataset with set reference. """ - self.set_categorical_feature(reference.categorical_feature) \ - .set_feature_name(reference.feature_name) \ - ._set_predictor(reference._predictor) + self.set_categorical_feature(reference.categorical_feature).set_feature_name( + reference.feature_name + )._set_predictor(reference._predictor) # we're done if self and reference share a common upstream reference if self.get_ref_chain().intersection(reference.get_ref_chain()): return self @@ -2852,8 +2771,10 @@ def set_reference(self, reference: "Dataset") -> "Dataset": self.reference = reference return self._free_handle() else: - raise LightGBMError("Cannot set reference after freed raw data, " - "set free_raw_data=False when construct Dataset to avoid this.") + raise LightGBMError( + "Cannot set reference after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this." + ) def set_feature_name(self, feature_name: _LGBM_FeatureNameConfiguration) -> "Dataset": """Set feature name. @@ -2868,16 +2789,19 @@ def set_feature_name(self, feature_name: _LGBM_FeatureNameConfiguration) -> "Dat self : Dataset Dataset with set feature name. """ - if feature_name != 'auto': + if feature_name != "auto": self.feature_name = feature_name - if self._handle is not None and feature_name is not None and feature_name != 'auto': + if self._handle is not None and feature_name is not None and feature_name != "auto": if len(feature_name) != self.num_feature(): - raise ValueError(f"Length of feature_name({len(feature_name)}) and num_feature({self.num_feature()}) don't match") + raise ValueError( + f"Length of feature_name({len(feature_name)}) and num_feature({self.num_feature()}) don't match" + ) c_feature_name = [_c_str(name) for name in feature_name] - _safe_call(_LIB.LGBM_DatasetSetFeatureNames( - self._handle, - _c_array(ctypes.c_char_p, c_feature_name), - ctypes.c_int(len(feature_name)))) + _safe_call( + _LIB.LGBM_DatasetSetFeatureNames( + self._handle, _c_array(ctypes.c_char_p, c_feature_name), ctypes.c_int(len(feature_name)) + ) + ) return self def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": @@ -2897,20 +2821,17 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": if self._handle is not None: if isinstance(label, pd_DataFrame): if len(label.columns) > 1: - raise ValueError('DataFrame for label cannot have multiple columns') + raise ValueError("DataFrame for label cannot have multiple columns") label_array = np.ravel(_pandas_to_numpy(label, target_dtype=np.float32)) elif _is_pyarrow_array(label): label_array = label else: - label_array = _list_to_1d_numpy(label, dtype=np.float32, name='label') - self.set_field('label', label_array) - self.label = self.get_field('label') # original values can be modified at cpp side + label_array = _list_to_1d_numpy(label, dtype=np.float32, name="label") + self.set_field("label", label_array) + self.label = self.get_field("label") # original values can be modified at cpp side return self - def set_weight( - self, - weight: Optional[_LGBM_WeightType] - ) -> "Dataset": + def set_weight(self, weight: Optional[_LGBM_WeightType]) -> "Dataset": """Set weight of each instance. Parameters @@ -2935,15 +2856,12 @@ def set_weight( # Set field if self._handle is not None and weight is not None: if not _is_pyarrow_array(weight): - weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') - self.set_field('weight', weight) - self.weight = self.get_field('weight') # original values can be modified at cpp side + weight = _list_to_1d_numpy(weight, dtype=np.float32, name="weight") + self.set_field("weight", weight) + self.weight = self.get_field("weight") # original values can be modified at cpp side return self - def set_init_score( - self, - init_score: Optional[_LGBM_InitScoreType] - ) -> "Dataset": + def set_init_score(self, init_score: Optional[_LGBM_InitScoreType]) -> "Dataset": """Set init score of Booster to start from. Parameters @@ -2958,14 +2876,11 @@ def set_init_score( """ self.init_score = init_score if self._handle is not None and init_score is not None: - self.set_field('init_score', init_score) - self.init_score = self.get_field('init_score') # original values can be modified at cpp side + self.set_field("init_score", init_score) + self.init_score = self.get_field("init_score") # original values can be modified at cpp side return self - def set_group( - self, - group: Optional[_LGBM_GroupType] - ) -> "Dataset": + def set_group(self, group: Optional[_LGBM_GroupType]) -> "Dataset": """Set group size of Dataset (used for ranking). Parameters @@ -2985,18 +2900,15 @@ def set_group( self.group = group if self._handle is not None and group is not None: if not _is_pyarrow_array(group): - group = _list_to_1d_numpy(group, dtype=np.int32, name='group') - self.set_field('group', group) + group = _list_to_1d_numpy(group, dtype=np.int32, name="group") + self.set_field("group", group) # original values can be modified at cpp side - constructed_group = self.get_field('group') + constructed_group = self.get_field("group") if constructed_group is not None: self.group = np.diff(constructed_group) return self - def set_position( - self, - position: Optional[_LGBM_PositionType] - ) -> "Dataset": + def set_position(self, position: Optional[_LGBM_PositionType]) -> "Dataset": """Set position of Dataset (used for ranking). Parameters @@ -3011,8 +2923,8 @@ def set_position( """ self.position = position if self._handle is not None and position is not None: - position = _list_to_1d_numpy(position, dtype=np.int32, name='position') - self.set_field('position', position) + position = _list_to_1d_numpy(position, dtype=np.int32, name="position") + self.set_field("position", position) return self def get_feature_name(self) -> List[str]: @@ -3031,13 +2943,16 @@ def get_feature_name(self) -> List[str]: required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] - _safe_call(_LIB.LGBM_DatasetGetFeatureNames( - self._handle, - ctypes.c_int(num_feature), - ctypes.byref(tmp_out_len), - ctypes.c_size_t(reserved_string_buffer_size), - ctypes.byref(required_string_buffer_size), - ptr_string_buffers)) + _safe_call( + _LIB.LGBM_DatasetGetFeatureNames( + self._handle, + ctypes.c_int(num_feature), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(reserved_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers, + ) + ) if num_feature != tmp_out_len.value: raise ValueError("Length of feature names doesn't equal with num_feature") actual_string_buffer_size = required_string_buffer_size.value @@ -3045,14 +2960,17 @@ def get_feature_name(self) -> List[str]: if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] - _safe_call(_LIB.LGBM_DatasetGetFeatureNames( - self._handle, - ctypes.c_int(num_feature), - ctypes.byref(tmp_out_len), - ctypes.c_size_t(actual_string_buffer_size), - ctypes.byref(required_string_buffer_size), - ptr_string_buffers)) - return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)] + _safe_call( + _LIB.LGBM_DatasetGetFeatureNames( + self._handle, + ctypes.c_int(num_feature), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(actual_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers, + ) + ) + return [string_buffers[i].value.decode("utf-8") for i in range(num_feature)] def get_label(self) -> Optional[_LGBM_LabelType]: """Get the label of the Dataset. @@ -3064,7 +2982,7 @@ def get_label(self) -> Optional[_LGBM_LabelType]: For a constructed ``Dataset``, this will only return a numpy array. """ if self.label is None: - self.label = self.get_field('label') + self.label = self.get_field("label") return self.label def get_weight(self) -> Optional[_LGBM_WeightType]: @@ -3077,7 +2995,7 @@ def get_weight(self) -> Optional[_LGBM_WeightType]: For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.weight is None: - self.weight = self.get_field('weight') + self.weight = self.get_field("weight") return self.weight def get_init_score(self) -> Optional[_LGBM_InitScoreType]: @@ -3090,7 +3008,7 @@ def get_init_score(self) -> Optional[_LGBM_InitScoreType]: For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.init_score is None: - self.init_score = self.get_field('init_score') + self.init_score = self.get_field("init_score") return self.init_score def get_data(self) -> Optional[_LGBM_TrainDataType]: @@ -3117,12 +3035,15 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]: elif _is_list_of_sequences(self.data) and len(self.data) > 0: self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices))) else: - _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" - "Returning original raw data") + _log_warning( + f"Cannot subset {type(self.data).__name__} type of raw data.\n" "Returning original raw data" + ) self._need_slice = False if self.data is None: - raise LightGBMError("Cannot call `get_data` after freed raw data, " - "set free_raw_data=False when construct Dataset to avoid this.") + raise LightGBMError( + "Cannot call `get_data` after freed raw data, " + "set free_raw_data=False when construct Dataset to avoid this." + ) return self.data def get_group(self) -> Optional[_LGBM_GroupType]: @@ -3139,7 +3060,7 @@ def get_group(self) -> Optional[_LGBM_GroupType]: For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.group is None: - self.group = self.get_field('group') + self.group = self.get_field("group") if self.group is not None: # group data from LightGBM is boundaries data, need to convert to group size self.group = np.diff(self.group) @@ -3155,7 +3076,7 @@ def get_position(self) -> Optional[_LGBM_PositionType]: For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.position is None: - self.position = self.get_field('position') + self.position = self.get_field("position") return self.position def num_data(self) -> int: @@ -3168,8 +3089,7 @@ def num_data(self) -> int: """ if self._handle is not None: ret = ctypes.c_int(0) - _safe_call(_LIB.LGBM_DatasetGetNumData(self._handle, - ctypes.byref(ret))) + _safe_call(_LIB.LGBM_DatasetGetNumData(self._handle, ctypes.byref(ret))) return ret.value else: raise LightGBMError("Cannot get num_data before construct dataset") @@ -3184,8 +3104,7 @@ def num_feature(self) -> int: """ if self._handle is not None: ret = ctypes.c_int(0) - _safe_call(_LIB.LGBM_DatasetGetNumFeature(self._handle, - ctypes.byref(ret))) + _safe_call(_LIB.LGBM_DatasetGetNumFeature(self._handle, ctypes.byref(ret))) return ret.value else: raise LightGBMError("Cannot get num_feature before construct dataset") @@ -3211,9 +3130,7 @@ def feature_num_bin(self, feature: Union[int, str]) -> int: else: feature_index = feature ret = ctypes.c_int(0) - _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self._handle, - ctypes.c_int(feature_index), - ctypes.byref(ret))) + _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self._handle, ctypes.c_int(feature_index), ctypes.byref(ret))) return ret.value else: raise LightGBMError("Cannot get feature_num_bin before construct dataset") @@ -3264,7 +3181,7 @@ def add_features_from(self, other: "Dataset") -> "Dataset": Dataset with the new features added. """ if self._handle is None or other._handle is None: - raise ValueError('Both source and target Datasets must be constructed before adding features') + raise ValueError("Both source and target Datasets must be constructed before adding features") _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self._handle, other._handle)) was_none = self.data is None old_self_data_type = type(self.data).__name__ @@ -3294,21 +3211,19 @@ def add_features_from(self, other: "Dataset") -> "Dataset": self.data = None elif isinstance(self.data, pd_DataFrame): if not PANDAS_INSTALLED: - raise LightGBMError("Cannot add features to DataFrame type of raw data " - "without pandas installed. " - "Install pandas and restart your session.") + raise LightGBMError( + "Cannot add features to DataFrame type of raw data " + "without pandas installed. " + "Install pandas and restart your session." + ) if isinstance(other.data, np.ndarray): - self.data = concat((self.data, pd_DataFrame(other.data)), - axis=1, ignore_index=True) + self.data = concat((self.data, pd_DataFrame(other.data)), axis=1, ignore_index=True) elif isinstance(other.data, scipy.sparse.spmatrix): - self.data = concat((self.data, pd_DataFrame(other.data.toarray())), - axis=1, ignore_index=True) + self.data = concat((self.data, pd_DataFrame(other.data.toarray())), axis=1, ignore_index=True) elif isinstance(other.data, pd_DataFrame): - self.data = concat((self.data, other.data), - axis=1, ignore_index=True) + self.data = concat((self.data, other.data), axis=1, ignore_index=True) elif isinstance(other.data, dt_DataTable): - self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), - axis=1, ignore_index=True) + self.data = concat((self.data, pd_DataFrame(other.data.to_numpy())), axis=1, ignore_index=True) else: self.data = None elif isinstance(self.data, dt_DataTable): @@ -3325,14 +3240,19 @@ def add_features_from(self, other: "Dataset") -> "Dataset": else: self.data = None if self.data is None: - err_msg = (f"Cannot add features from {type(other.data).__name__} type of raw data to " - f"{old_self_data_type} type of raw data.\n") - err_msg += ("Set free_raw_data=False when construct Dataset to avoid this" - if was_none else "Freeing raw data") + err_msg = ( + f"Cannot add features from {type(other.data).__name__} type of raw data to " + f"{old_self_data_type} type of raw data.\n" + ) + err_msg += ( + "Set free_raw_data=False when construct Dataset to avoid this" if was_none else "Freeing raw data" + ) _log_warning(err_msg) self.feature_name = self.get_feature_name() - _log_warning("Reseting categorical features.\n" - "You can set new categorical features via ``set_categorical_feature`` method") + _log_warning( + "Reseting categorical features.\n" + "You can set new categorical features via ``set_categorical_feature`` method" + ) self.categorical_feature = "auto" self.pandas_categorical = None return self @@ -3352,25 +3272,14 @@ def _dump_text(self, filename: Union[str, Path]) -> "Dataset": self : Dataset Returns self. """ - _safe_call(_LIB.LGBM_DatasetDumpText( - self.construct()._handle, - _c_str(str(filename)))) + _safe_call(_LIB.LGBM_DatasetDumpText(self.construct()._handle, _c_str(str(filename)))) return self -_LGBM_CustomObjectiveFunction = Callable[ - [np.ndarray, Dataset], - Tuple[np.ndarray, np.ndarray] -] +_LGBM_CustomObjectiveFunction = Callable[[np.ndarray, Dataset], Tuple[np.ndarray, np.ndarray]] _LGBM_CustomEvalFunction = Union[ - Callable[ - [np.ndarray, Dataset], - _LGBM_EvalFunctionResultType - ], - Callable[ - [np.ndarray, Dataset], - List[_LGBM_EvalFunctionResultType] - ] + Callable[[np.ndarray, Dataset], _LGBM_EvalFunctionResultType], + Callable[[np.ndarray, Dataset], List[_LGBM_EvalFunctionResultType]], ] @@ -3382,7 +3291,7 @@ def __init__( params: Optional[Dict[str, Any]] = None, train_set: Optional[Dataset] = None, model_file: Optional[Union[str, Path]] = None, - model_str: Optional[str] = None + model_str: Optional[str] = None, ): """Initialize the Booster. @@ -3408,50 +3317,37 @@ def __init__( if train_set is not None: # Training task if not isinstance(train_set, Dataset): - raise TypeError(f'Training data should be Dataset instance, met {type(train_set).__name__}') - params = _choose_param_value( - main_param_name="machines", - params=params, - default_value=None - ) + raise TypeError(f"Training data should be Dataset instance, met {type(train_set).__name__}") + params = _choose_param_value(main_param_name="machines", params=params, default_value=None) # if "machines" is given, assume user wants to do distributed learning, and set up network if params["machines"] is None: params.pop("machines", None) else: machines = params["machines"] if isinstance(machines, str): - num_machines_from_machine_list = len(machines.split(',')) + num_machines_from_machine_list = len(machines.split(",")) elif isinstance(machines, (list, set)): num_machines_from_machine_list = len(machines) - machines = ','.join(machines) + machines = ",".join(machines) else: raise ValueError("Invalid machines in params.") params = _choose_param_value( - main_param_name="num_machines", - params=params, - default_value=num_machines_from_machine_list - ) - params = _choose_param_value( - main_param_name="local_listen_port", - params=params, - default_value=12400 + main_param_name="num_machines", params=params, default_value=num_machines_from_machine_list ) + params = _choose_param_value(main_param_name="local_listen_port", params=params, default_value=12400) self.set_network( machines=machines, local_listen_port=params["local_listen_port"], listen_time_out=params.get("time_out", 120), - num_machines=params["num_machines"] + num_machines=params["num_machines"], ) # construct booster object train_set.construct() # copy the parameters from train_set params.update(train_set.get_params()) params_str = _param_dict_to_str(params) - _safe_call(_LIB.LGBM_BoosterCreate( - train_set._handle, - _c_str(params_str), - ctypes.byref(self._handle))) + _safe_call(_LIB.LGBM_BoosterCreate(train_set._handle, _c_str(params_str), ctypes.byref(self._handle))) # save reference to data self.train_set = train_set self.valid_sets: List[Dataset] = [] @@ -3459,13 +3355,9 @@ def __init__( self.__num_dataset = 1 self.__init_predictor = train_set._predictor if self.__init_predictor is not None: - _safe_call(_LIB.LGBM_BoosterMerge( - self._handle, - self.__init_predictor._handle)) + _safe_call(_LIB.LGBM_BoosterMerge(self._handle, self.__init_predictor._handle)) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses( - self._handle, - ctypes.byref(out_num_class))) + _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) self.__num_class = out_num_class.value # buffer for inner predict self.__inner_predict_buffer: List[Optional[np.ndarray]] = [None] @@ -3476,24 +3368,24 @@ def __init__( elif model_file is not None: # Prediction task out_num_iterations = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterCreateFromModelfile( - _c_str(str(model_file)), - ctypes.byref(out_num_iterations), - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_BoosterCreateFromModelfile( + _c_str(str(model_file)), ctypes.byref(out_num_iterations), ctypes.byref(self._handle) + ) + ) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses( - self._handle, - ctypes.byref(out_num_class))) + _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(file_name=model_file) if params: - _log_warning('Ignoring params argument, using parameters from model file.') + _log_warning("Ignoring params argument, using parameters from model file.") params = self._get_loaded_param() elif model_str is not None: self.model_from_string(model_str) else: - raise TypeError('Need at least one training dataset or model file or model string ' - 'to create Booster instance') + raise TypeError( + "Need at least one training dataset or model file or model string " "to create Booster instance" + ) self.params = params def __del__(self) -> None: @@ -3517,23 +3409,24 @@ def __deepcopy__(self, _) -> "Booster": def __getstate__(self) -> Dict[str, Any]: this = self.__dict__.copy() - handle = this['_handle'] - this.pop('train_set', None) - this.pop('valid_sets', None) + handle = this["_handle"] + this.pop("train_set", None) + this.pop("valid_sets", None) if handle is not None: this["_handle"] = self.model_to_string(num_iteration=-1) return this def __setstate__(self, state: Dict[str, Any]) -> None: - model_str = state.get('_handle', state.get('handle', None)) + model_str = state.get("_handle", state.get("handle", None)) if model_str is not None: handle = ctypes.c_void_p() out_num_iterations = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterLoadModelFromString( - _c_str(model_str), - ctypes.byref(out_num_iterations), - ctypes.byref(handle))) - state['_handle'] = handle + _safe_call( + _LIB.LGBM_BoosterLoadModelFromString( + _c_str(model_str), ctypes.byref(out_num_iterations), ctypes.byref(handle) + ) + ) + state["_handle"] = handle self.__dict__.update(state) def _get_loaded_param(self) -> Dict[str, Any]: @@ -3541,22 +3434,22 @@ def _get_loaded_param(self) -> Dict[str, Any]: tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_BoosterGetLoadedParam( - self._handle, - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) + _safe_call( + _LIB.LGBM_BoosterGetLoadedParam( + self._handle, ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), ptr_string_buffer + ) + ) actual_len = tmp_out_len.value # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_BoosterGetLoadedParam( - self._handle, - ctypes.c_int64(actual_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - return json.loads(string_buffer.value.decode('utf-8')) + _safe_call( + _LIB.LGBM_BoosterGetLoadedParam( + self._handle, ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer + ) + ) + return json.loads(string_buffer.value.decode("utf-8")) def free_dataset(self) -> "Booster": """Free Booster's Datasets. @@ -3566,8 +3459,8 @@ def free_dataset(self) -> "Booster": self : Booster Booster without Datasets. """ - self.__dict__.pop('train_set', None) - self.__dict__.pop('valid_sets', None) + self.__dict__.pop("train_set", None) + self.__dict__.pop("valid_sets", None) self.__num_dataset = 0 return self @@ -3581,7 +3474,7 @@ def set_network( machines: Union[List[str], Set[str], str], local_listen_port: int = 12400, listen_time_out: int = 120, - num_machines: int = 1 + num_machines: int = 1, ) -> "Booster": """Set the network configuration. @@ -3602,11 +3495,15 @@ def set_network( Booster with set network. """ if isinstance(machines, (list, set)): - machines = ','.join(machines) - _safe_call(_LIB.LGBM_NetworkInit(_c_str(machines), - ctypes.c_int(local_listen_port), - ctypes.c_int(listen_time_out), - ctypes.c_int(num_machines))) + machines = ",".join(machines) + _safe_call( + _LIB.LGBM_NetworkInit( + _c_str(machines), + ctypes.c_int(local_listen_port), + ctypes.c_int(listen_time_out), + ctypes.c_int(num_machines), + ) + ) self._network = True return self @@ -3651,85 +3548,80 @@ def trees_to_dataframe(self) -> pd_DataFrame: Returns a pandas DataFrame of the parsed model. """ if not PANDAS_INSTALLED: - raise LightGBMError('This method cannot be run without pandas installed. ' - 'You must install pandas and restart your session to use this method.') + raise LightGBMError( + "This method cannot be run without pandas installed. " + "You must install pandas and restart your session to use this method." + ) if self.num_trees() == 0: - raise LightGBMError('There are no trees in this Booster and thus nothing to parse') + raise LightGBMError("There are no trees in this Booster and thus nothing to parse") def _is_split_node(tree: Dict[str, Any]) -> bool: - return 'split_index' in tree.keys() + return "split_index" in tree.keys() def create_node_record( tree: Dict[str, Any], node_depth: int = 1, tree_index: Optional[int] = None, feature_names: Optional[List[str]] = None, - parent_node: Optional[str] = None + parent_node: Optional[str] = None, ) -> Dict[str, Any]: - - def _get_node_index( - tree: Dict[str, Any], - tree_index: Optional[int] - ) -> str: - tree_num = f'{tree_index}-' if tree_index is not None else '' + def _get_node_index(tree: Dict[str, Any], tree_index: Optional[int]) -> str: + tree_num = f"{tree_index}-" if tree_index is not None else "" is_split = _is_split_node(tree) - node_type = 'S' if is_split else 'L' + node_type = "S" if is_split else "L" # if a single node tree it won't have `leaf_index` so return 0 - node_num = tree.get('split_index' if is_split else 'leaf_index', 0) + node_num = tree.get("split_index" if is_split else "leaf_index", 0) return f"{tree_num}{node_type}{node_num}" - def _get_split_feature( - tree: Dict[str, Any], - feature_names: Optional[List[str]] - ) -> Optional[str]: + def _get_split_feature(tree: Dict[str, Any], feature_names: Optional[List[str]]) -> Optional[str]: if _is_split_node(tree): if feature_names is not None: - feature_name = feature_names[tree['split_feature']] + feature_name = feature_names[tree["split_feature"]] else: - feature_name = tree['split_feature'] + feature_name = tree["split_feature"] else: feature_name = None return feature_name def _is_single_node_tree(tree: Dict[str, Any]) -> bool: - return set(tree.keys()) == {'leaf_value'} + return set(tree.keys()) == {"leaf_value"} # Create the node record, and populate universal data members node: Dict[str, Union[int, str, None]] = OrderedDict() - node['tree_index'] = tree_index - node['node_depth'] = node_depth - node['node_index'] = _get_node_index(tree, tree_index) - node['left_child'] = None - node['right_child'] = None - node['parent_index'] = parent_node - node['split_feature'] = _get_split_feature(tree, feature_names) - node['split_gain'] = None - node['threshold'] = None - node['decision_type'] = None - node['missing_direction'] = None - node['missing_type'] = None - node['value'] = None - node['weight'] = None - node['count'] = None + node["tree_index"] = tree_index + node["node_depth"] = node_depth + node["node_index"] = _get_node_index(tree, tree_index) + node["left_child"] = None + node["right_child"] = None + node["parent_index"] = parent_node + node["split_feature"] = _get_split_feature(tree, feature_names) + node["split_gain"] = None + node["threshold"] = None + node["decision_type"] = None + node["missing_direction"] = None + node["missing_type"] = None + node["value"] = None + node["weight"] = None + node["count"] = None # Update values to reflect node type (leaf or split) if _is_split_node(tree): - node['left_child'] = _get_node_index(tree['left_child'], tree_index) - node['right_child'] = _get_node_index(tree['right_child'], tree_index) - node['split_gain'] = tree['split_gain'] - node['threshold'] = tree['threshold'] - node['decision_type'] = tree['decision_type'] - node['missing_direction'] = 'left' if tree['default_left'] else 'right' - node['missing_type'] = tree['missing_type'] - node['value'] = tree['internal_value'] - node['weight'] = tree['internal_weight'] - node['count'] = tree['internal_count'] + node["left_child"] = _get_node_index(tree["left_child"], tree_index) + node["right_child"] = _get_node_index(tree["right_child"], tree_index) + node["split_gain"] = tree["split_gain"] + node["threshold"] = tree["threshold"] + node["decision_type"] = tree["decision_type"] + node["missing_direction"] = "left" if tree["default_left"] else "right" + node["missing_type"] = tree["missing_type"] + node["value"] = tree["internal_value"] + node["weight"] = tree["internal_weight"] + node["count"] = tree["internal_count"] else: - node['value'] = tree['leaf_value'] + node["value"] = tree["leaf_value"] if not _is_single_node_tree(tree): - node['weight'] = tree['leaf_weight'] - node['count'] = tree['leaf_count'] + node["weight"] = tree["leaf_weight"] + node["count"] = tree["leaf_count"] return node @@ -3738,27 +3630,28 @@ def tree_dict_to_node_list( node_depth: int = 1, tree_index: Optional[int] = None, feature_names: Optional[List[str]] = None, - parent_node: Optional[str] = None + parent_node: Optional[str] = None, ) -> List[Dict[str, Any]]: - - node = create_node_record(tree=tree, - node_depth=node_depth, - tree_index=tree_index, - feature_names=feature_names, - parent_node=parent_node) + node = create_node_record( + tree=tree, + node_depth=node_depth, + tree_index=tree_index, + feature_names=feature_names, + parent_node=parent_node, + ) res = [node] if _is_split_node(tree): # traverse the next level of the tree - children = ['left_child', 'right_child'] + children = ["left_child", "right_child"] for child in children: subtree_list = tree_dict_to_node_list( tree=tree[child], node_depth=node_depth + 1, tree_index=tree_index, feature_names=feature_names, - parent_node=node['node_index'] + parent_node=node["node_index"], ) # In tree format, "subtree_list" is a list of node records (dicts), # and we add node to the list. @@ -3766,12 +3659,14 @@ def tree_dict_to_node_list( return res model_dict = self.dump_model() - feature_names = model_dict['feature_names'] + feature_names = model_dict["feature_names"] model_list = [] - for tree in model_dict['tree_info']: - model_list.extend(tree_dict_to_node_list(tree=tree['tree_structure'], - tree_index=tree['tree_index'], - feature_names=feature_names)) + for tree in model_dict["tree_info"]: + model_list.extend( + tree_dict_to_node_list( + tree=tree["tree_structure"], tree_index=tree["tree_index"], feature_names=feature_names + ) + ) return pd_DataFrame(model_list, columns=model_list[0].keys()) @@ -3807,13 +3702,10 @@ def add_valid(self, data: Dataset, name: str) -> "Booster": Booster with set validation data. """ if not isinstance(data, Dataset): - raise TypeError(f'Validation data should be Dataset instance, met {type(data).__name__}') + raise TypeError(f"Validation data should be Dataset instance, met {type(data).__name__}") if data._predictor is not self.__init_predictor: - raise LightGBMError("Add validation data failed, " - "you should use same predictor for these data") - _safe_call(_LIB.LGBM_BoosterAddValidData( - self._handle, - data.construct()._handle)) + raise LightGBMError("Add validation data failed, " "you should use same predictor for these data") + _safe_call(_LIB.LGBM_BoosterAddValidData(self._handle, data.construct()._handle)) self.valid_sets.append(data) self.name_valid_sets.append(name) self.__num_dataset += 1 @@ -3836,17 +3728,11 @@ def reset_parameter(self, params: Dict[str, Any]) -> "Booster": """ params_str = _param_dict_to_str(params) if params_str: - _safe_call(_LIB.LGBM_BoosterResetParameter( - self._handle, - _c_str(params_str))) + _safe_call(_LIB.LGBM_BoosterResetParameter(self._handle, _c_str(params_str))) self.params.update(params) return self - def update( - self, - train_set: Optional[Dataset] = None, - fobj: Optional[_LGBM_CustomObjectiveFunction] = None - ) -> bool: + def update(self, train_set: Optional[Dataset] = None, fobj: Optional[_LGBM_CustomObjectiveFunction] = None) -> bool: """Update Booster for one iteration. Parameters @@ -3888,23 +3774,18 @@ def update( is_the_same_train_set = train_set is self.train_set and self.train_set_version == train_set.version if train_set is not None and not is_the_same_train_set: if not isinstance(train_set, Dataset): - raise TypeError(f'Training data should be Dataset instance, met {type(train_set).__name__}') + raise TypeError(f"Training data should be Dataset instance, met {type(train_set).__name__}") if train_set._predictor is not self.__init_predictor: - raise LightGBMError("Replace training data failed, " - "you should use same predictor for these data") + raise LightGBMError("Replace training data failed, " "you should use same predictor for these data") self.train_set = train_set - _safe_call(_LIB.LGBM_BoosterResetTrainingData( - self._handle, - self.train_set.construct()._handle)) + _safe_call(_LIB.LGBM_BoosterResetTrainingData(self._handle, self.train_set.construct()._handle)) self.__inner_predict_buffer[0] = None self.train_set_version = self.train_set.version is_finished = ctypes.c_int(0) if fobj is None: if self.__set_objective_to_none: - raise LightGBMError('Cannot update due to null objective function.') - _safe_call(_LIB.LGBM_BoosterUpdateOneIter( - self._handle, - ctypes.byref(is_finished))) + raise LightGBMError("Cannot update due to null objective function.") + _safe_call(_LIB.LGBM_BoosterUpdateOneIter(self._handle, ctypes.byref(is_finished))) self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] return is_finished.value == 1 else: @@ -3913,11 +3794,7 @@ def update( grad, hess = fobj(self.__inner_predict(0), self.train_set) return self.__boost(grad, hess) - def __boost( - self, - grad: np.ndarray, - hess: np.ndarray - ) -> bool: + def __boost(self, grad: np.ndarray, hess: np.ndarray) -> bool: """Boost Booster for one iteration with customized gradient statistics. .. note:: @@ -3942,10 +3819,10 @@ def __boost( Whether the boost was successfully finished. """ if self.__num_class > 1: - grad = grad.ravel(order='F') - hess = hess.ravel(order='F') - grad = _list_to_1d_numpy(grad, dtype=np.float32, name='gradient') - hess = _list_to_1d_numpy(hess, dtype=np.float32, name='hessian') + grad = grad.ravel(order="F") + hess = hess.ravel(order="F") + grad = _list_to_1d_numpy(grad, dtype=np.float32, name="gradient") + hess = _list_to_1d_numpy(hess, dtype=np.float32, name="hessian") assert grad.flags.c_contiguous assert hess.flags.c_contiguous if len(grad) != len(hess): @@ -3958,11 +3835,14 @@ def __boost( f"number of models per one iteration ({self.__num_class})" ) is_finished = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom( - self._handle, - grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), - hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), - ctypes.byref(is_finished))) + _safe_call( + _LIB.LGBM_BoosterUpdateOneIterCustom( + self._handle, + grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), + ctypes.byref(is_finished), + ) + ) self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] return is_finished.value == 1 @@ -3974,8 +3854,7 @@ def rollback_one_iter(self) -> "Booster": self : Booster Booster with rolled back one iteration. """ - _safe_call(_LIB.LGBM_BoosterRollbackOneIter( - self._handle)) + _safe_call(_LIB.LGBM_BoosterRollbackOneIter(self._handle)) self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] return self @@ -3988,9 +3867,7 @@ def current_iteration(self) -> int: The index of the current iteration. """ out_cur_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetCurrentIteration( - self._handle, - ctypes.byref(out_cur_iter))) + _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(self._handle, ctypes.byref(out_cur_iter))) return out_cur_iter.value def num_model_per_iteration(self) -> int: @@ -4002,9 +3879,7 @@ def num_model_per_iteration(self) -> int: The number of models per iteration. """ model_per_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterNumModelPerIteration( - self._handle, - ctypes.byref(model_per_iter))) + _safe_call(_LIB.LGBM_BoosterNumModelPerIteration(self._handle, ctypes.byref(model_per_iter))) return model_per_iter.value def num_trees(self) -> int: @@ -4016,9 +3891,7 @@ def num_trees(self) -> int: The number of weak sub-models. """ num_trees = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterNumberOfTotalModel( - self._handle, - ctypes.byref(num_trees))) + _safe_call(_LIB.LGBM_BoosterNumberOfTotalModel(self._handle, ctypes.byref(num_trees))) return num_trees.value def upper_bound(self) -> float: @@ -4030,9 +3903,7 @@ def upper_bound(self) -> float: Upper bound value of the model. """ ret = ctypes.c_double(0) - _safe_call(_LIB.LGBM_BoosterGetUpperBoundValue( - self._handle, - ctypes.byref(ret))) + _safe_call(_LIB.LGBM_BoosterGetUpperBoundValue(self._handle, ctypes.byref(ret))) return ret.value def lower_bound(self) -> float: @@ -4044,16 +3915,14 @@ def lower_bound(self) -> float: Lower bound value of the model. """ ret = ctypes.c_double(0) - _safe_call(_LIB.LGBM_BoosterGetLowerBoundValue( - self._handle, - ctypes.byref(ret))) + _safe_call(_LIB.LGBM_BoosterGetLowerBoundValue(self._handle, ctypes.byref(ret))) return ret.value def eval( self, data: Dataset, name: str, - feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None, ) -> List[_LGBM_BoosterEvalMethodResultType]: """Evaluate for data. @@ -4105,8 +3974,7 @@ def eval( return self.__inner_eval(name, data_idx, feval) def eval_train( - self, - feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + self, feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None ) -> List[_LGBM_BoosterEvalMethodResultType]: """Evaluate for training data. @@ -4139,8 +4007,7 @@ def eval_train( return self.__inner_eval(self._train_data_name, 0, feval) def eval_valid( - self, - feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + self, feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None ) -> List[_LGBM_BoosterEvalMethodResultType]: """Evaluate for validation data. @@ -4170,15 +4037,18 @@ def eval_valid( result : list List with (validation_dataset_name, eval_name, eval_result, is_higher_better) tuples. """ - return [item for i in range(1, self.__num_dataset) - for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)] + return [ + item + for i in range(1, self.__num_dataset) + for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval) + ] def save_model( self, filename: Union[str, Path], num_iteration: Optional[int] = None, start_iteration: int = 0, - importance_type: str = 'split' + importance_type: str = "split", ) -> "Booster": """Save Booster to file. @@ -4205,20 +4075,19 @@ def save_model( if num_iteration is None: num_iteration = self.best_iteration importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] - _safe_call(_LIB.LGBM_BoosterSaveModel( - self._handle, - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - ctypes.c_int(importance_type_int), - _c_str(str(filename)))) + _safe_call( + _LIB.LGBM_BoosterSaveModel( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + _c_str(str(filename)), + ) + ) _dump_pandas_categorical(self.pandas_categorical, filename) return self - def shuffle_models( - self, - start_iteration: int = 0, - end_iteration: int = -1 - ) -> "Booster": + def shuffle_models(self, start_iteration: int = 0, end_iteration: int = -1) -> "Booster": """Shuffle models. Parameters @@ -4234,10 +4103,9 @@ def shuffle_models( self : Booster Booster with shuffled models. """ - _safe_call(_LIB.LGBM_BoosterShuffleModels( - self._handle, - ctypes.c_int(start_iteration), - ctypes.c_int(end_iteration))) + _safe_call( + _LIB.LGBM_BoosterShuffleModels(self._handle, ctypes.c_int(start_iteration), ctypes.c_int(end_iteration)) + ) return self def model_from_string(self, model_str: str) -> "Booster": @@ -4259,23 +4127,19 @@ def model_from_string(self, model_str: str) -> "Booster": self._free_buffer() self._handle = ctypes.c_void_p() out_num_iterations = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterLoadModelFromString( - _c_str(model_str), - ctypes.byref(out_num_iterations), - ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_BoosterLoadModelFromString( + _c_str(model_str), ctypes.byref(out_num_iterations), ctypes.byref(self._handle) + ) + ) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses( - self._handle, - ctypes.byref(out_num_class))) + _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(model_str=model_str) return self def model_to_string( - self, - num_iteration: Optional[int] = None, - start_iteration: int = 0, - importance_type: str = 'split' + self, num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = "split" ) -> str: """Save Booster to string. @@ -4304,28 +4168,34 @@ def model_to_string( tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_BoosterSaveModelToString( - self._handle, - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - ctypes.c_int(importance_type_int), - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, re-allocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_BoosterSaveModelToString( + _safe_call( + _LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), ctypes.c_int(num_iteration), ctypes.c_int(importance_type_int), - ctypes.c_int64(actual_len), + ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), - ptr_string_buffer)) - ret = string_buffer.value.decode('utf-8') + ptr_string_buffer, + ) + ) + actual_len = tmp_out_len.value + # if buffer length is not long enough, re-allocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call( + _LIB.LGBM_BoosterSaveModelToString( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer, + ) + ) + ret = string_buffer.value.decode("utf-8") ret += _dump_pandas_categorical(self.pandas_categorical) return ret @@ -4333,8 +4203,8 @@ def dump_model( self, num_iteration: Optional[int] = None, start_iteration: int = 0, - importance_type: str = 'split', - object_hook: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None + importance_type: str = "split", + object_hook: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, ) -> Dict[str, Any]: """Dump Booster to JSON format. @@ -4372,30 +4242,35 @@ def dump_model( tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_BoosterDumpModel( - self._handle, - ctypes.c_int(start_iteration), - ctypes.c_int(num_iteration), - ctypes.c_int(importance_type_int), - ctypes.c_int64(buffer_len), - ctypes.byref(tmp_out_len), - ptr_string_buffer)) - actual_len = tmp_out_len.value - # if buffer length is not long enough, reallocate a buffer - if actual_len > buffer_len: - string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_BoosterDumpModel( + _safe_call( + _LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), ctypes.c_int(num_iteration), ctypes.c_int(importance_type_int), - ctypes.c_int64(actual_len), + ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), - ptr_string_buffer)) - ret = json.loads(string_buffer.value.decode('utf-8'), object_hook=object_hook) - ret['pandas_categorical'] = json.loads(json.dumps(self.pandas_categorical, - default=_json_default_with_numpy)) + ptr_string_buffer, + ) + ) + actual_len = tmp_out_len.value + # if buffer length is not long enough, reallocate a buffer + if actual_len > buffer_len: + string_buffer = ctypes.create_string_buffer(actual_len) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) + _safe_call( + _LIB.LGBM_BoosterDumpModel( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + ctypes.c_int(importance_type_int), + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer, + ) + ) + ret = json.loads(string_buffer.value.decode("utf-8"), object_hook=object_hook) + ret["pandas_categorical"] = json.loads(json.dumps(self.pandas_categorical, default=_json_default_with_numpy)) return ret def predict( @@ -4408,7 +4283,7 @@ def predict( pred_contrib: bool = False, data_has_header: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ) -> Union[np.ndarray, scipy.sparse.spmatrix, List[scipy.sparse.spmatrix]]: """Make a prediction. @@ -4472,7 +4347,7 @@ def predict( pred_leaf=pred_leaf, pred_contrib=pred_contrib, data_has_header=data_has_header, - validate_features=validate_features + validate_features=validate_features, ) def refit( @@ -4484,12 +4359,12 @@ def refit( weight: Optional[_LGBM_WeightType] = None, group: Optional[_LGBM_GroupType] = None, init_score: Optional[_LGBM_InitScoreType] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", dataset_params: Optional[Dict[str, Any]] = None, free_raw_data: bool = True, validate_features: bool = False, - **kwargs + **kwargs, ) -> "Booster": """Refit the existing Booster by new data. @@ -4572,29 +4447,17 @@ def refit( Refitted Booster. """ if self.__set_objective_to_none: - raise LightGBMError('Cannot refit due to null objective function.') + raise LightGBMError("Cannot refit due to null objective function.") if dataset_params is None: dataset_params = {} - predictor = _InnerPredictor.from_booster( - booster=self, - pred_parameter=deepcopy(kwargs) - ) + predictor = _InnerPredictor.from_booster(booster=self, pred_parameter=deepcopy(kwargs)) leaf_preds: np.ndarray = predictor.predict( # type: ignore[assignment] - data=data, - start_iteration=-1, - pred_leaf=True, - validate_features=validate_features + data=data, start_iteration=-1, pred_leaf=True, validate_features=validate_features ) nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetLinear( - self._handle, - ctypes.byref(out_is_linear))) - new_params = _choose_param_value( - main_param_name="linear_tree", - params=self.params, - default_value=None - ) + _safe_call(_LIB.LGBM_BoosterGetLinear(self._handle, ctypes.byref(out_is_linear))) + new_params = _choose_param_value(main_param_name="linear_tree", params=self.params, default_value=None) new_params["linear_tree"] = bool(out_is_linear.value) new_params.update(dataset_params) train_set = Dataset( @@ -4609,19 +4472,13 @@ def refit( params=new_params, free_raw_data=free_raw_data, ) - new_params['refit_decay_rate'] = decay_rate + new_params["refit_decay_rate"] = decay_rate new_booster = Booster(new_params, train_set) # Copy models - _safe_call(_LIB.LGBM_BoosterMerge( - new_booster._handle, - predictor._handle)) + _safe_call(_LIB.LGBM_BoosterMerge(new_booster._handle, predictor._handle)) leaf_preds = leaf_preds.reshape(-1) ptr_data, _, _ = _c_int_array(leaf_preds) - _safe_call(_LIB.LGBM_BoosterRefit( - new_booster._handle, - ptr_data, - ctypes.c_int32(nrow), - ctypes.c_int32(ncol))) + _safe_call(_LIB.LGBM_BoosterRefit(new_booster._handle, ptr_data, ctypes.c_int32(nrow), ctypes.c_int32(ncol))) new_booster._network = self._network return new_booster @@ -4641,11 +4498,9 @@ def get_leaf_output(self, tree_id: int, leaf_id: int) -> float: The output of the leaf. """ ret = ctypes.c_double(0) - _safe_call(_LIB.LGBM_BoosterGetLeafValue( - self._handle, - ctypes.c_int(tree_id), - ctypes.c_int(leaf_id), - ctypes.byref(ret))) + _safe_call( + _LIB.LGBM_BoosterGetLeafValue(self._handle, ctypes.c_int(tree_id), ctypes.c_int(leaf_id), ctypes.byref(ret)) + ) return ret.value def set_leaf_output( @@ -4653,7 +4508,7 @@ def set_leaf_output( tree_id: int, leaf_id: int, value: float, - ) -> 'Booster': + ) -> "Booster": """Set the output of a leaf. .. versionadded:: 4.0.0 @@ -4674,10 +4529,7 @@ def set_leaf_output( """ _safe_call( _LIB.LGBM_BoosterSetLeafValue( - self._handle, - ctypes.c_int(tree_id), - ctypes.c_int(leaf_id), - ctypes.c_double(value) + self._handle, ctypes.c_int(tree_id), ctypes.c_int(leaf_id), ctypes.c_double(value) ) ) return self @@ -4691,9 +4543,7 @@ def num_feature(self) -> int: The number of features. """ out_num_feature = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumFeature( - self._handle, - ctypes.byref(out_num_feature))) + _safe_call(_LIB.LGBM_BoosterGetNumFeature(self._handle, ctypes.byref(out_num_feature))) return out_num_feature.value def feature_name(self) -> List[str]: @@ -4711,13 +4561,16 @@ def feature_name(self) -> List[str]: required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] - _safe_call(_LIB.LGBM_BoosterGetFeatureNames( - self._handle, - ctypes.c_int(num_feature), - ctypes.byref(tmp_out_len), - ctypes.c_size_t(reserved_string_buffer_size), - ctypes.byref(required_string_buffer_size), - ptr_string_buffers)) + _safe_call( + _LIB.LGBM_BoosterGetFeatureNames( + self._handle, + ctypes.c_int(num_feature), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(reserved_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers, + ) + ) if num_feature != tmp_out_len.value: raise ValueError("Length of feature names doesn't equal with num_feature") actual_string_buffer_size = required_string_buffer_size.value @@ -4725,20 +4578,19 @@ def feature_name(self) -> List[str]: if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] - _safe_call(_LIB.LGBM_BoosterGetFeatureNames( - self._handle, - ctypes.c_int(num_feature), - ctypes.byref(tmp_out_len), - ctypes.c_size_t(actual_string_buffer_size), - ctypes.byref(required_string_buffer_size), - ptr_string_buffers)) - return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)] + _safe_call( + _LIB.LGBM_BoosterGetFeatureNames( + self._handle, + ctypes.c_int(num_feature), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(actual_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers, + ) + ) + return [string_buffers[i].value.decode("utf-8") for i in range(num_feature)] - def feature_importance( - self, - importance_type: str = 'split', - iteration: Optional[int] = None - ) -> np.ndarray: + def feature_importance(self, importance_type: str = "split", iteration: Optional[int] = None) -> np.ndarray: """Get feature importances. Parameters @@ -4761,21 +4613,21 @@ def feature_importance( iteration = self.best_iteration importance_type_int = _FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type] result = np.empty(self.num_feature(), dtype=np.float64) - _safe_call(_LIB.LGBM_BoosterFeatureImportance( - self._handle, - ctypes.c_int(iteration), - ctypes.c_int(importance_type_int), - result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + _safe_call( + _LIB.LGBM_BoosterFeatureImportance( + self._handle, + ctypes.c_int(iteration), + ctypes.c_int(importance_type_int), + result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), + ) + ) if importance_type_int == _C_API_FEATURE_IMPORTANCE_SPLIT: return result.astype(np.int32) else: return result def get_split_value_histogram( - self, - feature: Union[int, str], - bins: Optional[Union[int, str]] = None, - xgboost_style: bool = False + self, feature: Union[int, str], bins: Optional[Union[int, str]] = None, xgboost_style: bool = False ) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray, pd_DataFrame]: """Get split value histogram for the specified feature. @@ -4809,27 +4661,28 @@ def get_split_value_histogram( result_array_like : numpy array or pandas DataFrame (if pandas is installed) If ``xgboost_style=True``, the histogram of used splitting values for the specified feature. """ + def add(root: Dict[str, Any]) -> None: """Recursively add thresholds.""" - if 'split_index' in root: # non-leaf + if "split_index" in root: # non-leaf if feature_names is not None and isinstance(feature, str): - split_feature = feature_names[root['split_feature']] + split_feature = feature_names[root["split_feature"]] else: - split_feature = root['split_feature'] + split_feature = root["split_feature"] if split_feature == feature: - if isinstance(root['threshold'], str): - raise LightGBMError('Cannot compute split value histogram for the categorical feature') + if isinstance(root["threshold"], str): + raise LightGBMError("Cannot compute split value histogram for the categorical feature") else: - values.append(root['threshold']) - add(root['left_child']) - add(root['right_child']) + values.append(root["threshold"]) + add(root["left_child"]) + add(root["right_child"]) model = self.dump_model() - feature_names = model.get('feature_names') - tree_infos = model['tree_info'] + feature_names = model.get("feature_names") + tree_infos = model["tree_info"] values: List[float] = [] for tree_info in tree_infos: - add(tree_info['tree_structure']) + add(tree_info["tree_structure"]) if bins is None or isinstance(bins, int) and xgboost_style: n_unique = len(np.unique(values)) @@ -4839,7 +4692,7 @@ def add(root: Dict[str, Any]) -> None: ret = np.column_stack((bin_edges[1:], hist)) ret = ret[ret[:, 1] > 0] if PANDAS_INSTALLED: - return pd_DataFrame(ret, columns=['SplitValue', 'Count']) + return pd_DataFrame(ret, columns=["SplitValue", "Count"]) else: return ret else: @@ -4849,7 +4702,7 @@ def __inner_eval( self, data_name: str, data_idx: int, - feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]], ) -> List[_LGBM_BoosterEvalMethodResultType]: """Evaluate training or validation data.""" if data_idx >= self.__num_dataset: @@ -4859,16 +4712,18 @@ def __inner_eval( if self.__num_inner_eval > 0: result = np.empty(self.__num_inner_eval, dtype=np.float64) tmp_out_len = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetEval( - self._handle, - ctypes.c_int(data_idx), - ctypes.byref(tmp_out_len), - result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + _safe_call( + _LIB.LGBM_BoosterGetEval( + self._handle, + ctypes.c_int(data_idx), + ctypes.byref(tmp_out_len), + result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), + ) + ) if tmp_out_len.value != self.__num_inner_eval: raise ValueError("Wrong length of eval results") for i in range(self.__num_inner_eval): - ret.append((data_name, self.__name_inner_eval[i], - result[i], self.__higher_better_inner_eval[i])) + ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i])) if callable(feval): feval = [feval] if feval is not None: @@ -4902,18 +4757,16 @@ def __inner_predict(self, data_idx: int) -> np.ndarray: if not self.__is_predicted_cur_iter[data_idx]: tmp_out_len = ctypes.c_int64(0) data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double)) # type: ignore[union-attr] - _safe_call(_LIB.LGBM_BoosterGetPredict( - self._handle, - ctypes.c_int(data_idx), - ctypes.byref(tmp_out_len), - data_ptr)) + _safe_call( + _LIB.LGBM_BoosterGetPredict(self._handle, ctypes.c_int(data_idx), ctypes.byref(tmp_out_len), data_ptr) + ) if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): # type: ignore[arg-type] raise ValueError(f"Wrong length of predict results for data {data_idx}") self.__is_predicted_cur_iter[data_idx] = True result: np.ndarray = self.__inner_predict_buffer[data_idx] # type: ignore[assignment] if self.__num_class > 1: num_data = result.size // self.__num_class - result = result.reshape(num_data, self.__num_class, order='F') + result = result.reshape(num_data, self.__num_class, order="F") return result def __get_eval_info(self) -> None: @@ -4922,9 +4775,7 @@ def __get_eval_info(self) -> None: self.__need_reload_eval_info = False out_num_eval = ctypes.c_int(0) # Get num of inner evals - _safe_call(_LIB.LGBM_BoosterGetEvalCounts( - self._handle, - ctypes.byref(out_num_eval))) + _safe_call(_LIB.LGBM_BoosterGetEvalCounts(self._handle, ctypes.byref(out_num_eval))) self.__num_inner_eval = out_num_eval.value if self.__num_inner_eval > 0: # Get name of eval metrics @@ -4935,13 +4786,16 @@ def __get_eval_info(self) -> None: ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(self.__num_inner_eval) ] ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] - _safe_call(_LIB.LGBM_BoosterGetEvalNames( - self._handle, - ctypes.c_int(self.__num_inner_eval), - ctypes.byref(tmp_out_len), - ctypes.c_size_t(reserved_string_buffer_size), - ctypes.byref(required_string_buffer_size), - ptr_string_buffers)) + _safe_call( + _LIB.LGBM_BoosterGetEvalNames( + self._handle, + ctypes.c_int(self.__num_inner_eval), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(reserved_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers, + ) + ) if self.__num_inner_eval != tmp_out_len.value: raise ValueError("Length of eval names doesn't equal with num_evals") actual_string_buffer_size = required_string_buffer_size.value @@ -4950,17 +4804,20 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] - _safe_call(_LIB.LGBM_BoosterGetEvalNames( - self._handle, - ctypes.c_int(self.__num_inner_eval), - ctypes.byref(tmp_out_len), - ctypes.c_size_t(actual_string_buffer_size), - ctypes.byref(required_string_buffer_size), - ptr_string_buffers)) - self.__name_inner_eval = [ - string_buffers[i].value.decode('utf-8') for i in range(self.__num_inner_eval) - ] + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)( + *map(ctypes.addressof, string_buffers) + ) # type: ignore[misc] + _safe_call( + _LIB.LGBM_BoosterGetEvalNames( + self._handle, + ctypes.c_int(self.__num_inner_eval), + ctypes.byref(tmp_out_len), + ctypes.c_size_t(actual_string_buffer_size), + ctypes.byref(required_string_buffer_size), + ptr_string_buffers, + ) + ) + self.__name_inner_eval = [string_buffers[i].value.decode("utf-8") for i in range(self.__num_inner_eval)] self.__higher_better_inner_eval = [ - name.startswith(('auc', 'ndcg@', 'map@', 'average_precision')) for name in self.__name_inner_eval + name.startswith(("auc", "ndcg@", "map@", "average_precision")) for name in self.__name_inner_eval ] diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 0a4fa65a5d85..a6683421325c 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -18,21 +18,17 @@ from .engine import CVBooster __all__ = [ - 'EarlyStopException', - 'early_stopping', - 'log_evaluation', - 'record_evaluation', - 'reset_parameter', + "EarlyStopException", + "early_stopping", + "log_evaluation", + "record_evaluation", + "reset_parameter", ] _EvalResultDict = Dict[str, Dict[str, List[Any]]] -_EvalResultTuple = Union[ - _LGBM_BoosterEvalMethodResultType, - _LGBM_BoosterEvalMethodResultWithStandardDeviationType -] +_EvalResultTuple = Union[_LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType] _ListOfEvalResultTuples = Union[ - List[_LGBM_BoosterEvalMethodResultType], - List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] + List[_LGBM_BoosterEvalMethodResultType], List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] ] @@ -95,8 +91,8 @@ def __init__(self, period: int = 1, show_stdv: bool = True) -> None: def __call__(self, env: CallbackEnv) -> None: if self.period > 0 and env.evaluation_result_list and (env.iteration + 1) % self.period == 0: - result = '\t'.join([_format_eval_result(x, self.show_stdv) for x in env.evaluation_result_list]) - _log_info(f'[{env.iteration + 1}]\t{result}') + result = "\t".join([_format_eval_result(x, self.show_stdv) for x in env.evaluation_result_list]) + _log_info(f"[{env.iteration + 1}]\t{result}") def log_evaluation(period: int = 1, show_stdv: bool = True) -> _LogEvaluationCallback: @@ -133,7 +129,7 @@ def __init__(self, eval_result: _EvalResultDict) -> None: self.before_iteration = False if not isinstance(eval_result, dict): - raise TypeError('eval_result should be a dictionary') + raise TypeError("eval_result should be a dictionary") self.eval_result = eval_result def _init(self, env: CallbackEnv) -> None: @@ -152,8 +148,8 @@ def _init(self, env: CallbackEnv) -> None: if len(item) == 4: self.eval_result[data_name].setdefault(eval_name, []) else: - self.eval_result[data_name].setdefault(f'{eval_name}-mean', []) - self.eval_result[data_name].setdefault(f'{eval_name}-stdv', []) + self.eval_result[data_name].setdefault(f"{eval_name}-mean", []) + self.eval_result[data_name].setdefault(f"{eval_name}-stdv", []) def __call__(self, env: CallbackEnv) -> None: if env.iteration == env.begin_iteration: @@ -171,8 +167,8 @@ def __call__(self, env: CallbackEnv) -> None: data_name, eval_name = item[1].split() res_mean = item[2] res_stdv = item[4] # type: ignore[misc] - self.eval_result[data_name][f'{eval_name}-mean'].append(res_mean) - self.eval_result[data_name][f'{eval_name}-stdv'].append(res_stdv) + self.eval_result[data_name][f"{eval_name}-mean"].append(res_mean) + self.eval_result[data_name][f"{eval_name}-stdv"].append(res_stdv) def record_evaluation(eval_result: Dict[str, Dict[str, List[Any]]]) -> Callable: @@ -230,8 +226,10 @@ def __call__(self, env: CallbackEnv) -> None: elif callable(value): new_param = value(env.iteration - env.begin_iteration) else: - raise ValueError("Only list and callable values are supported " - "as a mapping from boosting round index to new parameter value.") + raise ValueError( + "Only list and callable values are supported " + "as a mapping from boosting round index to new parameter value." + ) if new_param != env.params.get(key, None): new_parameters[key] = new_param if new_parameters: @@ -276,9 +274,8 @@ def __init__( stopping_rounds: int, first_metric_only: bool = False, verbose: bool = True, - min_delta: Union[float, List[float]] = 0.0 + min_delta: Union[float, List[float]] = 0.0, ) -> None: - if not isinstance(stopping_rounds, int) or stopping_rounds <= 0: raise ValueError(f"stopping_rounds should be an integer and greater than 0. got: {stopping_rounds}") @@ -298,7 +295,7 @@ def _reset_storages(self) -> None: self.best_iter: List[int] = [] self.best_score_list: List[_ListOfEvalResultTuples] = [] self.cmp_op: List[Callable[[float, float], bool]] = [] - self.first_metric = '' + self.first_metric = "" def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: return curr_score > best_score + delta @@ -321,29 +318,24 @@ def _is_train_set(self, ds_name: str, eval_name: str, env: CallbackEnv) -> bool: def _init(self, env: CallbackEnv) -> None: if env.evaluation_result_list is None or env.evaluation_result_list == []: - raise ValueError( - "For early stopping, at least one dataset and eval metric is required for evaluation" - ) + raise ValueError("For early stopping, at least one dataset and eval metric is required for evaluation") - is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) + is_dart = any(env.params.get(alias, "") == "dart" for alias in _ConfigAliases.get("boosting")) if is_dart: self.enabled = False - _log_warning('Early stopping is not available in dart mode') + _log_warning("Early stopping is not available in dart mode") return # validation sets are guaranteed to not be identical to the training data in cv() if isinstance(env.model, Booster): - only_train_set = ( - len(env.evaluation_result_list) == 1 - and self._is_train_set( - ds_name=env.evaluation_result_list[0][0], - eval_name=env.evaluation_result_list[0][1].split(" ")[0], - env=env - ) + only_train_set = len(env.evaluation_result_list) == 1 and self._is_train_set( + ds_name=env.evaluation_result_list[0][0], + eval_name=env.evaluation_result_list[0][1].split(" ")[0], + env=env, ) if only_train_set: self.enabled = False - _log_warning('Only training set found, disabling early stopping.') + _log_warning("Only training set found, disabling early stopping.") return if self.verbose: @@ -355,26 +347,26 @@ def _init(self, env: CallbackEnv) -> None: n_datasets = len(env.evaluation_result_list) // n_metrics if isinstance(self.min_delta, list): if not all(t >= 0 for t in self.min_delta): - raise ValueError('Values for early stopping min_delta must be non-negative.') + raise ValueError("Values for early stopping min_delta must be non-negative.") if len(self.min_delta) == 0: if self.verbose: - _log_info('Disabling min_delta for early stopping.') + _log_info("Disabling min_delta for early stopping.") deltas = [0.0] * n_datasets * n_metrics elif len(self.min_delta) == 1: if self.verbose: - _log_info(f'Using {self.min_delta[0]} as min_delta for all metrics.') + _log_info(f"Using {self.min_delta[0]} as min_delta for all metrics.") deltas = self.min_delta * n_datasets * n_metrics else: if len(self.min_delta) != n_metrics: - raise ValueError('Must provide a single value for min_delta or as many as metrics.') + raise ValueError("Must provide a single value for min_delta or as many as metrics.") if self.first_metric_only and self.verbose: - _log_info(f'Using only {self.min_delta[0]} as early stopping min_delta.') + _log_info(f"Using only {self.min_delta[0]} as early stopping min_delta.") deltas = self.min_delta * n_datasets else: if self.min_delta < 0: - raise ValueError('Early stopping min_delta must be non-negative.') + raise ValueError("Early stopping min_delta must be non-negative.") if self.min_delta > 0 and n_metrics > 1 and not self.first_metric_only and self.verbose: - _log_info(f'Using {self.min_delta} as min_delta for all metrics.') + _log_info(f"Using {self.min_delta} as min_delta for all metrics.") deltas = [self.min_delta] * n_datasets * n_metrics # split is needed for " " case (e.g. "train l1") @@ -382,18 +374,19 @@ def _init(self, env: CallbackEnv) -> None: for eval_ret, delta in zip(env.evaluation_result_list, deltas): self.best_iter.append(0) if eval_ret[3]: # greater is better - self.best_score.append(float('-inf')) + self.best_score.append(float("-inf")) self.cmp_op.append(partial(self._gt_delta, delta=delta)) else: - self.best_score.append(float('inf')) + self.best_score.append(float("inf")) self.cmp_op.append(partial(self._lt_delta, delta=delta)) def _final_iteration_check(self, env: CallbackEnv, eval_name_splitted: List[str], i: int) -> None: if env.iteration == env.end_iteration - 1: if self.verbose: - best_score_str = '\t'.join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]]) - _log_info('Did not meet early stopping. ' - f'Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}') + best_score_str = "\t".join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]]) + _log_info( + "Did not meet early stopping. " f"Best iteration is:\n[{self.best_iter[i] + 1}]\t{best_score_str}" + ) if self.first_metric_only: _log_info(f"Evaluated only: {eval_name_splitted[-1]}") raise EarlyStopException(self.best_iter[i], self.best_score_list[i]) @@ -409,7 +402,7 @@ def __call__(self, env: CallbackEnv) -> None: "Please report it at https://github.com/microsoft/LightGBM/issues" ) # self.best_score_list is initialized to an empty list - first_time_updating_best_score_list = (self.best_score_list == []) + first_time_updating_best_score_list = self.best_score_list == [] for i in range(len(env.evaluation_result_list)): score = env.evaluation_result_list[i][2] if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]): @@ -423,15 +416,13 @@ def __call__(self, env: CallbackEnv) -> None: eval_name_splitted = env.evaluation_result_list[i][1].split(" ") if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: continue # use only the first metric for early stopping - if self._is_train_set( - ds_name=env.evaluation_result_list[i][0], - eval_name=eval_name_splitted[0], - env=env - ): + if self._is_train_set(ds_name=env.evaluation_result_list[i][0], eval_name=eval_name_splitted[0], env=env): continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train) elif env.iteration - self.best_iter[i] >= self.stopping_rounds: if self.verbose: - eval_result_str = '\t'.join([_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]]) + eval_result_str = "\t".join( + [_format_eval_result(x, show_stdv=True) for x in self.best_score_list[i]] + ) _log_info(f"Early stopping, best iteration is:\n[{self.best_iter[i] + 1}]\t{eval_result_str}") if self.first_metric_only: _log_info(f"Evaluated only: {eval_name_splitted[-1]}") @@ -439,7 +430,12 @@ def __call__(self, env: CallbackEnv) -> None: self._final_iteration_check(env, eval_name_splitted, i) -def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbose: bool = True, min_delta: Union[float, List[float]] = 0.0) -> _EarlyStoppingCallback: +def early_stopping( + stopping_rounds: int, + first_metric_only: bool = False, + verbose: bool = True, + min_delta: Union[float, List[float]] = 0.0, +) -> _EarlyStoppingCallback: """Create a callback that activates early stopping. Activates early stopping. @@ -473,4 +469,6 @@ def early_stopping(stopping_rounds: int, first_metric_only: bool = False, verbos callback : _EarlyStoppingCallback The callback that activates early stopping. """ - return _EarlyStoppingCallback(stopping_rounds=stopping_rounds, first_metric_only=first_metric_only, verbose=verbose, min_delta=min_delta) + return _EarlyStoppingCallback( + stopping_rounds=stopping_rounds, first_metric_only=first_metric_only, verbose=verbose, min_delta=min_delta + ) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index bd1b29a1e802..086c6a199ff3 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -8,6 +8,7 @@ from pandas import DataFrame as pd_DataFrame from pandas import Series as pd_Series from pandas import concat + try: from pandas import CategoricalDtype as pd_CategoricalDtype except ImportError: @@ -40,15 +41,18 @@ def __init__(self, *args, **kwargs): try: from numpy.random import Generator as np_random_Generator except ImportError: + class np_random_Generator: # type: ignore """Dummy class for np.random.Generator.""" def __init__(self, *args, **kwargs): pass + """matplotlib""" try: import matplotlib # noqa: F401 + MATPLOTLIB_INSTALLED = True except ImportError: MATPLOTLIB_INSTALLED = False @@ -56,6 +60,7 @@ def __init__(self, *args, **kwargs): """graphviz""" try: import graphviz # noqa: F401 + GRAPHVIZ_INSTALLED = True except ImportError: GRAPHVIZ_INSTALLED = False @@ -63,6 +68,7 @@ def __init__(self, *args, **kwargs): """datatable""" try: import datatable + if hasattr(datatable, "Frame"): dt_DataTable = datatable.Frame else: @@ -85,6 +91,7 @@ def __init__(self, *args, **kwargs): from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import assert_all_finite, check_array, check_X_y + try: from sklearn.exceptions import NotFittedError from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold @@ -155,6 +162,7 @@ class _LGBMRegressorBase: # type: ignore from dask.dataframe import DataFrame as dask_DataFrame from dask.dataframe import Series as dask_Series from dask.distributed import Client, Future, default_client, wait + DASK_INSTALLED = True except ImportError: DASK_INSTALLED = False @@ -195,6 +203,7 @@ class dask_Series: # type: ignore def __init__(self, *args, **kwargs): pass + """pyarrow""" try: import pyarrow.compute as pa_compute @@ -205,6 +214,7 @@ def __init__(self, *args, **kwargs): from pyarrow.cffi import ffi as arrow_cffi from pyarrow.types import is_floating as arrow_is_floating from pyarrow.types import is_integer as arrow_is_integer + PYARROW_INSTALLED = True except ImportError: PYARROW_INSTALLED = False @@ -266,4 +276,5 @@ def _LGBMCpuCount(only_physical_cores: bool = True) -> int: def _LGBMCpuCount(only_physical_cores: bool = True) -> int: return cpu_count() + __all__: List[str] = [] diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index ee8bf58ce463..333600ac7566 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -51,9 +51,9 @@ ) __all__ = [ - 'DaskLGBMClassifier', - 'DaskLGBMRanker', - 'DaskLGBMRegressor', + "DaskLGBMClassifier", + "DaskLGBMRanker", + "DaskLGBMRegressor", ] _DaskCollection = Union[dask_Array, dask_DataFrame, dask_Series] @@ -67,7 +67,7 @@ class _RemoteSocket: def acquire(self) -> int: self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - self.socket.bind(('', 0)) + self.socket.bind(("", 0)) return self.socket.getsockname()[1] def release(self) -> None: @@ -153,9 +153,11 @@ def _concat(seq: List[_DaskPart]) -> _DaskPart: elif isinstance(seq[0], (pd_DataFrame, pd_Series)): return concat(seq, axis=0) elif isinstance(seq[0], ss.spmatrix): - return ss.vstack(seq, format='csr') + return ss.vstack(seq, format="csr") else: - raise TypeError(f'Data must be one of: numpy arrays, pandas dataframes, sparse matrices (from scipy). Got {type(seq[0]).__name__}.') + raise TypeError( + f"Data must be one of: numpy arrays, pandas dataframes, sparse matrices (from scipy). Got {type(seq[0]).__name__}." + ) def _remove_list_padding(*args: Any) -> List[List[Any]]: @@ -186,41 +188,41 @@ def _train_part( return_model: bool, time_out: int, remote_socket: _RemoteSocket, - **kwargs: Any + **kwargs: Any, ) -> Optional[LGBMModel]: network_params = { - 'machines': machines, - 'local_listen_port': local_listen_port, - 'time_out': time_out, - 'num_machines': num_machines + "machines": machines, + "local_listen_port": local_listen_port, + "time_out": time_out, + "num_machines": num_machines, } params.update(network_params) is_ranker = issubclass(model_factory, LGBMRanker) # Concatenate many parts into one - data = _concat([x['data'] for x in list_of_parts]) - label = _concat([x['label'] for x in list_of_parts]) + data = _concat([x["data"] for x in list_of_parts]) + label = _concat([x["label"] for x in list_of_parts]) - if 'weight' in list_of_parts[0]: - weight = _concat([x['weight'] for x in list_of_parts]) + if "weight" in list_of_parts[0]: + weight = _concat([x["weight"] for x in list_of_parts]) else: weight = None - if 'group' in list_of_parts[0]: - group = _concat([x['group'] for x in list_of_parts]) + if "group" in list_of_parts[0]: + group = _concat([x["group"] for x in list_of_parts]) else: group = None - if 'init_score' in list_of_parts[0]: - init_score = _concat([x['init_score'] for x in list_of_parts]) + if "init_score" in list_of_parts[0]: + init_score = _concat([x["init_score"] for x in list_of_parts]) else: init_score = None # construct local eval_set data. - n_evals = max(len(x.get('eval_set', [])) for x in list_of_parts) - eval_names = kwargs.pop('eval_names', None) - eval_class_weight = kwargs.get('eval_class_weight') + n_evals = max(len(x.get("eval_set", [])) for x in list_of_parts) + eval_names = kwargs.pop("eval_names", None) + eval_class_weight = kwargs.get("eval_class_weight") local_eval_set = None local_eval_names = None local_eval_sample_weight = None @@ -228,8 +230,8 @@ def _train_part( local_eval_group = None if n_evals: - has_eval_sample_weight = any(x.get('eval_sample_weight') is not None for x in list_of_parts) - has_eval_init_score = any(x.get('eval_init_score') is not None for x in list_of_parts) + has_eval_sample_weight = any(x.get("eval_sample_weight") is not None for x in list_of_parts) + has_eval_init_score = any(x.get("eval_init_score") is not None for x in list_of_parts) local_eval_set = [] evals_result_names = [] @@ -251,7 +253,7 @@ def _train_part( init_score_e = [] g_e = [] for part in list_of_parts: - if not part.get('eval_set'): + if not part.get("eval_set"): continue # require that eval_name exists in evaluated result data in case dropped due to padding. @@ -259,12 +261,12 @@ def _train_part( if eval_names: evals_result_name = eval_names[i] else: - evals_result_name = f'valid_{i}' + evals_result_name = f"valid_{i}" - eval_set = part['eval_set'][i] + eval_set = part["eval_set"][i] if eval_set is _DatasetNames.TRAINSET: - x_e.append(part['data']) - y_e.append(part['label']) + x_e.append(part["data"]) + y_e.append(part["label"]) else: x_e.extend(eval_set[0]) y_e.extend(eval_set[1]) @@ -272,24 +274,24 @@ def _train_part( if evals_result_name not in evals_result_names: evals_result_names.append(evals_result_name) - eval_weight = part.get('eval_sample_weight') + eval_weight = part.get("eval_sample_weight") if eval_weight: if eval_weight[i] is _DatasetNames.SAMPLE_WEIGHT: - w_e.append(part['weight']) + w_e.append(part["weight"]) else: w_e.extend(eval_weight[i]) - eval_init_score = part.get('eval_init_score') + eval_init_score = part.get("eval_init_score") if eval_init_score: if eval_init_score[i] is _DatasetNames.INIT_SCORE: - init_score_e.append(part['init_score']) + init_score_e.append(part["init_score"]) else: init_score_e.extend(eval_init_score[i]) - eval_group = part.get('eval_group') + eval_group = part.get("eval_group") if eval_group: if eval_group[i] is _DatasetNames.GROUP: - g_e.append(part['group']) + g_e.append(part["group"]) else: g_e.extend(eval_group[i]) @@ -313,7 +315,7 @@ def _train_part( if eval_names: local_eval_names = [eval_names[i] for i in eval_component_idx] if eval_class_weight: - kwargs['eval_class_weight'] = [eval_class_weight[i] for i in eval_component_idx] + kwargs["eval_class_weight"] = [eval_class_weight[i] for i in eval_component_idx] model = model_factory(**params) if remote_socket is not None: @@ -331,7 +333,7 @@ def _train_part( eval_init_score=local_eval_init_score, eval_group=local_eval_group, eval_names=local_eval_names, - **kwargs + **kwargs, ) else: model.fit( @@ -343,7 +345,7 @@ def _train_part( eval_sample_weight=local_eval_sample_weight, eval_init_score=local_eval_init_score, eval_names=local_eval_names, - **kwargs + **kwargs, ) finally: @@ -389,7 +391,9 @@ def _machines_to_worker_map(machines: str, worker_addresses: Iterable[str]) -> D machine_addresses = machines.split(",") if len(set(machine_addresses)) != len(machine_addresses): - raise ValueError(f"Found duplicates in 'machines' ({machines}). Each entry in 'machines' must be a unique IP-port combination.") + raise ValueError( + f"Found duplicates in 'machines' ({machines}). Each entry in 'machines' must be a unique IP-port combination." + ) machine_to_port = defaultdict(set) for address in machine_addresses: @@ -423,7 +427,7 @@ def _train( eval_group: Optional[List[_DaskVectorLike]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_at: Optional[Union[List[int], Tuple[int, ...]]] = None, - **kwargs: Any + **kwargs: Any, ) -> LGBMModel: """Inner train routine. @@ -512,36 +516,23 @@ def _train( params = deepcopy(params) # capture whether local_listen_port or its aliases were provided - listen_port_in_params = any( - alias in params for alias in _ConfigAliases.get("local_listen_port") - ) + listen_port_in_params = any(alias in params for alias in _ConfigAliases.get("local_listen_port")) # capture whether machines or its aliases were provided - machines_in_params = any( - alias in params for alias in _ConfigAliases.get("machines") - ) + machines_in_params = any(alias in params for alias in _ConfigAliases.get("machines")) - params = _choose_param_value( - main_param_name="tree_learner", - params=params, - default_value="data" - ) - allowed_tree_learners = { - 'data', - 'data_parallel', - 'feature', - 'feature_parallel', - 'voting', - 'voting_parallel' - } + params = _choose_param_value(main_param_name="tree_learner", params=params, default_value="data") + allowed_tree_learners = {"data", "data_parallel", "feature", "feature_parallel", "voting", "voting_parallel"} if params["tree_learner"] not in allowed_tree_learners: - _log_warning(f'Parameter tree_learner set to {params["tree_learner"]}, which is not allowed. Using "data" as default') - params['tree_learner'] = 'data' + _log_warning( + f'Parameter tree_learner set to {params["tree_learner"]}, which is not allowed. Using "data" as default' + ) + params["tree_learner"] = "data" # Some passed-in parameters can be removed: # * 'num_machines': set automatically from Dask worker list # * 'num_threads': overridden to match nthreads on each Dask process - for param_alias in _ConfigAliases.get('num_machines', 'num_threads'): + for param_alias in _ConfigAliases.get("num_machines", "num_threads"): if param_alias in params: _log_warning(f"Parameter {param_alias} will be ignored.") params.pop(param_alias) @@ -549,23 +540,23 @@ def _train( # Split arrays/dataframes into parts. Arrange parts into dicts to enforce co-locality data_parts = _split_to_parts(data=data, is_matrix=True) label_parts = _split_to_parts(data=label, is_matrix=False) - parts = [{'data': x, 'label': y} for (x, y) in zip(data_parts, label_parts)] + parts = [{"data": x, "label": y} for (x, y) in zip(data_parts, label_parts)] n_parts = len(parts) if sample_weight is not None: weight_parts = _split_to_parts(data=sample_weight, is_matrix=False) for i in range(n_parts): - parts[i]['weight'] = weight_parts[i] + parts[i]["weight"] = weight_parts[i] if group is not None: group_parts = _split_to_parts(data=group, is_matrix=False) for i in range(n_parts): - parts[i]['group'] = group_parts[i] + parts[i]["group"] = group_parts[i] if init_score is not None: init_score_parts = _split_to_parts(data=init_score, is_matrix=False) for i in range(n_parts): - parts[i]['init_score'] = init_score_parts[i] + parts[i]["init_score"] = init_score_parts[i] # evals_set will to be re-constructed into smaller lists of (X, y) tuples, where # X and y are each delayed sub-lists of original eval dask Collections. @@ -575,47 +566,16 @@ def _train( n_largest_eval_parts = max(x[0].npartitions for x in eval_set) eval_sets: Dict[ - int, - List[ - Union[ - _DatasetNames, - Tuple[ - List[Optional[_DaskMatrixLike]], - List[Optional[_DaskVectorLike]] - ] - ] - ] + int, List[Union[_DatasetNames, Tuple[List[Optional[_DaskMatrixLike]], List[Optional[_DaskVectorLike]]]]] ] = defaultdict(list) if eval_sample_weight: - eval_sample_weights: Dict[ - int, - List[ - Union[ - _DatasetNames, - List[Optional[_DaskVectorLike]] - ] - ] - ] = defaultdict(list) + eval_sample_weights: Dict[int, List[Union[_DatasetNames, List[Optional[_DaskVectorLike]]]]] = defaultdict( + list + ) if eval_group: - eval_groups: Dict[ - int, - List[ - Union[ - _DatasetNames, - List[Optional[_DaskVectorLike]] - ] - ] - ] = defaultdict(list) + eval_groups: Dict[int, List[Union[_DatasetNames, List[Optional[_DaskVectorLike]]]]] = defaultdict(list) if eval_init_score: - eval_init_scores: Dict[ - int, - List[ - Union[ - _DatasetNames, - List[Optional[_DaskMatrixLike]] - ] - ] - ] = defaultdict(list) + eval_init_scores: Dict[int, List[Union[_DatasetNames, List[Optional[_DaskMatrixLike]]]]] = defaultdict(list) for i, (X_eval, y_eval) in enumerate(eval_set): n_this_eval_parts = X_eval.npartitions @@ -704,13 +664,13 @@ def _train( # assign sub-eval_set components to worker parts. for parts_idx, e_set in eval_sets.items(): - parts[parts_idx]['eval_set'] = e_set + parts[parts_idx]["eval_set"] = e_set if eval_sample_weight: - parts[parts_idx]['eval_sample_weight'] = eval_sample_weights[parts_idx] + parts[parts_idx]["eval_sample_weight"] = eval_sample_weights[parts_idx] if eval_init_score: - parts[parts_idx]['eval_init_score'] = eval_init_scores[parts_idx] + parts[parts_idx]["eval_init_score"] = eval_init_scores[parts_idx] if eval_group: - parts[parts_idx]['eval_group'] = eval_groups[parts_idx] + parts[parts_idx]["eval_group"] = eval_groups[parts_idx] # Start computation in the background parts = list(map(delayed, parts)) @@ -718,7 +678,7 @@ def _train( wait(parts) for part in parts: - if part.status == 'error': # type: ignore + if part.status == "error": # type: ignore # trigger error locally return part # type: ignore[return-value] @@ -735,7 +695,7 @@ def _train( for worker in worker_map: has_eval_set = False for part in worker_map[worker]: - if 'eval_set' in part.result(): # type: ignore[attr-defined] + if "eval_set" in part.result(): # type: ignore[attr-defined] has_eval_set = True break @@ -747,31 +707,23 @@ def _train( # assign general validation set settings to fit kwargs. if eval_names: - kwargs['eval_names'] = eval_names + kwargs["eval_names"] = eval_names if eval_class_weight: - kwargs['eval_class_weight'] = eval_class_weight + kwargs["eval_class_weight"] = eval_class_weight if eval_metric: - kwargs['eval_metric'] = eval_metric + kwargs["eval_metric"] = eval_metric if eval_at: - kwargs['eval_at'] = eval_at + kwargs["eval_at"] = eval_at master_worker = next(iter(worker_map)) worker_ncores = client.ncores() # resolve aliases for network parameters and pop the result off params. # these values are added back in calls to `_train_part()` - params = _choose_param_value( - main_param_name="local_listen_port", - params=params, - default_value=12400 - ) + params = _choose_param_value(main_param_name="local_listen_port", params=params, default_value=12400) local_listen_port = params.pop("local_listen_port") - params = _choose_param_value( - main_param_name="machines", - params=params, - default_value=None - ) + params = _choose_param_value(main_param_name="machines", params=params, default_value=None) machines = params.pop("machines") # figure out network params @@ -779,10 +731,7 @@ def _train( worker_addresses = worker_map.keys() if machines is not None: _log_info("Using passed-in 'machines' parameter") - worker_address_to_port = _machines_to_worker_map( - machines=machines, - worker_addresses=worker_addresses - ) + worker_address_to_port = _machines_to_worker_map(machines=machines, worker_addresses=worker_addresses) else: if listen_port_in_params: _log_info("Using passed-in 'local_listen_port' for all workers") @@ -795,19 +744,16 @@ def _train( ) raise LightGBMError(msg) - worker_address_to_port = { - address: local_listen_port - for address in worker_addresses - } + worker_address_to_port = {address: local_listen_port for address in worker_addresses} else: _log_info("Finding random open ports for workers") - worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers(client, list(worker_map.keys())) + worker_to_socket_future, worker_address_to_port = _assign_open_ports_to_workers( + client, list(worker_map.keys()) + ) - machines = ','.join([ - f'{urlparse(worker_address).hostname}:{port}' - for worker_address, port - in worker_address_to_port.items() - ]) + machines = ",".join( + [f"{urlparse(worker_address).hostname}:{port}" for worker_address, port in worker_address_to_port.items()] + ) num_machines = len(worker_address_to_port) @@ -823,18 +769,18 @@ def _train( client.submit( _train_part, model_factory=model_factory, - params={**params, 'num_threads': worker_ncores[worker]}, + params={**params, "num_threads": worker_ncores[worker]}, list_of_parts=list_of_parts, machines=machines, local_listen_port=worker_address_to_port[worker], num_machines=num_machines, - time_out=params.get('time_out', 120), + time_out=params.get("time_out", 120), remote_socket=worker_to_socket_future.get(worker, None), return_model=(worker == master_worker), workers=[worker], allow_other_workers=False, pure=False, - **kwargs + **kwargs, ) for worker, list_of_parts in worker_map.items() ] @@ -848,14 +794,14 @@ def _train( # on the Dask cluster you're connected to and which workers have pieces of # the training data if not listen_port_in_params: - for param in _ConfigAliases.get('local_listen_port'): + for param in _ConfigAliases.get("local_listen_port"): model._other_params.pop(param, None) if not machines_in_params: - for param in _ConfigAliases.get('machines'): + for param in _ConfigAliases.get("machines"): model._other_params.pop(param, None) - for param in _ConfigAliases.get('num_machines', 'timeout'): + for param in _ConfigAliases.get("num_machines", "timeout"): model._other_params.pop(param, None) return model @@ -868,35 +814,24 @@ def _predict_part( pred_proba: bool, pred_leaf: bool, pred_contrib: bool, - **kwargs: Any + **kwargs: Any, ) -> _DaskPart: - result: _DaskPart if part.shape[0] == 0: result = np.array([]) elif pred_proba: result = model.predict_proba( - part, - raw_score=raw_score, - pred_leaf=pred_leaf, - pred_contrib=pred_contrib, - **kwargs + part, raw_score=raw_score, pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs ) else: - result = model.predict( - part, - raw_score=raw_score, - pred_leaf=pred_leaf, - pred_contrib=pred_contrib, - **kwargs - ) + result = model.predict(part, raw_score=raw_score, pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs) # dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series if isinstance(part, pd_DataFrame): if len(result.shape) == 2: result = pd_DataFrame(result, index=part.index) else: - result = pd_Series(result, index=part.index, name='predictions') + result = pd_Series(result, index=part.index, name="predictions") return result @@ -910,7 +845,7 @@ def _predict( pred_leaf: bool = False, pred_contrib: bool = False, dtype: _PredictionDtype = np.float32, - **kwargs: Any + **kwargs: Any, ) -> Union[dask_Array, List[dask_Array]]: """Inner predict routine. @@ -943,7 +878,7 @@ def _predict( If ``pred_contrib=True``, the feature contributions for each sample. """ if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): - raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') + raise LightGBMError("dask, pandas and scikit-learn are required for lightgbm.dask") if isinstance(data, dask_DataFrame): return data.map_partitions( _predict_part, @@ -952,19 +887,14 @@ def _predict( pred_proba=pred_proba, pred_leaf=pred_leaf, pred_contrib=pred_contrib, - **kwargs + **kwargs, ).values elif isinstance(data, dask_Array): # for multi-class classification with sparse matrices, pred_contrib predictions # are returned as a list of sparse matrices (one per class) num_classes = model._n_classes - if ( - num_classes > 2 - and pred_contrib - and isinstance(data._meta, ss.spmatrix) - ): - + if num_classes > 2 and pred_contrib and isinstance(data._meta, ss.spmatrix): predict_function = partial( _predict_part, model=model, @@ -972,7 +902,7 @@ def _predict( pred_proba=pred_proba, pred_leaf=False, pred_contrib=True, - **kwargs + **kwargs, ) delayed_chunks = data.to_delayed() @@ -997,18 +927,16 @@ def _extract(items: List[Any], i: int) -> Any: for j, partition in enumerate(preds.to_delayed()): for i in range(num_classes): part = dask_array_from_delayed( - value=_extract(partition, i), - shape=(nrows_per_chunk[j], num_cols), - meta=pred_meta + value=_extract(partition, i), shape=(nrows_per_chunk[j], num_cols), meta=pred_meta ) out[i].append(part) # by default, dask.array.concatenate() concatenates sparse arrays into a COO matrix # the code below is used instead to ensure that the sparse type is preserved during concatentation if isinstance(pred_meta, ss.csr_matrix): - concat_fn = partial(ss.vstack, format='csr') + concat_fn = partial(ss.vstack, format="csr") elif isinstance(pred_meta, ss.csc_matrix): - concat_fn = partial(ss.vstack, format='csc') + concat_fn = partial(ss.vstack, format="csc") else: concat_fn = ss.vstack @@ -1018,9 +946,7 @@ def _extract(items: List[Any], i: int) -> Any: for i in range(num_classes): out_arrays.append( dask_array_from_delayed( - value=delayed(concat_fn)(out[i]), - shape=(data.shape[0], num_cols), - meta=pred_meta + value=delayed(concat_fn)(out[i]), shape=(data.shape[0], num_cols), meta=pred_meta ) ) @@ -1042,7 +968,7 @@ def _extract(items: List[Any], i: int) -> Any: if len(pred_row.shape) > 1: chunks += (pred_row.shape[1],) else: - map_blocks_kwargs['drop_axis'] = 1 + map_blocks_kwargs["drop_axis"] = 1 return data.map_blocks( predict_fn, chunks=chunks, @@ -1051,11 +977,10 @@ def _extract(items: List[Any], i: int) -> Any: **map_blocks_kwargs, ) else: - raise TypeError(f'Data must be either Dask Array or Dask DataFrame. Got {type(data).__name__}.') + raise TypeError(f"Data must be either Dask Array or Dask DataFrame. Got {type(data).__name__}.") class _DaskLGBMModel: - @property def client_(self) -> Client: """:obj:`dask.distributed.Client`: Dask client. @@ -1064,7 +989,7 @@ def client_(self) -> Client: with ``model.set_params(client=client)``. """ if not getattr(self, "fitted_", False): - raise LGBMNotFittedError('Cannot access property client_ before calling fit().') + raise LGBMNotFittedError("Cannot access property client_ before calling fit().") return _get_dask_client(client=self.client) @@ -1093,12 +1018,12 @@ def _lgb_dask_fit( eval_group: Optional[List[_DaskVectorLike]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_at: Optional[Union[List[int], Tuple[int, ...]]] = None, - **kwargs: Any + **kwargs: Any, ) -> "_DaskLGBMModel": if not DASK_INSTALLED: - raise LightGBMError('dask is required for lightgbm.dask') + raise LightGBMError("dask is required for lightgbm.dask") if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): - raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') + raise LightGBMError("dask, pandas and scikit-learn are required for lightgbm.dask") params = self.get_params(True) # type: ignore[attr-defined] params.pop("client", None) @@ -1120,7 +1045,7 @@ def _lgb_dask_fit( eval_group=eval_group, eval_metric=eval_metric, eval_at=eval_at, - **kwargs + **kwargs, ) self.set_params(**model.get_params()) # type: ignore[attr-defined] @@ -1137,7 +1062,9 @@ def _lgb_dask_to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel: return model @staticmethod - def _lgb_dask_copy_extra_params(source: Union["_DaskLGBMModel", LGBMModel], dest: Union["_DaskLGBMModel", LGBMModel]) -> None: + def _lgb_dask_copy_extra_params( + source: Union["_DaskLGBMModel", LGBMModel], dest: Union["_DaskLGBMModel", LGBMModel] + ) -> None: params = source.get_params() # type: ignore[union-attr] attributes = source.__dict__ extra_param_names = set(attributes.keys()).difference(params.keys()) @@ -1150,7 +1077,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): def __init__( self, - boosting_type: str = 'gbdt', + boosting_type: str = "gbdt", num_leaves: int = 31, max_depth: int = -1, learning_rate: float = 0.1, @@ -1158,19 +1085,19 @@ def __init__( subsample_for_bin: int = 200000, objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[dict, str]] = None, - min_split_gain: float = 0., + min_split_gain: float = 0.0, min_child_weight: float = 1e-3, min_child_samples: int = 20, - subsample: float = 1., + subsample: float = 1.0, subsample_freq: int = 0, - colsample_bytree: float = 1., - reg_alpha: float = 0., - reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + colsample_bytree: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 0.0, + random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None, n_jobs: Optional[int] = None, - importance_type: str = 'split', + importance_type: str = "split", client: Optional[Client] = None, - **kwargs: Any + **kwargs: Any, ): """Docstring is inherited from the lightgbm.LGBMClassifier.__init__.""" self.client = client @@ -1194,11 +1121,11 @@ def __init__( random_state=random_state, n_jobs=n_jobs, importance_type=importance_type, - **kwargs + **kwargs, ) _base_doc = LGBMClassifier.__init__.__doc__ - _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore + _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs") # type: ignore __init__.__doc__ = f""" {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. @@ -1220,7 +1147,7 @@ def fit( # type: ignore[override] eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_init_score: Optional[List[_DaskCollection]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, - **kwargs: Any + **kwargs: Any, ) -> "DaskLGBMClassifier": """Docstring is inherited from the lightgbm.LGBMClassifier.fit.""" self._lgb_dask_fit( @@ -1235,7 +1162,7 @@ def fit( # type: ignore[override] eval_class_weight=eval_class_weight, eval_init_score=eval_init_score, eval_metric=eval_metric, - **kwargs + **kwargs, ) return self @@ -1247,15 +1174,13 @@ def fit( # type: ignore[override] group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None)", - eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" + eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)", ) # DaskLGBMClassifier does not support group, eval_group. - _base_doc = (_base_doc[:_base_doc.find('group :')] - + _base_doc[_base_doc.find('eval_set :'):]) + _base_doc = _base_doc[: _base_doc.find("group :")] + _base_doc[_base_doc.find("eval_set :") :] - _base_doc = (_base_doc[:_base_doc.find('eval_group :')] - + _base_doc[_base_doc.find('eval_metric :'):]) + _base_doc = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :] # DaskLGBMClassifier support for callbacks and init_model is not tested fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs @@ -1278,7 +1203,7 @@ def predict( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMClassifier.predict.""" return _predict( @@ -1292,7 +1217,7 @@ def predict( pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **kwargs + **kwargs, ) predict.__doc__ = _lgbmmodel_doc_predict.format( @@ -1301,7 +1226,7 @@ def predict( output_name="predicted_result", predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]", ) def predict_proba( @@ -1313,7 +1238,7 @@ def predict_proba( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMClassifier.predict_proba.""" return _predict( @@ -1327,7 +1252,7 @@ def predict_proba( pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **kwargs + **kwargs, ) predict_proba.__doc__ = _lgbmmodel_doc_predict.format( @@ -1336,7 +1261,7 @@ def predict_proba( output_name="predicted_probability", predicted_result_shape="Dask Array of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or (if multi-class and using sparse inputs) a list of ``n_classes`` Dask Arrays of shape = [n_samples, n_features + 1]", ) def to_local(self) -> LGBMClassifier: @@ -1355,7 +1280,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel): def __init__( self, - boosting_type: str = 'gbdt', + boosting_type: str = "gbdt", num_leaves: int = 31, max_depth: int = -1, learning_rate: float = 0.1, @@ -1363,19 +1288,19 @@ def __init__( subsample_for_bin: int = 200000, objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[dict, str]] = None, - min_split_gain: float = 0., + min_split_gain: float = 0.0, min_child_weight: float = 1e-3, min_child_samples: int = 20, - subsample: float = 1., + subsample: float = 1.0, subsample_freq: int = 0, - colsample_bytree: float = 1., - reg_alpha: float = 0., - reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + colsample_bytree: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 0.0, + random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None, n_jobs: Optional[int] = None, - importance_type: str = 'split', + importance_type: str = "split", client: Optional[Client] = None, - **kwargs: Any + **kwargs: Any, ): """Docstring is inherited from the lightgbm.LGBMRegressor.__init__.""" self.client = client @@ -1399,11 +1324,11 @@ def __init__( random_state=random_state, n_jobs=n_jobs, importance_type=importance_type, - **kwargs + **kwargs, ) _base_doc = LGBMRegressor.__init__.__doc__ - _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore + _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs") # type: ignore __init__.__doc__ = f""" {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. @@ -1424,7 +1349,7 @@ def fit( # type: ignore[override] eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskVectorLike]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, - **kwargs: Any + **kwargs: Any, ) -> "DaskLGBMRegressor": """Docstring is inherited from the lightgbm.LGBMRegressor.fit.""" self._lgb_dask_fit( @@ -1438,7 +1363,7 @@ def fit( # type: ignore[override] eval_sample_weight=eval_sample_weight, eval_init_score=eval_init_score, eval_metric=eval_metric, - **kwargs + **kwargs, ) return self @@ -1450,18 +1375,15 @@ def fit( # type: ignore[override] group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)", - eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" + eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)", ) # DaskLGBMRegressor does not support group, eval_class_weight, eval_group. - _base_doc = (_base_doc[:_base_doc.find('group :')] - + _base_doc[_base_doc.find('eval_set :'):]) + _base_doc = _base_doc[: _base_doc.find("group :")] + _base_doc[_base_doc.find("eval_set :") :] - _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')] - + _base_doc[_base_doc.find('eval_init_score :'):]) + _base_doc = _base_doc[: _base_doc.find("eval_class_weight :")] + _base_doc[_base_doc.find("eval_init_score :") :] - _base_doc = (_base_doc[:_base_doc.find('eval_group :')] - + _base_doc[_base_doc.find('eval_metric :'):]) + _base_doc = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :] # DaskLGBMRegressor support for callbacks and init_model is not tested fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs @@ -1484,7 +1406,7 @@ def predict( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMRegressor.predict.""" return _predict( @@ -1497,7 +1419,7 @@ def predict( pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **kwargs + **kwargs, ) predict.__doc__ = _lgbmmodel_doc_predict.format( @@ -1506,7 +1428,7 @@ def predict( output_name="predicted_result", predicted_result_shape="Dask Array of shape = [n_samples]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]", ) def to_local(self) -> LGBMRegressor: @@ -1525,7 +1447,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel): def __init__( self, - boosting_type: str = 'gbdt', + boosting_type: str = "gbdt", num_leaves: int = 31, max_depth: int = -1, learning_rate: float = 0.1, @@ -1533,19 +1455,19 @@ def __init__( subsample_for_bin: int = 200000, objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[dict, str]] = None, - min_split_gain: float = 0., + min_split_gain: float = 0.0, min_child_weight: float = 1e-3, min_child_samples: int = 20, - subsample: float = 1., + subsample: float = 1.0, subsample_freq: int = 0, - colsample_bytree: float = 1., - reg_alpha: float = 0., - reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + colsample_bytree: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 0.0, + random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None, n_jobs: Optional[int] = None, - importance_type: str = 'split', + importance_type: str = "split", client: Optional[Client] = None, - **kwargs: Any + **kwargs: Any, ): """Docstring is inherited from the lightgbm.LGBMRanker.__init__.""" self.client = client @@ -1569,11 +1491,11 @@ def __init__( random_state=random_state, n_jobs=n_jobs, importance_type=importance_type, - **kwargs + **kwargs, ) _base_doc = LGBMRanker.__init__.__doc__ - _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs') # type: ignore + _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition("**kwargs") # type: ignore __init__.__doc__ = f""" {_before_kwargs}client : dask.distributed.Client or None, optional (default=None) {' ':4}Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. The Dask client used by this class will not be saved if the model object is pickled. @@ -1597,7 +1519,7 @@ def fit( # type: ignore[override] eval_group: Optional[List[_DaskVectorLike]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5), - **kwargs: Any + **kwargs: Any, ) -> "DaskLGBMRanker": """Docstring is inherited from the lightgbm.LGBMRanker.fit.""" self._lgb_dask_fit( @@ -1614,7 +1536,7 @@ def fit( # type: ignore[override] eval_group=eval_group, eval_metric=eval_metric, eval_at=eval_at, - **kwargs + **kwargs, ) return self @@ -1626,17 +1548,18 @@ def fit( # type: ignore[override] group_shape="Dask Array or Dask Series or None, optional (default=None)", eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)", - eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" + eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)", ) # DaskLGBMRanker does not support eval_class_weight or early stopping - _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')] - + _base_doc[_base_doc.find('eval_init_score :'):]) + _base_doc = _base_doc[: _base_doc.find("eval_class_weight :")] + _base_doc[_base_doc.find("eval_init_score :") :] - _base_doc = (_base_doc[:_base_doc.find('feature_name :')] - + "eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))\n" - + f"{' ':8}The evaluation positions of the specified metric.\n" - + f"{' ':4}{_base_doc[_base_doc.find('feature_name :'):]}") + _base_doc = ( + _base_doc[: _base_doc.find("feature_name :")] + + "eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))\n" + + f"{' ':8}The evaluation positions of the specified metric.\n" + + f"{' ':4}{_base_doc[_base_doc.find('feature_name :'):]}" + ) # DaskLGBMRanker support for callbacks and init_model is not tested fit.__doc__ = f"""{_base_doc[:_base_doc.find('callbacks :')]}**kwargs @@ -1659,7 +1582,7 @@ def predict( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ) -> dask_Array: """Docstring is inherited from the lightgbm.LGBMRanker.predict.""" return _predict( @@ -1672,7 +1595,7 @@ def predict( pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **kwargs + **kwargs, ) predict.__doc__ = _lgbmmodel_doc_predict.format( @@ -1681,7 +1604,7 @@ def predict( output_name="predicted_result", predicted_result_shape="Dask Array of shape = [n_samples]", X_leaves_shape="Dask Array of shape = [n_samples, n_trees]", - X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]" + X_SHAP_values_shape="Dask Array of shape = [n_samples, n_features + 1]", ) def to_local(self) -> LGBMRanker: diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index e1779f0723be..0e4f2b0a5858 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -28,9 +28,9 @@ from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold __all__ = [ - 'cv', - 'CVBooster', - 'train', + "cv", + "CVBooster", + "train", ] @@ -39,16 +39,10 @@ [np.ndarray, Dataset], _LGBM_EvalFunctionResultType, ], - Callable[ - [np.ndarray, Dataset], - List[_LGBM_EvalFunctionResultType] - ], + Callable[[np.ndarray, Dataset], List[_LGBM_EvalFunctionResultType]], ] -_LGBM_PreprocFunction = Callable[ - [Dataset, Dataset, Dict[str, Any]], - Tuple[Dataset, Dataset, Dict[str, Any]] -] +_LGBM_PreprocFunction = Callable[[Dataset, Dataset, Dict[str, Any]], Tuple[Dataset, Dataset, Dict[str, Any]]] def train( @@ -59,10 +53,10 @@ def train( valid_names: Optional[List[str]] = None, feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, init_model: Optional[Union[str, Path, Booster]] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", keep_training_booster: bool = False, - callbacks: Optional[List[Callable]] = None + callbacks: Optional[List[Callable]] = None, ) -> Booster: """Perform the training with given parameters. @@ -168,51 +162,36 @@ def train( # create predictor first params = copy.deepcopy(params) - params = _choose_param_value( - main_param_name='objective', - params=params, - default_value=None - ) + params = _choose_param_value(main_param_name="objective", params=params, default_value=None) fobj: Optional[_LGBM_CustomObjectiveFunction] = None if callable(params["objective"]): fobj = params["objective"] - params["objective"] = 'none' + params["objective"] = "none" for alias in _ConfigAliases.get("num_iterations"): if alias in params: num_boost_round = params.pop(alias) _log_warning(f"Found `{alias}` in params. Will use it instead of argument") params["num_iterations"] = num_boost_round # setting early stopping via global params should be possible - params = _choose_param_value( - main_param_name="early_stopping_round", - params=params, - default_value=None - ) + params = _choose_param_value(main_param_name="early_stopping_round", params=params, default_value=None) if params["early_stopping_round"] is None: params.pop("early_stopping_round") - first_metric_only = params.get('first_metric_only', False) + first_metric_only = params.get("first_metric_only", False) predictor: Optional[_InnerPredictor] = None if isinstance(init_model, (str, Path)): - predictor = _InnerPredictor.from_model_file( - model_file=init_model, - pred_parameter=params - ) + predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params) elif isinstance(init_model, Booster): - predictor = _InnerPredictor.from_booster( - booster=init_model, - pred_parameter=dict(init_model.params, **params) - ) + predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params)) if predictor is not None: init_iteration = predictor.current_iteration() else: init_iteration = 0 - train_set._update_params(params) \ - ._set_predictor(predictor) \ - .set_feature_name(feature_name) \ - .set_categorical_feature(categorical_feature) + train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature( + categorical_feature + ) is_valid_contain_train = False train_data_name = "training" @@ -234,13 +213,13 @@ def train( if valid_names is not None and len(valid_names) > i: name_valid_sets.append(valid_names[i]) else: - name_valid_sets.append(f'valid_{i}') + name_valid_sets.append(f"valid_{i}") # process callbacks if callbacks is None: callbacks_set = set() else: for i, cb in enumerate(callbacks): - cb.__dict__.setdefault('order', i - len(callbacks)) + cb.__dict__.setdefault("order", i - len(callbacks)) callbacks_set = set(callbacks) if "early_stopping_round" in params: @@ -248,18 +227,17 @@ def train( callback.early_stopping( stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] first_metric_only=first_metric_only, - verbose=_choose_param_value( - main_param_name="verbosity", - params=params, - default_value=1 - ).pop("verbosity") > 0 + verbose=_choose_param_value(main_param_name="verbosity", params=params, default_value=1).pop( + "verbosity" + ) + > 0, ) ) - callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)} + callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)} callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set - callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter('order')) - callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter('order')) + callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order")) + callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order")) # construct booster try: @@ -277,12 +255,16 @@ def train( # start training for i in range(init_iteration, init_iteration + num_boost_round): for cb in callbacks_before_iter: - cb(callback.CallbackEnv(model=booster, - params=params, - iteration=i, - begin_iteration=init_iteration, - end_iteration=init_iteration + num_boost_round, - evaluation_result_list=None)) + cb( + callback.CallbackEnv( + model=booster, + params=params, + iteration=i, + begin_iteration=init_iteration, + end_iteration=init_iteration + num_boost_round, + evaluation_result_list=None, + ) + ) booster.update(fobj=fobj) @@ -294,12 +276,16 @@ def train( evaluation_result_list.extend(booster.eval_valid(feval)) try: for cb in callbacks_after_iter: - cb(callback.CallbackEnv(model=booster, - params=params, - iteration=i, - begin_iteration=init_iteration, - end_iteration=init_iteration + num_boost_round, - evaluation_result_list=evaluation_result_list)) + cb( + callback.CallbackEnv( + model=booster, + params=params, + iteration=i, + begin_iteration=init_iteration, + end_iteration=init_iteration + num_boost_round, + evaluation_result_list=evaluation_result_list, + ) + ) except callback.EarlyStopException as earlyStopException: booster.best_iteration = earlyStopException.best_iteration + 1 evaluation_result_list = earlyStopException.best_score @@ -332,10 +318,7 @@ class CVBooster: The best iteration of fitted model. """ - def __init__( - self, - model_file: Optional[Union[str, Path]] = None - ): + def __init__(self, model_file: Optional[Union[str, Path]] = None): """Initialize the CVBooster. Parameters @@ -361,18 +344,23 @@ def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importanc """Serialize CVBooster to dict.""" models_str = [] for booster in self.boosters: - models_str.append(booster.model_to_string(num_iteration=num_iteration, start_iteration=start_iteration, - importance_type=importance_type)) + models_str.append( + booster.model_to_string( + num_iteration=num_iteration, start_iteration=start_iteration, importance_type=importance_type + ) + ) return {"boosters": models_str, "best_iteration": self.best_iteration} def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]: """Redirect methods call of CVBooster.""" + def handler_function(*args: Any, **kwargs: Any) -> List[Any]: """Call methods with each booster, and concatenate their results.""" ret = [] for booster in self.boosters: ret.append(getattr(booster, name)(*args, **kwargs)) return ret + return handler_function def __getstate__(self) -> Dict[str, Any]: @@ -398,10 +386,7 @@ def model_from_string(self, model_str: str) -> "CVBooster": return self def model_to_string( - self, - num_iteration: Optional[int] = None, - start_iteration: int = 0, - importance_type: str = 'split' + self, num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = "split" ) -> str: """Save CVBooster to JSON string. @@ -430,7 +415,7 @@ def save_model( filename: Union[str, Path], num_iteration: Optional[int] = None, start_iteration: int = 0, - importance_type: str = 'split' + importance_type: str = "split", ) -> "CVBooster": """Save CVBooster to a file as JSON text. @@ -469,16 +454,18 @@ def _make_n_folds( fpreproc: Optional[_LGBM_PreprocFunction], stratified: bool, shuffle: bool, - eval_train_metric: bool + eval_train_metric: bool, ) -> CVBooster: """Make a n-fold list of Booster from random indices.""" full_data = full_data.construct() num_data = full_data.num_data() if folds is not None: - if not hasattr(folds, '__iter__') and not hasattr(folds, 'split'): - raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx) tuples " - "or scikit-learn splitter object with split method") - if hasattr(folds, 'split'): + if not hasattr(folds, "__iter__") and not hasattr(folds, "split"): + raise AttributeError( + "folds should be a generator or iterator of (train_idx, test_idx) tuples " + "or scikit-learn splitter object with split method" + ) + if hasattr(folds, "split"): group_info = full_data.get_group() if group_info is not None: group_info = np.array(group_info, dtype=np.int32, copy=False) @@ -487,11 +474,13 @@ def _make_n_folds( flatted_group = np.zeros(num_data, dtype=np.int32) folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group) else: - if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg", - "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"} - for obj_alias in _ConfigAliases.get("objective")): + if any( + params.get(obj_alias, "") + in {"lambdarank", "rank_xendcg", "xendcg", "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"} + for obj_alias in _ConfigAliases.get("objective") + ): if not SKLEARN_INSTALLED: - raise LightGBMError('scikit-learn is required for ranking cv') + raise LightGBMError("scikit-learn is required for ranking cv") # ranking task, split according to groups group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False) flatted_group = np.repeat(range(len(group_info)), repeats=group_info) @@ -499,7 +488,7 @@ def _make_n_folds( folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group) elif stratified: if not SKLEARN_INSTALLED: - raise LightGBMError('scikit-learn is required for stratified cv') + raise LightGBMError("scikit-learn is required for stratified cv") skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed) folds = skf.split(X=np.empty(num_data), y=full_data.get_label()) else: @@ -508,7 +497,7 @@ def _make_n_folds( else: randidx = np.arange(num_data) kstep = int(num_data / nfold) - test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)] + test_id = [randidx[i : i + kstep] for i in range(0, num_data, kstep)] train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)] folds = zip(train_id, test_id) @@ -523,14 +512,14 @@ def _make_n_folds( tparam = params booster_for_fold = Booster(tparam, train_set) if eval_train_metric: - booster_for_fold.add_valid(train_set, 'train') - booster_for_fold.add_valid(valid_set, 'valid') + booster_for_fold.add_valid(train_set, "train") + booster_for_fold.add_valid(valid_set, "valid") ret.boosters.append(booster_for_fold) return ret def _agg_cv_result( - raw_results: List[List[_LGBM_BoosterEvalMethodResultType]] + raw_results: List[List[_LGBM_BoosterEvalMethodResultType]], ) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]: """Aggregate cross-validation results.""" cvmap: Dict[str, List[float]] = OrderedDict() @@ -541,7 +530,7 @@ def _agg_cv_result( metric_type[key] = one_line[3] cvmap.setdefault(key, []) cvmap[key].append(one_line[2]) - return [('cv_agg', k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] + return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()] def cv( @@ -555,13 +544,13 @@ def cv( metrics: Optional[Union[str, List[str]]] = None, feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None, init_model: Optional[Union[str, Path, Booster]] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", fpreproc: Optional[_LGBM_PreprocFunction] = None, seed: int = 0, callbacks: Optional[List[Callable]] = None, eval_train_metric: bool = False, - return_cvbooster: bool = False + return_cvbooster: bool = False, ) -> Dict[str, Union[List[float], CVBooster]]: """Perform the cross-validation with given parameters. @@ -682,65 +671,57 @@ def cv( raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.") params = copy.deepcopy(params) - params = _choose_param_value( - main_param_name='objective', - params=params, - default_value=None - ) + params = _choose_param_value(main_param_name="objective", params=params, default_value=None) fobj: Optional[_LGBM_CustomObjectiveFunction] = None if callable(params["objective"]): fobj = params["objective"] - params["objective"] = 'none' + params["objective"] = "none" for alias in _ConfigAliases.get("num_iterations"): if alias in params: _log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument") num_boost_round = params.pop(alias) params["num_iterations"] = num_boost_round # setting early stopping via global params should be possible - params = _choose_param_value( - main_param_name="early_stopping_round", - params=params, - default_value=None - ) + params = _choose_param_value(main_param_name="early_stopping_round", params=params, default_value=None) if params["early_stopping_round"] is None: params.pop("early_stopping_round") - first_metric_only = params.get('first_metric_only', False) + first_metric_only = params.get("first_metric_only", False) if isinstance(init_model, (str, Path)): - predictor = _InnerPredictor.from_model_file( - model_file=init_model, - pred_parameter=params - ) + predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params) elif isinstance(init_model, Booster): - predictor = _InnerPredictor.from_booster( - booster=init_model, - pred_parameter=dict(init_model.params, **params) - ) + predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params)) else: predictor = None if metrics is not None: for metric_alias in _ConfigAliases.get("metric"): params.pop(metric_alias, None) - params['metric'] = metrics + params["metric"] = metrics - train_set._update_params(params) \ - ._set_predictor(predictor) \ - .set_feature_name(feature_name) \ - .set_categorical_feature(categorical_feature) + train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature( + categorical_feature + ) results = defaultdict(list) - cvfolds = _make_n_folds(full_data=train_set, folds=folds, nfold=nfold, - params=params, seed=seed, fpreproc=fpreproc, - stratified=stratified, shuffle=shuffle, - eval_train_metric=eval_train_metric) + cvfolds = _make_n_folds( + full_data=train_set, + folds=folds, + nfold=nfold, + params=params, + seed=seed, + fpreproc=fpreproc, + stratified=stratified, + shuffle=shuffle, + eval_train_metric=eval_train_metric, + ) # setup callbacks if callbacks is None: callbacks_set = set() else: for i, cb in enumerate(callbacks): - cb.__dict__.setdefault('order', i - len(callbacks)) + cb.__dict__.setdefault("order", i - len(callbacks)) callbacks_set = set(callbacks) if "early_stopping_round" in params: @@ -748,49 +729,56 @@ def cv( callback.early_stopping( stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] first_metric_only=first_metric_only, - verbose=_choose_param_value( - main_param_name="verbosity", - params=params, - default_value=1 - ).pop("verbosity") > 0 + verbose=_choose_param_value(main_param_name="verbosity", params=params, default_value=1).pop( + "verbosity" + ) + > 0, ) ) - callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, 'before_iteration', False)} + callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)} callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set - callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter('order')) - callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter('order')) + callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order")) + callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order")) for i in range(num_boost_round): for cb in callbacks_before_iter: - cb(callback.CallbackEnv(model=cvfolds, - params=params, - iteration=i, - begin_iteration=0, - end_iteration=num_boost_round, - evaluation_result_list=None)) + cb( + callback.CallbackEnv( + model=cvfolds, + params=params, + iteration=i, + begin_iteration=0, + end_iteration=num_boost_round, + evaluation_result_list=None, + ) + ) cvfolds.update(fobj=fobj) # type: ignore[call-arg] res = _agg_cv_result(cvfolds.eval_valid(feval)) # type: ignore[call-arg] for _, key, mean, _, std in res: - results[f'{key}-mean'].append(mean) - results[f'{key}-stdv'].append(std) + results[f"{key}-mean"].append(mean) + results[f"{key}-stdv"].append(std) try: for cb in callbacks_after_iter: - cb(callback.CallbackEnv(model=cvfolds, - params=params, - iteration=i, - begin_iteration=0, - end_iteration=num_boost_round, - evaluation_result_list=res)) + cb( + callback.CallbackEnv( + model=cvfolds, + params=params, + iteration=i, + begin_iteration=0, + end_iteration=num_boost_round, + evaluation_result_list=res, + ) + ) except callback.EarlyStopException as earlyStopException: cvfolds.best_iteration = earlyStopException.best_iteration + 1 for bst in cvfolds.boosters: bst.best_iteration = cvfolds.best_iteration for k in results: - results[k] = results[k][:cvfolds.best_iteration] + results[k] = results[k][: cvfolds.best_iteration] break if return_cvbooster: - results['cvbooster'] = cvfolds # type: ignore[assignment] + results["cvbooster"] = cvfolds # type: ignore[assignment] return dict(results) diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py index 21222228b0c2..df540f2ba8fd 100644 --- a/python-package/lightgbm/libpath.py +++ b/python-package/lightgbm/libpath.py @@ -16,17 +16,15 @@ def find_lib_path() -> List[str]: List of all found library paths to LightGBM. """ curr_path = Path(__file__).absolute() - dll_path = [curr_path.parents[1], - curr_path.parents[0] / 'bin', - curr_path.parents[0] / 'lib'] - if system() in ('Windows', 'Microsoft'): - dll_path.append(curr_path.parents[1] / 'Release') - dll_path.append(curr_path.parents[1] / 'windows' / 'x64' / 'DLL') - dll_path = [p / 'lib_lightgbm.dll' for p in dll_path] + dll_path = [curr_path.parents[1], curr_path.parents[0] / "bin", curr_path.parents[0] / "lib"] + if system() in ("Windows", "Microsoft"): + dll_path.append(curr_path.parents[1] / "Release") + dll_path.append(curr_path.parents[1] / "windows" / "x64" / "DLL") + dll_path = [p / "lib_lightgbm.dll" for p in dll_path] else: - dll_path = [p / 'lib_lightgbm.so' for p in dll_path] + dll_path = [p / "lib_lightgbm.so" for p in dll_path] lib_path = [str(p) for p in dll_path if p.is_file()] if not lib_path: - dll_path_joined = '\n'.join(map(str, dll_path)) - raise Exception(f'Cannot find lightgbm library file in following paths:\n{dll_path_joined}') + dll_path_joined = "\n".join(map(str, dll_path)) + raise Exception(f"Cannot find lightgbm library file in following paths:\n{dll_path_joined}") return lib_path diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index 85b245c187ef..9737fd4d26f8 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -12,11 +12,11 @@ from .sklearn import LGBMModel __all__ = [ - 'create_tree_digraph', - 'plot_importance', - 'plot_metric', - 'plot_split_value_histogram', - 'plot_tree', + "create_tree_digraph", + "plot_importance", + "plot_metric", + "plot_split_value_histogram", + "plot_tree", ] @@ -27,9 +27,7 @@ def _check_not_tuple_of_2_elements(obj: Any, obj_name: str) -> None: def _float2str(value: float, precision: Optional[int]) -> str: - return (f"{value:.{precision}f}" - if precision is not None and not isinstance(value, str) - else str(value)) + return f"{value:.{precision}f}" if precision is not None and not isinstance(value, str) else str(value) def plot_importance( @@ -38,17 +36,17 @@ def plot_importance( height: float = 0.2, xlim: Optional[Tuple[float, float]] = None, ylim: Optional[Tuple[float, float]] = None, - title: Optional[str] = 'Feature importance', - xlabel: Optional[str] = 'Feature importance', - ylabel: Optional[str] = 'Features', - importance_type: str = 'auto', + title: Optional[str] = "Feature importance", + xlabel: Optional[str] = "Feature importance", + ylabel: Optional[str] = "Features", + importance_type: str = "auto", max_num_features: Optional[int] = None, ignore_zero: bool = True, figsize: Optional[Tuple[float, float]] = None, dpi: Optional[int] = None, grid: bool = True, precision: Optional[int] = 3, - **kwargs: Any + **kwargs: Any, ) -> Any: """Plot model's feature importances. @@ -104,7 +102,7 @@ def plot_importance( if MATPLOTLIB_INSTALLED: import matplotlib.pyplot as plt else: - raise ImportError('You must install matplotlib and restart your session to plot importance.') + raise ImportError("You must install matplotlib and restart your session to plot importance.") if isinstance(booster, LGBMModel): if importance_type == "auto": @@ -114,7 +112,7 @@ def plot_importance( if importance_type == "auto": importance_type = "split" else: - raise TypeError('booster must be Booster or LGBMModel.') + raise TypeError("booster must be Booster or LGBMModel.") importance = booster.feature_importance(importance_type=importance_type) feature_name = booster.feature_name() @@ -131,28 +129,26 @@ def plot_importance( if ax is None: if figsize is not None: - _check_not_tuple_of_2_elements(figsize, 'figsize') + _check_not_tuple_of_2_elements(figsize, "figsize") _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) ylocs = np.arange(len(values)) - ax.barh(ylocs, values, align='center', height=height, **kwargs) + ax.barh(ylocs, values, align="center", height=height, **kwargs) for x, y in zip(values, ylocs): - ax.text(x + 1, y, - _float2str(x, precision) if importance_type == 'gain' else x, - va='center') + ax.text(x + 1, y, _float2str(x, precision) if importance_type == "gain" else x, va="center") ax.set_yticks(ylocs) ax.set_yticklabels(labels) if xlim is not None: - _check_not_tuple_of_2_elements(xlim, 'xlim') + _check_not_tuple_of_2_elements(xlim, "xlim") else: xlim = (0, max(values) * 1.1) ax.set_xlim(xlim) if ylim is not None: - _check_not_tuple_of_2_elements(ylim, 'ylim') + _check_not_tuple_of_2_elements(ylim, "ylim") else: ylim = (-1, len(values)) ax.set_ylim(ylim) @@ -160,7 +156,7 @@ def plot_importance( if title is not None: ax.set_title(title) if xlabel is not None: - xlabel = xlabel.replace('@importance_type@', importance_type) + xlabel = xlabel.replace("@importance_type@", importance_type) ax.set_xlabel(xlabel) if ylabel is not None: ax.set_ylabel(ylabel) @@ -176,13 +172,13 @@ def plot_split_value_histogram( width_coef: float = 0.8, xlim: Optional[Tuple[float, float]] = None, ylim: Optional[Tuple[float, float]] = None, - title: Optional[str] = 'Split value histogram for feature with @index/name@ @feature@', - xlabel: Optional[str] = 'Feature split value', - ylabel: Optional[str] = 'Count', + title: Optional[str] = "Split value histogram for feature with @index/name@ @feature@", + xlabel: Optional[str] = "Feature split value", + ylabel: Optional[str] = "Count", figsize: Optional[Tuple[float, float]] = None, dpi: Optional[int] = None, grid: bool = True, - **kwargs: Any + **kwargs: Any, ) -> Any: """Plot split value histogram for the specified feature of the model. @@ -238,29 +234,28 @@ def plot_split_value_histogram( import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator else: - raise ImportError('You must install matplotlib and restart your session to plot split value histogram.') + raise ImportError("You must install matplotlib and restart your session to plot split value histogram.") if isinstance(booster, LGBMModel): booster = booster.booster_ elif not isinstance(booster, Booster): - raise TypeError('booster must be Booster or LGBMModel.') + raise TypeError("booster must be Booster or LGBMModel.") hist, split_bins = booster.get_split_value_histogram(feature=feature, bins=bins, xgboost_style=False) if np.count_nonzero(hist) == 0: - raise ValueError('Cannot plot split value histogram, ' - f'because feature {feature} was not used in splitting') + raise ValueError("Cannot plot split value histogram, " f"because feature {feature} was not used in splitting") width = width_coef * (split_bins[1] - split_bins[0]) centred = (split_bins[:-1] + split_bins[1:]) / 2 if ax is None: if figsize is not None: - _check_not_tuple_of_2_elements(figsize, 'figsize') + _check_not_tuple_of_2_elements(figsize, "figsize") _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) - ax.bar(centred, hist, align='center', width=width, **kwargs) + ax.bar(centred, hist, align="center", width=width, **kwargs) if xlim is not None: - _check_not_tuple_of_2_elements(xlim, 'xlim') + _check_not_tuple_of_2_elements(xlim, "xlim") else: range_result = split_bins[-1] - split_bins[0] xlim = (split_bins[0] - range_result * 0.2, split_bins[-1] + range_result * 0.2) @@ -268,14 +263,14 @@ def plot_split_value_histogram( ax.yaxis.set_major_locator(MaxNLocator(integer=True)) if ylim is not None: - _check_not_tuple_of_2_elements(ylim, 'ylim') + _check_not_tuple_of_2_elements(ylim, "ylim") else: ylim = (0, max(hist) * 1.1) ax.set_ylim(ylim) if title is not None: - title = title.replace('@feature@', str(feature)) - title = title.replace('@index/name@', ('name' if isinstance(feature, str) else 'index')) + title = title.replace("@feature@", str(feature)) + title = title.replace("@index/name@", ("name" if isinstance(feature, str) else "index")) ax.set_title(title) if xlabel is not None: ax.set_xlabel(xlabel) @@ -292,12 +287,12 @@ def plot_metric( ax=None, xlim: Optional[Tuple[float, float]] = None, ylim: Optional[Tuple[float, float]] = None, - title: Optional[str] = 'Metric during training', - xlabel: Optional[str] = 'Iterations', - ylabel: Optional[str] = '@metric@', + title: Optional[str] = "Metric during training", + xlabel: Optional[str] = "Iterations", + ylabel: Optional[str] = "@metric@", figsize: Optional[Tuple[float, float]] = None, dpi: Optional[int] = None, - grid: bool = True + grid: bool = True, ) -> Any: """Plot one metric during training. @@ -345,31 +340,33 @@ def plot_metric( if MATPLOTLIB_INSTALLED: import matplotlib.pyplot as plt else: - raise ImportError('You must install matplotlib and restart your session to plot metric.') + raise ImportError("You must install matplotlib and restart your session to plot metric.") if isinstance(booster, LGBMModel): eval_results = deepcopy(booster.evals_result_) elif isinstance(booster, dict): eval_results = deepcopy(booster) elif isinstance(booster, Booster): - raise TypeError("booster must be dict or LGBMModel. To use plot_metric with Booster type, first record the metrics using record_evaluation callback then pass that to plot_metric as argument `booster`") + raise TypeError( + "booster must be dict or LGBMModel. To use plot_metric with Booster type, first record the metrics using record_evaluation callback then pass that to plot_metric as argument `booster`" + ) else: - raise TypeError('booster must be dict or LGBMModel.') + raise TypeError("booster must be dict or LGBMModel.") num_data = len(eval_results) if not num_data: - raise ValueError('eval results cannot be empty.') + raise ValueError("eval results cannot be empty.") if ax is None: if figsize is not None: - _check_not_tuple_of_2_elements(figsize, 'figsize') + _check_not_tuple_of_2_elements(figsize, "figsize") _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) if dataset_names is None: dataset_names_iter = iter(eval_results.keys()) elif not isinstance(dataset_names, (list, tuple, set)) or not dataset_names: - raise ValueError('dataset_names should be iterable and cannot be empty') + raise ValueError("dataset_names should be iterable and cannot be empty") else: dataset_names_iter = iter(dataset_names) @@ -382,7 +379,7 @@ def plot_metric( metric, results = metrics_for_one.popitem() else: if metric not in metrics_for_one: - raise KeyError('No given metric in eval results.') + raise KeyError("No given metric in eval results.") results = metrics_for_one[metric] num_iteration = len(results) max_result = max(results) @@ -397,16 +394,16 @@ def plot_metric( min_result = min(min(results), min_result) ax.plot(x_, results, label=name) - ax.legend(loc='best') + ax.legend(loc="best") if xlim is not None: - _check_not_tuple_of_2_elements(xlim, 'xlim') + _check_not_tuple_of_2_elements(xlim, "xlim") else: xlim = (0, num_iteration) ax.set_xlim(xlim) if ylim is not None: - _check_not_tuple_of_2_elements(ylim, 'ylim') + _check_not_tuple_of_2_elements(ylim, "ylim") else: range_result = max_result - min_result ylim = (min_result - range_result * 0.2, max_result + range_result * 0.2) @@ -417,7 +414,7 @@ def plot_metric( if xlabel is not None: ax.set_xlabel(xlabel) if ylabel is not None: - ylabel = ylabel.replace('@metric@', metric) + ylabel = ylabel.replace("@metric@", metric) ax.set_ylabel(ylabel) ax.grid(grid) return ax @@ -432,19 +429,20 @@ def _determine_direction_for_numeric_split( missing_type = _MissingType(missing_type_str) if math.isnan(fval) and missing_type != _MissingType.NAN: fval = 0.0 - if ((missing_type == _MissingType.ZERO and _is_zero(fval)) - or (missing_type == _MissingType.NAN and math.isnan(fval))): - direction = 'left' if default_left else 'right' + if (missing_type == _MissingType.ZERO and _is_zero(fval)) or ( + missing_type == _MissingType.NAN and math.isnan(fval) + ): + direction = "left" if default_left else "right" else: - direction = 'left' if fval <= threshold else 'right' + direction = "left" if fval <= threshold else "right" return direction def _determine_direction_for_categorical_split(fval: float, thresholds: str) -> str: if math.isnan(fval) or int(fval) < 0: - return 'right' - int_thresholds = {int(t) for t in thresholds.split('||')} - return 'left' if int(fval) in int_thresholds else 'right' + return "right" + int_thresholds = {int(t) for t in thresholds.split("||")} + return "left" if int(fval) in int_thresholds else "right" def _to_graphviz( @@ -456,7 +454,7 @@ def _to_graphviz( constraints: Optional[List[int]], example_case: Optional[Union[np.ndarray, pd_DataFrame]], max_category_values: int, - **kwargs: Any + **kwargs: Any, ) -> Any: """Convert specified tree to graphviz instance. @@ -466,120 +464,124 @@ def _to_graphviz( if GRAPHVIZ_INSTALLED: from graphviz import Digraph else: - raise ImportError('You must install graphviz and restart your session to plot tree.') + raise ImportError("You must install graphviz and restart your session to plot tree.") def add( - root: Dict[str, Any], - total_count: int, - parent: Optional[str], - decision: Optional[str], - highlight: bool + root: Dict[str, Any], total_count: int, parent: Optional[str], decision: Optional[str], highlight: bool ) -> None: """Recursively add node or edge.""" - fillcolor = 'white' - style = '' + fillcolor = "white" + style = "" tooltip = None if highlight: - color = 'blue' - penwidth = '3' + color = "blue" + penwidth = "3" else: - color = 'black' - penwidth = '1' - if 'split_index' in root: # non-leaf + color = "black" + penwidth = "1" + if "split_index" in root: # non-leaf shape = "rectangle" - l_dec = 'yes' - r_dec = 'no' - threshold = root['threshold'] - if root['decision_type'] == '<=': + l_dec = "yes" + r_dec = "no" + threshold = root["threshold"] + if root["decision_type"] == "<=": operator = "≤" - elif root['decision_type'] == '==': + elif root["decision_type"] == "==": operator = "=" else: - raise ValueError('Invalid decision type in tree model.') + raise ValueError("Invalid decision type in tree model.") name = f"split{root['split_index']}" - split_feature = root['split_feature'] + split_feature = root["split_feature"] if feature_names is not None: label = f"{feature_names[split_feature]} {operator}" else: label = f"feature {split_feature} {operator} " direction = None if example_case is not None: - if root['decision_type'] == '==': + if root["decision_type"] == "==": direction = _determine_direction_for_categorical_split( - fval=example_case[split_feature], - thresholds=root['threshold'] + fval=example_case[split_feature], thresholds=root["threshold"] ) else: direction = _determine_direction_for_numeric_split( fval=example_case[split_feature], - threshold=root['threshold'], - missing_type_str=root['missing_type'], - default_left=root['default_left'] + threshold=root["threshold"], + missing_type_str=root["missing_type"], + default_left=root["default_left"], ) - if root['decision_type'] == '==': - category_values = root['threshold'].split('||') + if root["decision_type"] == "==": + category_values = root["threshold"].split("||") if len(category_values) > max_category_values: - tooltip = root['threshold'] - threshold = '||'.join(category_values[:2]) + '||...||' + category_values[-1] + tooltip = root["threshold"] + threshold = "||".join(category_values[:2]) + "||...||" + category_values[-1] label += f"{_float2str(threshold, precision)}" - for info in ['split_gain', 'internal_value', 'internal_weight', "internal_count", "data_percentage"]: + for info in ["split_gain", "internal_value", "internal_weight", "internal_count", "data_percentage"]: if info in show_info: - output = info.split('_')[-1] - if info in {'split_gain', 'internal_value', 'internal_weight'}: + output = info.split("_")[-1] + if info in {"split_gain", "internal_value", "internal_weight"}: label += f"
{_float2str(root[info], precision)} {output}" - elif info == 'internal_count': + elif info == "internal_count": label += f"
{output}: {root[info]}" elif info == "data_percentage": label += f"
{_float2str(root['internal_count'] / total_count * 100, 2)}% of data" if constraints: - if constraints[root['split_feature']] == 1: + if constraints[root["split_feature"]] == 1: fillcolor = "#ddffdd" # light green - if constraints[root['split_feature']] == -1: + if constraints[root["split_feature"]] == -1: fillcolor = "#ffdddd" # light red style = "filled" label = f"<{label}>" add( - root=root['left_child'], + root=root["left_child"], total_count=total_count, parent=name, decision=l_dec, - highlight=highlight and direction == "left" + highlight=highlight and direction == "left", ) add( - root=root['right_child'], + root=root["right_child"], total_count=total_count, parent=name, decision=r_dec, - highlight=highlight and direction == "right" + highlight=highlight and direction == "right", ) else: # leaf shape = "ellipse" name = f"leaf{root['leaf_index']}" label = f"leaf {root['leaf_index']}: " label += f"{_float2str(root['leaf_value'], precision)}" - if 'leaf_weight' in show_info: + if "leaf_weight" in show_info: label += f"
{_float2str(root['leaf_weight'], precision)} weight" - if 'leaf_count' in show_info: + if "leaf_count" in show_info: label += f"
count: {root['leaf_count']}" if "data_percentage" in show_info: label += f"
{_float2str(root['leaf_count'] / total_count * 100, 2)}% of data" label = f"<{label}>" - graph.node(name, label=label, shape=shape, style=style, fillcolor=fillcolor, color=color, penwidth=penwidth, tooltip=tooltip) + graph.node( + name, + label=label, + shape=shape, + style=style, + fillcolor=fillcolor, + color=color, + penwidth=penwidth, + tooltip=tooltip, + ) if parent is not None: graph.edge(parent, name, decision, color=color, penwidth=penwidth) graph = Digraph(**kwargs) rankdir = "LR" if orientation == "horizontal" else "TB" graph.attr("graph", nodesep="0.05", ranksep="0.3", rankdir=rankdir) - if "internal_count" in tree_info['tree_structure']: + if "internal_count" in tree_info["tree_structure"]: add( - root=tree_info['tree_structure'], - total_count=tree_info['tree_structure']["internal_count"], + root=tree_info["tree_structure"], + total_count=tree_info["tree_structure"]["internal_count"], parent=None, decision=None, - highlight=example_case is not None + highlight=example_case is not None, ) else: raise Exception("Cannot plot trees with no split") @@ -610,10 +612,10 @@ def create_tree_digraph( tree_index: int = 0, show_info: Optional[List[str]] = None, precision: Optional[int] = 3, - orientation: str = 'horizontal', + orientation: str = "horizontal", example_case: Optional[Union[np.ndarray, pd_DataFrame]] = None, max_category_values: int = 10, - **kwargs: Any + **kwargs: Any, ) -> Any: """Create a digraph representation of specified tree. @@ -689,32 +691,32 @@ def create_tree_digraph( if isinstance(booster, LGBMModel): booster = booster.booster_ elif not isinstance(booster, Booster): - raise TypeError('booster must be Booster or LGBMModel.') + raise TypeError("booster must be Booster or LGBMModel.") model = booster.dump_model() - tree_infos = model['tree_info'] - feature_names = model.get('feature_names', None) - monotone_constraints = model.get('monotone_constraints', None) + tree_infos = model["tree_info"] + feature_names = model.get("feature_names", None) + monotone_constraints = model.get("monotone_constraints", None) if tree_index < len(tree_infos): tree_info = tree_infos[tree_index] else: - raise IndexError('tree_index is out of range.') + raise IndexError("tree_index is out of range.") if show_info is None: show_info = [] if example_case is not None: if not isinstance(example_case, (np.ndarray, pd_DataFrame)) or example_case.ndim != 2: - raise ValueError('example_case must be a numpy 2-D array or a pandas DataFrame') + raise ValueError("example_case must be a numpy 2-D array or a pandas DataFrame") if example_case.shape[0] != 1: - raise ValueError('example_case must have a single row.') + raise ValueError("example_case must have a single row.") if isinstance(example_case, pd_DataFrame): example_case = _data_from_pandas( data=example_case, feature_name="auto", categorical_feature="auto", - pandas_categorical=booster.pandas_categorical + pandas_categorical=booster.pandas_categorical, )[0] example_case = example_case[0] @@ -727,7 +729,7 @@ def create_tree_digraph( constraints=monotone_constraints, example_case=example_case, max_category_values=max_category_values, - **kwargs + **kwargs, ) @@ -739,9 +741,9 @@ def plot_tree( dpi: Optional[int] = None, show_info: Optional[List[str]] = None, precision: Optional[int] = 3, - orientation: str = 'horizontal', + orientation: str = "horizontal", example_case: Optional[Union[np.ndarray, pd_DataFrame]] = None, - **kwargs: Any + **kwargs: Any, ) -> Any: """Plot specified tree. @@ -807,22 +809,28 @@ def plot_tree( import matplotlib.image as image import matplotlib.pyplot as plt else: - raise ImportError('You must install matplotlib and restart your session to plot tree.') + raise ImportError("You must install matplotlib and restart your session to plot tree.") if ax is None: if figsize is not None: - _check_not_tuple_of_2_elements(figsize, 'figsize') + _check_not_tuple_of_2_elements(figsize, "figsize") _, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi) - graph = create_tree_digraph(booster=booster, tree_index=tree_index, - show_info=show_info, precision=precision, - orientation=orientation, example_case=example_case, **kwargs) + graph = create_tree_digraph( + booster=booster, + tree_index=tree_index, + show_info=show_info, + precision=precision, + orientation=orientation, + example_case=example_case, + **kwargs, + ) s = BytesIO() - s.write(graph.pipe(format='png')) + s.write(graph.pipe(format="png")) s.seek(0) img = image.imread(s) ax.imshow(img) - ax.axis('off') + ax.axis("off") return ax diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 9eb2219c835c..a6a7554bdb62 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -46,69 +46,43 @@ from .engine import train __all__ = [ - 'LGBMClassifier', - 'LGBMModel', - 'LGBMRanker', - 'LGBMRegressor', + "LGBMClassifier", + "LGBMModel", + "LGBMRanker", + "LGBMRegressor", ] _LGBM_ScikitMatrixLike = Union[ - dt_DataTable, - List[Union[List[float], List[int]]], - np.ndarray, - pd_DataFrame, - scipy.sparse.spmatrix + dt_DataTable, List[Union[List[float], List[int]]], np.ndarray, pd_DataFrame, scipy.sparse.spmatrix ] _LGBM_ScikitCustomObjectiveFunction = Union[ # f(labels, preds) - Callable[ - [Optional[np.ndarray], np.ndarray], - Tuple[np.ndarray, np.ndarray] - ], + Callable[[Optional[np.ndarray], np.ndarray], Tuple[np.ndarray, np.ndarray]], # f(labels, preds, weights) - Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], - Tuple[np.ndarray, np.ndarray] - ], + Callable[[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], Tuple[np.ndarray, np.ndarray]], # f(labels, preds, weights, group) Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], - Tuple[np.ndarray, np.ndarray] + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], Tuple[np.ndarray, np.ndarray] ], ] _LGBM_ScikitCustomEvalFunction = Union[ # f(labels, preds) - Callable[ - [Optional[np.ndarray], np.ndarray], - _LGBM_EvalFunctionResultType - ], - Callable[ - [Optional[np.ndarray], np.ndarray], - List[_LGBM_EvalFunctionResultType] - ], + Callable[[Optional[np.ndarray], np.ndarray], _LGBM_EvalFunctionResultType], + Callable[[Optional[np.ndarray], np.ndarray], List[_LGBM_EvalFunctionResultType]], # f(labels, preds, weights) - Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], - _LGBM_EvalFunctionResultType - ], - Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], - List[_LGBM_EvalFunctionResultType] - ], + Callable[[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], _LGBM_EvalFunctionResultType], + Callable[[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], List[_LGBM_EvalFunctionResultType]], # f(labels, preds, weights, group) Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], - _LGBM_EvalFunctionResultType + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], _LGBM_EvalFunctionResultType ], Callable[ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], - List[_LGBM_EvalFunctionResultType] - ] + List[_LGBM_EvalFunctionResultType], + ], ] _LGBM_ScikitEvalMetricType = Union[ - str, - _LGBM_ScikitCustomEvalFunction, - List[Union[str, _LGBM_ScikitCustomEvalFunction]] + str, _LGBM_ScikitCustomEvalFunction, List[Union[str, _LGBM_ScikitCustomEvalFunction]] ] _LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType] @@ -119,7 +93,7 @@ def _get_group_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray "Estimators in lightgbm.sklearn should only retrieve query groups from a constructed Dataset. " "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." ) - assert (group is None or isinstance(group, np.ndarray)), error_msg + assert group is None or isinstance(group, np.ndarray), error_msg return group @@ -139,7 +113,7 @@ def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarra "Estimators in lightgbm.sklearn should only retrieve weights from a constructed Dataset. " "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." ) - assert (weight is None or isinstance(weight, np.ndarray)), error_msg + assert weight is None or isinstance(weight, np.ndarray), error_msg return weight @@ -269,9 +243,7 @@ def __init__(self, func: _LGBM_ScikitCustomEvalFunction): self.func = func def __call__( - self, - preds: np.ndarray, - dataset: Dataset + self, preds: np.ndarray, dataset: Dataset ) -> Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]: """Call passed function with appropriate arguments. @@ -310,8 +282,7 @@ def __call__( # documentation templates for LGBMModel methods are shared between the classes in # this module and those in the ``dask`` module -_lgbmmodel_doc_fit = ( - """ +_lgbmmodel_doc_fit = """ Build a gradient boosting model from the training set (X, y). Parameters @@ -372,7 +343,6 @@ def __call__( self : LGBMModel Returns self. """ -) _lgbmmodel_doc_custom_eval_note = """ Note @@ -405,8 +375,7 @@ def __call__( Is eval result higher better, e.g. AUC is ``is_higher_better``. """ -_lgbmmodel_doc_predict = ( - """ +_lgbmmodel_doc_predict = """ {description} Parameters @@ -451,7 +420,6 @@ def __call__( X_SHAP_values : {X_SHAP_values_shape} If ``pred_contrib=True``, the feature contributions for each sample. """ -) class LGBMModel(_LGBMModelBase): @@ -459,7 +427,7 @@ class LGBMModel(_LGBMModelBase): def __init__( self, - boosting_type: str = 'gbdt', + boosting_type: str = "gbdt", num_leaves: int = 31, max_depth: int = -1, learning_rate: float = 0.1, @@ -467,18 +435,18 @@ def __init__( subsample_for_bin: int = 200000, objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None, class_weight: Optional[Union[Dict, str]] = None, - min_split_gain: float = 0., + min_split_gain: float = 0.0, min_child_weight: float = 1e-3, min_child_samples: int = 20, - subsample: float = 1., + subsample: float = 1.0, subsample_freq: int = 0, - colsample_bytree: float = 1., - reg_alpha: float = 0., - reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, + colsample_bytree: float = 1.0, + reg_alpha: float = 0.0, + reg_lambda: float = 0.0, + random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None, n_jobs: Optional[int] = None, - importance_type: str = 'split', - **kwargs + importance_type: str = "split", + **kwargs, ): r"""Construct a gradient boosting model. @@ -598,8 +566,10 @@ def __init__( and grad and hess should be returned in the same format. """ if not SKLEARN_INSTALLED: - raise LightGBMError('scikit-learn is required for lightgbm.sklearn. ' - 'You must install scikit-learn and restart your session to use this module.') + raise LightGBMError( + "scikit-learn is required for lightgbm.sklearn. " + "You must install scikit-learn and restart your session to use this module." + ) self.boosting_type = boosting_type self.objective = objective @@ -636,14 +606,13 @@ def __init__( def _more_tags(self) -> Dict[str, Any]: return { - 'allow_nan': True, - 'X_types': ['2darray', 'sparse', '1dlabels'], - '_xfail_checks': { - 'check_no_attributes_set_in_init': - 'scikit-learn incorrectly asserts that private attributes ' - 'cannot be set in __init__: ' - '(see https://github.com/microsoft/LightGBM/issues/2628)' - } + "allow_nan": True, + "X_types": ["2darray", "sparse", "1dlabels"], + "_xfail_checks": { + "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " + "cannot be set in __init__: " + "(see https://github.com/microsoft/LightGBM/issues/2628)" + }, } def __sklearn_is_fitted__(self) -> bool: @@ -703,8 +672,8 @@ def _process_params(self, stage: str) -> Dict[str, Any]: assert stage in {"fit", "predict"} params = self.get_params() - params.pop('objective', None) - for alias in _ConfigAliases.get('objective'): + params.pop("objective", None) + for alias in _ConfigAliases.get("objective"): if alias in params: obj = params.pop(alias) _log_warning(f"Found '{alias}' in params. Will use it instead of 'objective' argument") @@ -725,33 +694,31 @@ def _process_params(self, stage: str) -> Dict[str, Any]: raise ValueError("Unknown LGBMModel type.") if callable(self._objective): if stage == "fit": - params['objective'] = _ObjectiveFunctionWrapper(self._objective) + params["objective"] = _ObjectiveFunctionWrapper(self._objective) else: - params['objective'] = 'None' + params["objective"] = "None" else: - params['objective'] = self._objective + params["objective"] = self._objective - params.pop('importance_type', None) - params.pop('n_estimators', None) - params.pop('class_weight', None) + params.pop("importance_type", None) + params.pop("n_estimators", None) + params.pop("class_weight", None) - if isinstance(params['random_state'], np.random.RandomState): - params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max) - elif isinstance(params['random_state'], np_random_Generator): - params['random_state'] = int( - params['random_state'].integers(np.iinfo(np.int32).max) - ) + if isinstance(params["random_state"], np.random.RandomState): + params["random_state"] = params["random_state"].randint(np.iinfo(np.int32).max) + elif isinstance(params["random_state"], np_random_Generator): + params["random_state"] = int(params["random_state"].integers(np.iinfo(np.int32).max)) if self._n_classes > 2: - for alias in _ConfigAliases.get('num_class'): + for alias in _ConfigAliases.get("num_class"): params.pop(alias, None) - params['num_class'] = self._n_classes - if hasattr(self, '_eval_at'): + params["num_class"] = self._n_classes + if hasattr(self, "_eval_at"): eval_at = self._eval_at - for alias in _ConfigAliases.get('eval_at'): + for alias in _ConfigAliases.get("eval_at"): if alias in params: _log_warning(f"Found '{alias}' in params. Will use it instead of 'eval_at' argument") eval_at = params.pop(alias) - params['eval_at'] = eval_at + params["eval_at"] = eval_at # register default metric for consistency with callable eval_metric case original_metric = self._objective if isinstance(self._objective, str) else None @@ -809,10 +776,10 @@ def fit( eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, eval_group: Optional[List[_LGBM_GroupType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", callbacks: Optional[List[Callable]] = None, - init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None + init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None, ) -> "LGBMModel": """Docstring is set after definition, using a template.""" params = self._process_params(stage="fit") @@ -832,9 +799,9 @@ def fit( eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, str)] # concatenate metric from params (or default if not provided in params) and eval_metric - params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric'] - params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric'] - params['metric'] = [metric for metric in params['metric'] if metric is not None] + params["metric"] = [params["metric"]] if isinstance(params["metric"], (str, type(None))) else params["metric"] + params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"] + params["metric"] = [metric for metric in params["metric"] if metric is not None] if not isinstance(X, (pd_DataFrame, dt_DataTable)): _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2) @@ -856,9 +823,15 @@ def fit( # copy for consistency self._n_features_in = self._n_features - train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group, - init_score=init_score, categorical_feature=categorical_feature, - params=params) + train_set = Dataset( + data=_X, + label=_y, + weight=sample_weight, + group=group, + init_score=init_score, + categorical_feature=categorical_feature, + params=params, + ) valid_sets: List[Dataset] = [] if eval_set is not None: @@ -880,8 +853,8 @@ def _get_meta_data(collection, name, i): if valid_data[0] is X and valid_data[1] is y: valid_set = train_set else: - valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i) - valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i) + valid_weight = _get_meta_data(eval_sample_weight, "eval_sample_weight", i) + valid_class_weight = _get_meta_data(eval_class_weight, "eval_class_weight", i) if valid_class_weight is not None: if isinstance(valid_class_weight, dict) and self._class_map is not None: valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()} @@ -890,11 +863,17 @@ def _get_meta_data(collection, name, i): valid_weight = valid_class_sample_weight else: valid_weight = np.multiply(valid_weight, valid_class_sample_weight) - valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i) - valid_group = _get_meta_data(eval_group, 'eval_group', i) - valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight, - group=valid_group, init_score=valid_init_score, - categorical_feature='auto', params=params) + valid_init_score = _get_meta_data(eval_init_score, "eval_init_score", i) + valid_group = _get_meta_data(eval_group, "eval_group", i) + valid_set = Dataset( + data=valid_data[0], + label=valid_data[1], + weight=valid_weight, + group=valid_group, + init_score=valid_init_score, + categorical_feature="auto", + params=params, + ) valid_sets.append(valid_set) @@ -918,7 +897,7 @@ def _get_meta_data(collection, name, i): feval=eval_metrics_callable, # type: ignore[arg-type] init_model=init_model, feature_name=feature_name, - callbacks=callbacks + callbacks=callbacks, ) self._evals_result = evals_result @@ -932,16 +911,20 @@ def _get_meta_data(collection, name, i): del train_set, valid_sets return self - fit.__doc__ = _lgbmmodel_doc_fit.format( - X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]", - y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]", - sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)", - init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", - group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)", - eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)", - eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)", - eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)" - ) + "\n\n" + _lgbmmodel_doc_custom_eval_note + fit.__doc__ = ( + _lgbmmodel_doc_fit.format( + X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]", + y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]", + sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)", + init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", + group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)", + eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)", + eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)", + eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)", + ) + + "\n\n" + + _lgbmmodel_doc_custom_eval_note + ) def predict( self, @@ -952,7 +935,7 @@ def predict( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ): """Docstring is set after definition, using a template.""" if not self.__sklearn_is_fitted__(): @@ -961,21 +944,16 @@ def predict( X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False) n_features = X.shape[1] if self._n_features != n_features: - raise ValueError("Number of features of the model must " - f"match the input. Model n_features_ is {self._n_features} and " - f"input n_features is {n_features}") + raise ValueError( + "Number of features of the model must " + f"match the input. Model n_features_ is {self._n_features} and " + f"input n_features is {n_features}" + ) # retrive original params that possibly can be used in both training and prediction # and then overwrite them (considering aliases) with params that were passed directly in prediction predict_params = self._process_params(stage="predict") for alias in _ConfigAliases.get_by_alias( - "data", - "X", - "raw_score", - "start_iteration", - "num_iteration", - "pred_leaf", - "pred_contrib", - *kwargs.keys() + "data", "X", "raw_score", "start_iteration", "num_iteration", "pred_leaf", "pred_contrib", *kwargs.keys() ): predict_params.pop(alias, None) predict_params.update(kwargs) @@ -986,9 +964,14 @@ def predict( predict_params["num_threads"] = self._process_n_jobs(predict_params["num_threads"]) return self._Booster.predict( # type: ignore[union-attr] - X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration, - pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **predict_params + X, + raw_score=raw_score, + start_iteration=start_iteration, + num_iteration=num_iteration, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + validate_features=validate_features, + **predict_params, ) predict.__doc__ = _lgbmmodel_doc_predict.format( @@ -997,42 +980,44 @@ def predict( output_name="predicted_result", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects" + X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects", ) @property def n_features_(self) -> int: """:obj:`int`: The number of features of fitted model.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.') + raise LGBMNotFittedError("No n_features found. Need to call fit beforehand.") return self._n_features @property def n_features_in_(self) -> int: """:obj:`int`: The number of features of fitted model.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.') + raise LGBMNotFittedError("No n_features_in found. Need to call fit beforehand.") return self._n_features_in @property def best_score_(self) -> _LGBM_BoosterBestScoreType: """:obj:`dict`: The best score of fitted model.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No best_score found. Need to call fit beforehand.') + raise LGBMNotFittedError("No best_score found. Need to call fit beforehand.") return self._best_score @property def best_iteration_(self) -> int: """:obj:`int`: The best iteration of fitted model if ``early_stopping()`` callback has been specified.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping callback beforehand.') + raise LGBMNotFittedError( + "No best_iteration found. Need to call fit with early_stopping callback beforehand." + ) return self._best_iteration @property def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]: """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No objective found. Need to call fit beforehand.') + raise LGBMNotFittedError("No objective found. Need to call fit beforehand.") return self._objective # type: ignore[return-value] @property @@ -1041,11 +1026,11 @@ def n_estimators_(self) -> int: This might be less than parameter ``n_estimators`` if early stopping was enabled or if boosting stopped early due to limits on complexity like ``min_gain_to_split``. - + .. versionadded:: 4.0.0 """ if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No n_estimators found. Need to call fit beforehand.') + raise LGBMNotFittedError("No n_estimators found. Need to call fit beforehand.") return self._Booster.current_iteration() # type: ignore @property @@ -1054,25 +1039,25 @@ def n_iter_(self) -> int: This might be less than parameter ``n_estimators`` if early stopping was enabled or if boosting stopped early due to limits on complexity like ``min_gain_to_split``. - + .. versionadded:: 4.0.0 """ if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No n_iter found. Need to call fit beforehand.') + raise LGBMNotFittedError("No n_iter found. Need to call fit beforehand.") return self._Booster.current_iteration() # type: ignore @property def booster_(self) -> Booster: """Booster: The underlying Booster of this model.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No booster found. Need to call fit beforehand.') + raise LGBMNotFittedError("No booster found. Need to call fit beforehand.") return self._Booster # type: ignore[return-value] @property def evals_result_(self) -> _EvalResultDict: """:obj:`dict`: The evaluation results if validation sets have been specified.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No results found. Need to call fit with eval_set beforehand.') + raise LGBMNotFittedError("No results found. Need to call fit with eval_set beforehand.") return self._evals_result @property @@ -1085,14 +1070,14 @@ def feature_importances_(self) -> np.ndarray: to configure the type of importance values to be extracted. """ if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.') + raise LGBMNotFittedError("No feature_importances found. Need to call fit beforehand.") return self._Booster.feature_importance(importance_type=self.importance_type) # type: ignore[union-attr] @property def feature_name_(self) -> List[str]: """:obj:`list` of shape = [n_features]: The names of features.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No feature_name found. Need to call fit beforehand.') + raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.") return self._Booster.feature_name() # type: ignore[union-attr] @@ -1110,10 +1095,10 @@ def fit( # type: ignore[override] eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", callbacks: Optional[List[Callable]] = None, - init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None + init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None, ) -> "LGBMRegressor": """Docstring is inherited from the LGBMModel.""" super().fit( @@ -1129,17 +1114,17 @@ def fit( # type: ignore[override] feature_name=feature_name, categorical_feature=categorical_feature, callbacks=callbacks, - init_model=init_model + init_model=init_model, ) return self _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRegressor") # type: ignore - _base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore - + _base_doc[_base_doc.find('eval_set :'):]) # type: ignore - _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')] - + _base_doc[_base_doc.find('eval_init_score :'):]) - fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')] - + _base_doc[_base_doc.find('eval_metric :'):]) + _base_doc = ( + _base_doc[: _base_doc.find("group :")] # type: ignore + + _base_doc[_base_doc.find("eval_set :") :] + ) # type: ignore + _base_doc = _base_doc[: _base_doc.find("eval_class_weight :")] + _base_doc[_base_doc.find("eval_init_score :") :] + fit.__doc__ = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :] class LGBMClassifier(_LGBMClassifierBase, LGBMModel): @@ -1157,10 +1142,10 @@ def fit( # type: ignore[override] eval_class_weight: Optional[List[float]] = None, eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", callbacks: Optional[List[Callable]] = None, - init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None + init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None, ) -> "LGBMClassifier": """Docstring is inherited from the LGBMModel.""" _LGBMAssertAllFinite(y) @@ -1187,16 +1172,16 @@ def fit( # type: ignore[override] eval_metric_list = [] if self._n_classes > 2: for index, metric in enumerate(eval_metric_list): - if metric in {'logloss', 'binary_logloss'}: + if metric in {"logloss", "binary_logloss"}: eval_metric_list[index] = "multi_logloss" - elif metric in {'error', 'binary_error'}: + elif metric in {"error", "binary_error"}: eval_metric_list[index] = "multi_error" else: for index, metric in enumerate(eval_metric_list): - if metric in {'logloss', 'multi_logloss'}: - eval_metric_list[index] = 'binary_logloss' - elif metric in {'error', 'multi_error'}: - eval_metric_list[index] = 'binary_error' + if metric in {"logloss", "multi_logloss"}: + eval_metric_list[index] = "binary_logloss" + elif metric in {"error", "multi_error"}: + eval_metric_list[index] = "binary_error" eval_metric = eval_metric_list # do not modify args, as it causes errors in model selection tools @@ -1225,15 +1210,16 @@ def fit( # type: ignore[override] feature_name=feature_name, categorical_feature=categorical_feature, callbacks=callbacks, - init_model=init_model + init_model=init_model, ) return self _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMClassifier") # type: ignore - _base_doc = (_base_doc[:_base_doc.find('group :')] # type: ignore - + _base_doc[_base_doc.find('eval_set :'):]) # type: ignore - fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')] - + _base_doc[_base_doc.find('eval_metric :'):]) + _base_doc = ( + _base_doc[: _base_doc.find("group :")] # type: ignore + + _base_doc[_base_doc.find("eval_set :") :] + ) # type: ignore + fit.__doc__ = _base_doc[: _base_doc.find("eval_group :")] + _base_doc[_base_doc.find("eval_metric :") :] def predict( self, @@ -1244,7 +1230,7 @@ def predict( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ): """Docstring is inherited from the LGBMModel.""" result = self.predict_proba( @@ -1255,7 +1241,7 @@ def predict( pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **kwargs + **kwargs, ) if callable(self._objective) or raw_score or pred_leaf or pred_contrib: return result @@ -1274,7 +1260,7 @@ def predict_proba( pred_leaf: bool = False, pred_contrib: bool = False, validate_features: bool = False, - **kwargs: Any + **kwargs: Any, ): """Docstring is set after definition, using a template.""" result = super().predict( @@ -1285,17 +1271,19 @@ def predict_proba( pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features, - **kwargs + **kwargs, ) if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib): - _log_warning("Cannot compute class probabilities or labels " - "due to the usage of customized objective function.\n" - "Returning raw scores instead.") + _log_warning( + "Cannot compute class probabilities or labels " + "due to the usage of customized objective function.\n" + "Returning raw scores instead." + ) return result elif self._n_classes > 2 or raw_score or pred_leaf or pred_contrib: # type: ignore [operator] return result else: - return np.vstack((1. - result, result)).transpose() + return np.vstack((1.0 - result, result)).transpose() predict_proba.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted probability for each class for each sample.", @@ -1303,21 +1291,21 @@ def predict_proba( output_name="predicted_probability", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", - X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects" + X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects", ) @property def classes_(self) -> np.ndarray: """:obj:`array` of shape = [n_classes]: The class label array.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No classes found. Need to call fit beforehand.') + raise LGBMNotFittedError("No classes found. Need to call fit beforehand.") return self._classes # type: ignore[return-value] @property def n_classes_(self) -> int: """:obj:`int`: The number of classes.""" if not self.__sklearn_is_fitted__(): - raise LGBMNotFittedError('No classes found. Need to call fit beforehand.') + raise LGBMNotFittedError("No classes found. Need to call fit beforehand.") return self._n_classes @@ -1345,10 +1333,10 @@ def fit( # type: ignore[override] eval_group: Optional[List[_LGBM_GroupType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5), - feature_name: _LGBM_FeatureNameConfiguration = 'auto', - categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', + feature_name: _LGBM_FeatureNameConfiguration = "auto", + categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto", callbacks: Optional[List[Callable]] = None, - init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None + init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None, ) -> "LGBMRanker": """Docstring is inherited from the LGBMModel.""" # check group data @@ -1360,12 +1348,16 @@ def fit( # type: ignore[override] raise ValueError("Eval_group cannot be None when eval_set is not None") elif len(eval_group) != len(eval_set): raise ValueError("Length of eval_group should be equal to eval_set") - elif (isinstance(eval_group, dict) - and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group))) - or isinstance(eval_group, list) - and any(group is None for group in eval_group)): - raise ValueError("Should set group for all eval datasets for ranking task; " - "if you use dict, the index should start from 0") + elif ( + isinstance(eval_group, dict) + and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group))) + or isinstance(eval_group, list) + and any(group is None for group in eval_group) + ): + raise ValueError( + "Should set group for all eval datasets for ranking task; " + "if you use dict, the index should start from 0" + ) self._eval_at = eval_at super().fit( @@ -1383,15 +1375,17 @@ def fit( # type: ignore[override] feature_name=feature_name, categorical_feature=categorical_feature, callbacks=callbacks, - init_model=init_model + init_model=init_model, ) return self _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRanker") # type: ignore - fit.__doc__ = (_base_doc[:_base_doc.find('eval_class_weight :')] # type: ignore - + _base_doc[_base_doc.find('eval_init_score :'):]) # type: ignore + fit.__doc__ = ( + _base_doc[: _base_doc.find("eval_class_weight :")] # type: ignore + + _base_doc[_base_doc.find("eval_init_score :") :] + ) # type: ignore _base_doc = fit.__doc__ - _before_feature_name, _feature_name, _after_feature_name = _base_doc.partition('feature_name :') + _before_feature_name, _feature_name, _after_feature_name = _base_doc.partition("feature_name :") fit.__doc__ = f"""{_before_feature_name}eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5)) The evaluation positions of the specified metric. {_feature_name}{_after_feature_name}""" diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 648d400a2c9f..359a09f87445 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -114,7 +114,6 @@ exclude = [ "compile/*.py", "external_libs/*.py", "lightgbm-python/*.py", - "python-package/*.py", ] indent-style = "space" quote-style = "double" From 6bc43913c5658c8d3d0ed57c54ec9ac65431ba91 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 17:30:50 -0600 Subject: [PATCH 02/15] preserve indentation for some short collections --- python-package/lightgbm/basic.py | 7 ++++++- python-package/lightgbm/libpath.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 30788db76829..a269749a5c8c 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -659,7 +659,12 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va } """String name to int feature importance type mapper""" -_FEATURE_IMPORTANCE_TYPE_MAPPER = {"split": _C_API_FEATURE_IMPORTANCE_SPLIT, "gain": _C_API_FEATURE_IMPORTANCE_GAIN} +# fmt: off +_FEATURE_IMPORTANCE_TYPE_MAPPER = { + "split": _C_API_FEATURE_IMPORTANCE_SPLIT, + "gain": _C_API_FEATURE_IMPORTANCE_GAIN +} +# fmt: on def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray: diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py index df540f2ba8fd..49ec74d2f9b2 100644 --- a/python-package/lightgbm/libpath.py +++ b/python-package/lightgbm/libpath.py @@ -16,7 +16,13 @@ def find_lib_path() -> List[str]: List of all found library paths to LightGBM. """ curr_path = Path(__file__).absolute() - dll_path = [curr_path.parents[1], curr_path.parents[0] / "bin", curr_path.parents[0] / "lib"] + # fmt: off + dll_path = [ + curr_path.parents[1], + curr_path.parents[0] / "bin", + curr_path.parents[0] / "lib" + ] + # fmt: on if system() in ("Windows", "Microsoft"): dll_path.append(curr_path.parents[1] / "Release") dll_path.append(curr_path.parents[1] / "windows" / "x64" / "DLL") From 004754106b6e3dffd8e62110bc585c40120d8e76 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:08:12 -0600 Subject: [PATCH 03/15] reset some handing argument lists --- python-package/lightgbm/basic.py | 295 ++++++++++++++++++++++++------- 1 file changed, 227 insertions(+), 68 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index a269749a5c8c..69d8de703f13 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -58,13 +58,21 @@ _BoosterHandle = ctypes.c_void_p _DatasetHandle = ctypes.c_void_p -_ctypes_int_ptr = Union["ctypes._Pointer[ctypes.c_int32]", "ctypes._Pointer[ctypes.c_int64]"] +_ctypes_int_ptr = Union[ + "ctypes._Pointer[ctypes.c_int32]", + "ctypes._Pointer[ctypes.c_int64]", +] _ctypes_int_array = Union[ - "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]", "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]" + "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]", + "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]", +] +_ctypes_float_ptr = Union[ + "ctypes._Pointer[ctypes.c_float]", + "ctypes._Pointer[ctypes.c_double]", ] -_ctypes_float_ptr = Union["ctypes._Pointer[ctypes.c_float]", "ctypes._Pointer[ctypes.c_double]"] _ctypes_float_array = Union[ - "ctypes.Array[ctypes._Pointer[ctypes.c_float]]", "ctypes.Array[ctypes._Pointer[ctypes.c_double]]" + "ctypes.Array[ctypes._Pointer[ctypes.c_float]]", + "ctypes.Array[ctypes._Pointer[ctypes.c_double]]", ] _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] @@ -80,7 +88,10 @@ pa_Array, pa_ChunkedArray, ] -_LGBM_PositionType = Union[np.ndarray, pd_Series] +_LGBM_PositionType = Union[ + np.ndarray, + pd_Series, +] _LGBM_InitScoreType = Union[ List[float], List[List[float]], @@ -129,6 +140,19 @@ pa_Array, pa_ChunkedArray, ] +_LGBM_SetFieldType = Union[ + List[List[float]], + List[List[int]], + List[float], + List[int], + np.ndarray, + pd_Series, + pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, +] + ZERO_THRESHOLD = 1e-35 @@ -659,12 +683,10 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va } """String name to int feature importance type mapper""" -# fmt: off _FEATURE_IMPORTANCE_TYPE_MAPPER = { "split": _C_API_FEATURE_IMPORTANCE_SPLIT, - "gain": _C_API_FEATURE_IMPORTANCE_GAIN + "gain": _C_API_FEATURE_IMPORTANCE_GAIN, } -# fmt: on def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray: @@ -739,7 +761,10 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: ) -def _pandas_to_numpy(data: pd_DataFrame, target_dtype: "np.typing.DTypeLike") -> np.ndarray: +def _pandas_to_numpy( + data: pd_DataFrame, + target_dtype: "np.typing.DTypeLike", +) -> np.ndarray: _check_for_bad_pandas_dtypes(data.dtypes) try: # most common case (no nullable dtypes) @@ -797,7 +822,8 @@ def _data_from_pandas( def _dump_pandas_categorical( - pandas_categorical: Optional[List[List]], file_name: Optional[Union[str, Path]] = None + pandas_categorical: Optional[List[List]], + file_name: Optional[Union[str, Path]] = None, ) -> str: categorical_json = json.dumps(pandas_categorical, default=_json_default_with_numpy) pandas_str = f"\npandas_categorical:{categorical_json}\n" @@ -808,7 +834,8 @@ def _dump_pandas_categorical( def _load_pandas_categorical( - file_name: Optional[Union[str, Path]] = None, model_str: Optional[str] = None + file_name: Optional[Union[str, Path]] = None, + model_str: Optional[str] = None, ) -> Optional[List[List]]: pandas_key = "pandas_categorical:" offset = -len(pandas_key) @@ -943,7 +970,11 @@ def __init__( self.num_class = out_num_class.value @classmethod - def from_booster(cls, booster: "Booster", pred_parameter: Dict[str, Any]) -> "_InnerPredictor": + def from_booster( + cls, + booster: "Booster", + pred_parameter: Dict[str, Any], + ) -> "_InnerPredictor": """Initialize an ``_InnerPredictor`` from a ``Booster``. Parameters @@ -963,7 +994,11 @@ def from_booster(cls, booster: "Booster", pred_parameter: Dict[str, Any]) -> "_I ) @classmethod - def from_model_file(cls, model_file: Union[str, Path], pred_parameter: Dict[str, Any]) -> "_InnerPredictor": + def from_model_file( + cls, + model_file: Union[str, Path], + pred_parameter: Dict[str, Any], + ) -> "_InnerPredictor": """Initialize an ``_InnerPredictor`` from a text file containing a LightGBM model. Parameters @@ -1059,7 +1094,10 @@ def predict( if isinstance(data, pd_DataFrame): data = _data_from_pandas( - data=data, feature_name="auto", categorical_feature="auto", pandas_categorical=self.pandas_categorical + data=data, + feature_name="auto", + categorical_feature="auto", + pandas_categorical=self.pandas_categorical, )[0] predict_type = _C_API_PREDICT_NORMAL @@ -1088,19 +1126,31 @@ def predict( nrow = preds.shape[0] elif isinstance(data, scipy.sparse.csr_matrix): preds, nrow = self.__pred_for_csr( - csr=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + csr=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) elif isinstance(data, scipy.sparse.csc_matrix): preds, nrow = self.__pred_for_csc( - csc=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + csc=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) elif isinstance(data, np.ndarray): preds, nrow = self.__pred_for_np2d( - mat=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + mat=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) elif _is_pyarrow_table(data): preds, nrow = self.__pred_for_pyarrow_table( - table=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + table=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) elif isinstance(data, list): try: @@ -1108,7 +1158,10 @@ def predict( except BaseException as err: raise ValueError("Cannot convert data list to numpy array.") from err preds, nrow = self.__pred_for_np2d( - mat=data, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + mat=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) elif isinstance(data, dt_DataTable): preds, nrow = self.__pred_for_np2d( @@ -1124,7 +1177,10 @@ def predict( except BaseException as err: raise TypeError(f"Cannot predict data for type {type(data).__name__}") from err preds, nrow = self.__pred_for_csr( - csr=csr, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + csr=csr, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) if pred_leaf: preds = preds.astype(np.int32) @@ -1136,7 +1192,13 @@ def predict( raise ValueError(f"Length of predict result ({preds.size}) cannot be divide nrow ({nrow})") return preds - def __get_num_preds(self, start_iteration: int, num_iteration: int, nrow: int, predict_type: int) -> int: + def __get_num_preds( + self, + start_iteration: int, + num_iteration: int, + nrow: int, + predict_type: int, + ) -> int: """Get size of prediction result.""" if nrow > _MAX_INT32: raise LightGBMError( @@ -1159,7 +1221,12 @@ def __get_num_preds(self, start_iteration: int, num_iteration: int, nrow: int, p return n_preds.value def __inner_predict_np2d( - self, mat: np.ndarray, start_iteration: int, num_iteration: int, predict_type: int, preds: Optional[np.ndarray] + self, + mat: np.ndarray, + start_iteration: int, + num_iteration: int, + predict_type: int, + preds: Optional[np.ndarray], ) -> Tuple[np.ndarray, int]: if mat.dtype == np.float32 or mat.dtype == np.float64: data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False) @@ -1167,7 +1234,10 @@ def __inner_predict_np2d( data = np.array(mat.reshape(mat.size), dtype=np.float32) ptr_data, type_ptr_data, _ = _c_float_array(data) n_preds = self.__get_num_preds( - start_iteration=start_iteration, num_iteration=num_iteration, nrow=mat.shape[0], predict_type=predict_type + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=mat.shape[0], + predict_type=predict_type, ) if preds is None: preds = np.empty(n_preds, dtype=np.float64) @@ -1195,7 +1265,11 @@ def __inner_predict_np2d( return preds, mat.shape[0] def __pred_for_np2d( - self, mat: np.ndarray, start_iteration: int, num_iteration: int, predict_type: int + self, + mat: np.ndarray, + start_iteration: int, + num_iteration: int, + predict_type: int, ) -> Tuple[np.ndarray, int]: """Predict for a 2-D numpy matrix.""" if len(mat.shape) != 2: @@ -1300,7 +1374,10 @@ def __inner_predict_csr( ) -> Tuple[np.ndarray, int]: nrow = len(csr.indptr) - 1 n_preds = self.__get_num_preds( - start_iteration=start_iteration, num_iteration=num_iteration, nrow=nrow, predict_type=predict_type + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=nrow, + predict_type=predict_type, ) if preds is None: preds = np.empty(n_preds, dtype=np.float64) @@ -1428,7 +1505,11 @@ def __pred_for_csr( ) def __inner_predict_sparse_csc( - self, csc: scipy.sparse.csc_matrix, start_iteration: int, num_iteration: int, predict_type: int + self, + csc: scipy.sparse.csc_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int, ): ptr_indptr, type_ptr_indptr, __ = _c_int_array(csc.indptr) ptr_data, type_ptr_data, _ = _c_float_array(csc.data) @@ -1482,20 +1563,33 @@ def __inner_predict_sparse_csc( return matrices, nrow def __pred_for_csc( - self, csc: scipy.sparse.csc_matrix, start_iteration: int, num_iteration: int, predict_type: int + self, + csc: scipy.sparse.csc_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int, ) -> Tuple[np.ndarray, int]: """Predict for a CSC data.""" nrow = csc.shape[0] if nrow > _MAX_INT32: return self.__pred_for_csr( - csr=csc.tocsr(), start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + csr=csc.tocsr(), + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) if predict_type == _C_API_PREDICT_CONTRIB: return self.__inner_predict_sparse_csc( - csc=csc, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + csc=csc, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) n_preds = self.__get_num_preds( - start_iteration=start_iteration, num_iteration=num_iteration, nrow=nrow, predict_type=predict_type + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=nrow, + predict_type=predict_type, ) preds = np.empty(n_preds, dtype=np.float64) out_num_preds = ctypes.c_int64(0) @@ -1530,7 +1624,11 @@ def __pred_for_csc( return preds, nrow def __pred_for_pyarrow_table( - self, table: pa_Table, start_iteration: int, num_iteration: int, predict_type: int + self, + table: pa_Table, + start_iteration: int, + num_iteration: int, + predict_type: int, ) -> Tuple[np.ndarray, int]: """Predict for a PyArrow table.""" if not PYARROW_INSTALLED: @@ -1542,7 +1640,10 @@ def __pred_for_pyarrow_table( # Prepare prediction output array n_preds = self.__get_num_preds( - start_iteration=start_iteration, num_iteration=num_iteration, nrow=table.num_rows, predict_type=predict_type + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=table.num_rows, + predict_type=predict_type, ) preds = np.empty(n_preds, dtype=np.float64) out_num_preds = ctypes.c_int64(0) @@ -1698,7 +1799,11 @@ def _create_sample_indices(self, total_nrow: int) -> np.ndarray: assert sample_cnt == actual_sample_cnt.value return indices - def _init_from_ref_dataset(self, total_nrow: int, ref_dataset: _DatasetHandle) -> "Dataset": + def _init_from_ref_dataset( + self, + total_nrow: int, + ref_dataset: _DatasetHandle, + ) -> "Dataset": """Create dataset from a reference dataset. Parameters @@ -2069,7 +2174,11 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr return filtered, filtered_idx - def __init_from_seqs(self, seqs: List[Sequence], ref_dataset: Optional[_DatasetHandle]) -> "Dataset": + def __init_from_seqs( + self, + seqs: List[Sequence], + ref_dataset: Optional[_DatasetHandle], + ) -> "Dataset": """ Initialize data from list of Sequence objects. @@ -2098,7 +2207,12 @@ def __init_from_seqs(self, seqs: List[Sequence], ref_dataset: Optional[_DatasetH self._push_rows(seq[start:end]) return self - def __init_from_np2d(self, mat: np.ndarray, params_str: str, ref_dataset: Optional[_DatasetHandle]) -> "Dataset": + def __init_from_np2d( + self, + mat: np.ndarray, + params_str: str, + ref_dataset: Optional[_DatasetHandle], + ) -> "Dataset": """Initialize data from a 2-D numpy matrix.""" if len(mat.shape) != 2: raise ValueError("Input numpy.ndarray must be 2 dimensional") @@ -2125,7 +2239,10 @@ def __init_from_np2d(self, mat: np.ndarray, params_str: str, ref_dataset: Option return self def __init_from_list_np2d( - self, mats: List[np.ndarray], params_str: str, ref_dataset: Optional[_DatasetHandle] + self, + mats: List[np.ndarray], + params_str: str, + ref_dataset: Optional[_DatasetHandle], ) -> "Dataset": """Initialize data from a list of 2-D numpy matrices.""" ncol = mats[0].shape[1] @@ -2177,7 +2294,10 @@ def __init_from_list_np2d( return self def __init_from_csr( - self, csr: scipy.sparse.csr_matrix, params_str: str, ref_dataset: Optional[_DatasetHandle] + self, + csr: scipy.sparse.csr_matrix, + params_str: str, + ref_dataset: Optional[_DatasetHandle], ) -> "Dataset": """Initialize data from a CSR matrix.""" if len(csr.indices) != len(csr.data): @@ -2208,7 +2328,10 @@ def __init_from_csr( return self def __init_from_csc( - self, csc: scipy.sparse.csc_matrix, params_str: str, ref_dataset: Optional[_DatasetHandle] + self, + csc: scipy.sparse.csc_matrix, + params_str: str, + ref_dataset: Optional[_DatasetHandle], ) -> "Dataset": """Initialize data from a CSC matrix.""" if len(csc.indices) != len(csc.data): @@ -2239,7 +2362,10 @@ def __init_from_csc( return self def __init_from_pyarrow_table( - self, table: pa_Table, params_str: str, ref_dataset: Optional[_DatasetHandle] + self, + table: pa_Table, + params_str: str, + ref_dataset: Optional[_DatasetHandle], ) -> "Dataset": """Initialize data from a PyArrow table.""" if not PYARROW_INSTALLED: @@ -2266,7 +2392,9 @@ def __init_from_pyarrow_table( @staticmethod def _compare_params_for_warning( - params: Dict[str, Any], other_params: Dict[str, Any], ignore_keys: Set[str] + params: Dict[str, Any], + other_params: Dict[str, Any], + ignore_keys: Set[str], ) -> bool: """Compare two dictionaries with params ignoring some keys. @@ -2441,7 +2569,11 @@ def create_valid( ret.pandas_categorical = self.pandas_categorical return ret - def subset(self, used_indices: List[int], params: Optional[Dict[str, Any]] = None) -> "Dataset": + def subset( + self, + used_indices: List[int], + params: Optional[Dict[str, Any]] = None, + ) -> "Dataset": """Get subset of current Dataset. Parameters @@ -2528,20 +2660,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[ - Union[ - List[List[float]], - List[List[int]], - List[float], - List[int], - np.ndarray, - pd_Series, - pd_DataFrame, - pa_Table, - pa_Array, - pa_ChunkedArray, - ] - ], + data: Optional[_LGBM_SetFieldType], ) -> "Dataset": """Set property into the Dataset. @@ -2688,7 +2807,10 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: arr = arr.reshape((num_data, num_classes), order="F") return arr - def set_categorical_feature(self, categorical_feature: _LGBM_CategoricalFeatureConfiguration) -> "Dataset": + def set_categorical_feature( + self, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, + ) -> "Dataset": """Set categorical features. Parameters @@ -2723,7 +2845,10 @@ def set_categorical_feature(self, categorical_feature: _LGBM_CategoricalFeatureC "set free_raw_data=False when construct Dataset to avoid this." ) - def _set_predictor(self, predictor: Optional[_InnerPredictor]) -> "Dataset": + def _set_predictor( + self, + predictor: Optional[_InnerPredictor], + ) -> "Dataset": """Set predictor for continued training. It is not recommended for user to call this function. @@ -2740,11 +2865,17 @@ def _set_predictor(self, predictor: Optional[_InnerPredictor]) -> "Dataset": self._predictor = predictor elif self.data is not None: self._predictor = predictor - self._set_init_score_by_predictor(predictor=self._predictor, data=self.data, used_indices=None) + self._set_init_score_by_predictor( + predictor=self._predictor, + data=self.data, + used_indices=None, + ) elif self.used_indices is not None and self.reference is not None and self.reference.data is not None: self._predictor = predictor self._set_init_score_by_predictor( - predictor=self._predictor, data=self.reference.data, used_indices=self.used_indices + predictor=self._predictor, + data=self.reference.data, + used_indices=self.used_indices, ) else: raise LightGBMError( @@ -2836,7 +2967,10 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": self.label = self.get_field("label") # original values can be modified at cpp side return self - def set_weight(self, weight: Optional[_LGBM_WeightType]) -> "Dataset": + def set_weight( + self, + weight: Optional[_LGBM_WeightType], + ) -> "Dataset": """Set weight of each instance. Parameters @@ -2866,7 +3000,10 @@ def set_weight(self, weight: Optional[_LGBM_WeightType]) -> "Dataset": self.weight = self.get_field("weight") # original values can be modified at cpp side return self - def set_init_score(self, init_score: Optional[_LGBM_InitScoreType]) -> "Dataset": + def set_init_score( + self, + init_score: Optional[_LGBM_InitScoreType], + ) -> "Dataset": """Set init score of Booster to start from. Parameters @@ -2913,7 +3050,10 @@ def set_group(self, group: Optional[_LGBM_GroupType]) -> "Dataset": self.group = np.diff(constructed_group) return self - def set_position(self, position: Optional[_LGBM_PositionType]) -> "Dataset": + def set_position( + self, + position: Optional[_LGBM_PositionType], + ) -> "Dataset": """Set position of Dataset (used for ranking). Parameters @@ -3281,10 +3421,19 @@ def _dump_text(self, filename: Union[str, Path]) -> "Dataset": return self -_LGBM_CustomObjectiveFunction = Callable[[np.ndarray, Dataset], Tuple[np.ndarray, np.ndarray]] +_LGBM_CustomObjectiveFunction = Callable[ + [np.ndarray, Dataset], + Tuple[np.ndarray, np.ndarray], +] _LGBM_CustomEvalFunction = Union[ - Callable[[np.ndarray, Dataset], _LGBM_EvalFunctionResultType], - Callable[[np.ndarray, Dataset], List[_LGBM_EvalFunctionResultType]], + Callable[ + [np.ndarray, Dataset], + _LGBM_EvalFunctionResultType, + ], + Callable[ + [np.ndarray, Dataset], + List[_LGBM_EvalFunctionResultType], + ], ] @@ -3323,7 +3472,11 @@ def __init__( # Training task if not isinstance(train_set, Dataset): raise TypeError(f"Training data should be Dataset instance, met {type(train_set).__name__}") - params = _choose_param_value(main_param_name="machines", params=params, default_value=None) + params = _choose_param_value( + main_param_name="machines", + params=params, + default_value=None, + ) # if "machines" is given, assume user wants to do distributed learning, and set up network if params["machines"] is None: params.pop("machines", None) @@ -3338,7 +3491,9 @@ def __init__( raise ValueError("Invalid machines in params.") params = _choose_param_value( - main_param_name="num_machines", params=params, default_value=num_machines_from_machine_list + main_param_name="num_machines", + params=params, + default_value=num_machines_from_machine_list, ) params = _choose_param_value(main_param_name="local_listen_port", params=params, default_value=12400) self.set_network( @@ -4462,7 +4617,11 @@ def refit( nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) _safe_call(_LIB.LGBM_BoosterGetLinear(self._handle, ctypes.byref(out_is_linear))) - new_params = _choose_param_value(main_param_name="linear_tree", params=self.params, default_value=None) + new_params = _choose_param_value( + main_param_name="linear_tree", + params=self.params, + default_value=None, + ) new_params["linear_tree"] = bool(out_is_linear.value) new_params.update(dataset_params) train_set = Dataset( From d26009ee03a721aa63a55aadde65c0849a1d01aa Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:12:52 -0600 Subject: [PATCH 04/15] more hanging indents --- python-package/lightgbm/basic.py | 39 +++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 69d8de703f13..63d013f776ff 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3734,7 +3734,10 @@ def _get_node_index(tree: Dict[str, Any], tree_index: Optional[int]) -> str: node_num = tree.get("split_index" if is_split else "leaf_index", 0) return f"{tree_num}{node_type}{node_num}" - def _get_split_feature(tree: Dict[str, Any], feature_names: Optional[List[str]]) -> Optional[str]: + def _get_split_feature( + tree: Dict[str, Any], + feature_names: Optional[List[str]], + ) -> Optional[str]: if _is_split_node(tree): if feature_names is not None: feature_name = feature_names[tree["split_feature"]] @@ -3892,7 +3895,11 @@ def reset_parameter(self, params: Dict[str, Any]) -> "Booster": self.params.update(params) return self - def update(self, train_set: Optional[Dataset] = None, fobj: Optional[_LGBM_CustomObjectiveFunction] = None) -> bool: + def update( + self, + train_set: Optional[Dataset] = None, + fobj: Optional[_LGBM_CustomObjectiveFunction] = None, + ) -> bool: """Update Booster for one iteration. Parameters @@ -3954,7 +3961,11 @@ def update(self, train_set: Optional[Dataset] = None, fobj: Optional[_LGBM_Custo grad, hess = fobj(self.__inner_predict(0), self.train_set) return self.__boost(grad, hess) - def __boost(self, grad: np.ndarray, hess: np.ndarray) -> bool: + def __boost( + self, + grad: np.ndarray, + hess: np.ndarray, + ) -> bool: """Boost Booster for one iteration with customized gradient statistics. .. note:: @@ -4134,7 +4145,8 @@ def eval( return self.__inner_eval(name, data_idx, feval) def eval_train( - self, feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + self, + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None, ) -> List[_LGBM_BoosterEvalMethodResultType]: """Evaluate for training data. @@ -4167,7 +4179,8 @@ def eval_train( return self.__inner_eval(self._train_data_name, 0, feval) def eval_valid( - self, feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None + self, + feval: Optional[Union[_LGBM_CustomEvalFunction, List[_LGBM_CustomEvalFunction]]] = None, ) -> List[_LGBM_BoosterEvalMethodResultType]: """Evaluate for validation data. @@ -4299,7 +4312,10 @@ def model_from_string(self, model_str: str) -> "Booster": return self def model_to_string( - self, num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = "split" + self, + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = "split", ) -> str: """Save Booster to string. @@ -4612,7 +4628,10 @@ def refit( dataset_params = {} predictor = _InnerPredictor.from_booster(booster=self, pred_parameter=deepcopy(kwargs)) leaf_preds: np.ndarray = predictor.predict( # type: ignore[assignment] - data=data, start_iteration=-1, pred_leaf=True, validate_features=validate_features + data=data, + start_iteration=-1, + pred_leaf=True, + validate_features=validate_features, ) nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) @@ -4754,7 +4773,11 @@ def feature_name(self) -> List[str]: ) return [string_buffers[i].value.decode("utf-8") for i in range(num_feature)] - def feature_importance(self, importance_type: str = "split", iteration: Optional[int] = None) -> np.ndarray: + def feature_importance( + self, + importance_type: str = "split", + iteration: Optional[int] = None, + ) -> np.ndarray: """Get feature importances. Parameters From dd212965cf661cd3379ed78c20527aec6db73719 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:22:02 -0600 Subject: [PATCH 05/15] more hanging indents --- python-package/lightgbm/basic.py | 65 ++++++++++++++++++++++++----- python-package/lightgbm/callback.py | 6 ++- 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 63d013f776ff..2bf9ca955a77 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -195,7 +195,11 @@ def _has_method(logger: Any, method_name: str) -> bool: return callable(getattr(logger, method_name, None)) -def register_logger(logger: Any, info_method_name: str = "info", warning_method_name: str = "warning") -> None: +def register_logger( + logger: Any, + info_method_name: str = "info", + warning_method_name: str = "warning", +) -> None: """Register custom logger. Parameters @@ -338,7 +342,11 @@ def _is_1d_collection(data: Any) -> bool: return _is_numpy_1d_array(data) or _is_numpy_column_array(data) or _is_1d_list(data) or isinstance(data, pd_Series) -def _list_to_1d_numpy(data: Any, dtype: "np.typing.DTypeLike", name: str) -> np.ndarray: +def _list_to_1d_numpy( + data: Any, + dtype: "np.typing.DTypeLike", + name: str, +) -> np.ndarray: """Convert data to numpy 1-D array.""" if _is_numpy_1d_array(data): return _cast_numpy_array_to_dtype(data, dtype) @@ -433,7 +441,11 @@ def _export_arrow_to_c(data: pa_Table) -> _ArrowCArray: return _ArrowCArray(len(chunks), chunks, schema) -def _data_to_2d_numpy(data: Any, dtype: "np.typing.DTypeLike", name: str) -> np.ndarray: +def _data_to_2d_numpy( + data: Any, + dtype: "np.typing.DTypeLike", + name: str, +) -> np.ndarray: """Convert data to numpy 2-D array.""" if _is_numpy_2d_array(data): return _cast_numpy_array_to_dtype(data, dtype) @@ -818,7 +830,12 @@ def _data_from_pandas( df_dtypes.append(np.float32) target_dtype = np.result_type(*df_dtypes) - return (_pandas_to_numpy(data, target_dtype=target_dtype), feature_name, categorical_feature, pandas_categorical) + return ( + _pandas_to_numpy(data, target_dtype=target_dtype), + feature_name, + categorical_feature, + pandas_categorical, + ) def _dump_pandas_categorical( @@ -1415,7 +1432,11 @@ def __inner_predict_csr( return preds, nrow def __inner_predict_csr_sparse( - self, csr: scipy.sparse.csr_matrix, start_iteration: int, num_iteration: int, predict_type: int + self, + csr: scipy.sparse.csr_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int, ) -> Tuple[Union[List[scipy.sparse.csc_matrix], List[scipy.sparse.csr_matrix]], int]: ptr_indptr, type_ptr_indptr, __ = _c_int_array(csr.indptr) ptr_data, type_ptr_data, _ = _c_float_array(csr.data) @@ -1469,7 +1490,11 @@ def __inner_predict_csr_sparse( return matrices, nrow def __pred_for_csr( - self, csr: scipy.sparse.csr_matrix, start_iteration: int, num_iteration: int, predict_type: int + self, + csr: scipy.sparse.csr_matrix, + start_iteration: int, + num_iteration: int, + predict_type: int, ) -> Tuple[np.ndarray, int]: """Predict for a CSR data.""" if predict_type == _C_API_PREDICT_CONTRIB: @@ -1981,7 +2006,9 @@ def _set_init_score_by_predictor( num_data = self.num_data() if predictor is not None: init_score: Union[np.ndarray, scipy.sparse.spmatrix] = predictor.predict( - data=data, raw_score=True, data_has_header=data_has_header + data=data, + raw_score=True, + data_has_header=data_has_header, ) init_score = init_score.ravel() if used_indices is not None: @@ -3495,7 +3522,11 @@ def __init__( params=params, default_value=num_machines_from_machine_list, ) - params = _choose_param_value(main_param_name="local_listen_port", params=params, default_value=12400) + params = _choose_param_value( + main_param_name="local_listen_port", + params=params, + default_value=12400, + ) self.set_network( machines=machines, local_listen_port=params["local_listen_port"], @@ -3726,7 +3757,10 @@ def create_node_record( feature_names: Optional[List[str]] = None, parent_node: Optional[str] = None, ) -> Dict[str, Any]: - def _get_node_index(tree: Dict[str, Any], tree_index: Optional[int]) -> str: + def _get_node_index( + tree: Dict[str, Any], + tree_index: Optional[int], + ) -> str: tree_num = f"{tree_index}-" if tree_index is not None else "" is_split = _is_split_node(tree) node_type = "S" if is_split else "L" @@ -4260,7 +4294,11 @@ def save_model( _dump_pandas_categorical(self.pandas_categorical, filename) return self - def shuffle_models(self, start_iteration: int = 0, end_iteration: int = -1) -> "Booster": + def shuffle_models( + self, + start_iteration: int = 0, + end_iteration: int = -1, + ) -> "Booster": """Shuffle models. Parameters @@ -4945,7 +4983,12 @@ def __inner_predict(self, data_idx: int) -> np.ndarray: tmp_out_len = ctypes.c_int64(0) data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double)) # type: ignore[union-attr] _safe_call( - _LIB.LGBM_BoosterGetPredict(self._handle, ctypes.c_int(data_idx), ctypes.byref(tmp_out_len), data_ptr) + _LIB.LGBM_BoosterGetPredict( + self._handle, + ctypes.c_int(data_idx), + ctypes.byref(tmp_out_len), + data_ptr, + ) ) if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): # type: ignore[arg-type] raise ValueError(f"Wrong length of predict results for data {data_idx}") diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index a6683421325c..4254f1e76267 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -416,7 +416,11 @@ def __call__(self, env: CallbackEnv) -> None: eval_name_splitted = env.evaluation_result_list[i][1].split(" ") if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: continue # use only the first metric for early stopping - if self._is_train_set(ds_name=env.evaluation_result_list[i][0], eval_name=eval_name_splitted[0], env=env): + if self._is_train_set( + ds_name=env.evaluation_result_list[i][0], + eval_name=eval_name_splitted[0], + env=env, + ): continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train) elif env.iteration - self.best_iter[i] >= self.stopping_rounds: if self.verbose: From 423cf40874c33fb8e8150230078b2c4ba98bb9ac Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:27:09 -0600 Subject: [PATCH 06/15] more indentation --- python-package/lightgbm/dask.py | 57 +++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 333600ac7566..c5714c2fc1f8 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -521,8 +521,19 @@ def _train( # capture whether machines or its aliases were provided machines_in_params = any(alias in params for alias in _ConfigAliases.get("machines")) - params = _choose_param_value(main_param_name="tree_learner", params=params, default_value="data") - allowed_tree_learners = {"data", "data_parallel", "feature", "feature_parallel", "voting", "voting_parallel"} + params = _choose_param_value( + main_param_name="tree_learner", + params=params, + default_value="data", + ) + allowed_tree_learners = { + 'data', + 'data_parallel', + 'feature', + 'feature_parallel', + 'voting', + 'voting_parallel', + } if params["tree_learner"] not in allowed_tree_learners: _log_warning( f'Parameter tree_learner set to {params["tree_learner"]}, which is not allowed. Using "data" as default' @@ -720,10 +731,18 @@ def _train( # resolve aliases for network parameters and pop the result off params. # these values are added back in calls to `_train_part()` - params = _choose_param_value(main_param_name="local_listen_port", params=params, default_value=12400) + params = _choose_param_value( + main_param_name="local_listen_port", + params=params, + default_value=12400, + ) local_listen_port = params.pop("local_listen_port") - params = _choose_param_value(main_param_name="machines", params=params, default_value=None) + params = _choose_param_value( + main_param_name="machines", + params=params, + default_value=None, + ) machines = params.pop("machines") # figure out network params @@ -731,7 +750,10 @@ def _train( worker_addresses = worker_map.keys() if machines is not None: _log_info("Using passed-in 'machines' parameter") - worker_address_to_port = _machines_to_worker_map(machines=machines, worker_addresses=worker_addresses) + worker_address_to_port = _machines_to_worker_map( + machines=machines, + worker_addresses=worker_addresses, + ) else: if listen_port_in_params: _log_info("Using passed-in 'local_listen_port' for all workers") @@ -821,10 +843,20 @@ def _predict_part( result = np.array([]) elif pred_proba: result = model.predict_proba( - part, raw_score=raw_score, pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs + part, + raw_score=raw_score, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + **kwargs, ) else: - result = model.predict(part, raw_score=raw_score, pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs) + result = model.predict( + part, + raw_score=raw_score, + pred_leaf=pred_leaf, + pred_contrib=pred_contrib, + **kwargs, + ) # dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series if isinstance(part, pd_DataFrame): @@ -927,7 +959,9 @@ def _extract(items: List[Any], i: int) -> Any: for j, partition in enumerate(preds.to_delayed()): for i in range(num_classes): part = dask_array_from_delayed( - value=_extract(partition, i), shape=(nrows_per_chunk[j], num_cols), meta=pred_meta + value=_extract(partition, i), + shape=(nrows_per_chunk[j], num_cols), + meta=pred_meta, ) out[i].append(part) @@ -946,7 +980,9 @@ def _extract(items: List[Any], i: int) -> Any: for i in range(num_classes): out_arrays.append( dask_array_from_delayed( - value=delayed(concat_fn)(out[i]), shape=(data.shape[0], num_cols), meta=pred_meta + value=delayed(concat_fn)(out[i]), + shape=(data.shape[0], num_cols), + meta=pred_meta, ) ) @@ -1063,7 +1099,8 @@ def _lgb_dask_to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel: @staticmethod def _lgb_dask_copy_extra_params( - source: Union["_DaskLGBMModel", LGBMModel], dest: Union["_DaskLGBMModel", LGBMModel] + source: Union["_DaskLGBMModel", LGBMModel], + dest: Union["_DaskLGBMModel", LGBMModel], ) -> None: params = source.get_params() # type: ignore[union-attr] attributes = source.__dict__ From e4b377e5d9ec4098e65cf2ad7a9b9c44044348aa Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:31:03 -0600 Subject: [PATCH 07/15] revert more indentation changes --- python-package/lightgbm/dask.py | 12 +++--- python-package/lightgbm/engine.py | 62 +++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index c5714c2fc1f8..928fe51bddce 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -527,12 +527,12 @@ def _train( default_value="data", ) allowed_tree_learners = { - 'data', - 'data_parallel', - 'feature', - 'feature_parallel', - 'voting', - 'voting_parallel', + "data", + "data_parallel", + "feature", + "feature_parallel", + "voting", + "voting_parallel", } if params["tree_learner"] not in allowed_tree_learners: _log_warning( diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 0e4f2b0a5858..7bb3902bf59a 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -39,10 +39,16 @@ [np.ndarray, Dataset], _LGBM_EvalFunctionResultType, ], - Callable[[np.ndarray, Dataset], List[_LGBM_EvalFunctionResultType]], + Callable[ + [np.ndarray, Dataset], + List[_LGBM_EvalFunctionResultType], + ], ] -_LGBM_PreprocFunction = Callable[[Dataset, Dataset, Dict[str, Any]], Tuple[Dataset, Dataset, Dict[str, Any]]] +_LGBM_PreprocFunction = Callable[ + [Dataset, Dataset, Dict[str, Any]], + Tuple[Dataset, Dataset, Dict[str, Any]], +] def train( @@ -162,7 +168,11 @@ def train( # create predictor first params = copy.deepcopy(params) - params = _choose_param_value(main_param_name="objective", params=params, default_value=None) + params = _choose_param_value( + main_param_name="objective", + params=params, + default_value=None, + ) fobj: Optional[_LGBM_CustomObjectiveFunction] = None if callable(params["objective"]): fobj = params["objective"] @@ -173,7 +183,11 @@ def train( _log_warning(f"Found `{alias}` in params. Will use it instead of argument") params["num_iterations"] = num_boost_round # setting early stopping via global params should be possible - params = _choose_param_value(main_param_name="early_stopping_round", params=params, default_value=None) + params = _choose_param_value( + main_param_name="early_stopping_round", + params=params, + default_value=None, + ) if params["early_stopping_round"] is None: params.pop("early_stopping_round") first_metric_only = params.get("first_metric_only", False) @@ -227,10 +241,11 @@ def train( callback.early_stopping( stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] first_metric_only=first_metric_only, - verbose=_choose_param_value(main_param_name="verbosity", params=params, default_value=1).pop( - "verbosity" - ) - > 0, + verbose=_choose_param_value( + main_param_name="verbosity", + params=params, + default_value=1, + ).pop("verbosity")> 0, ) ) @@ -318,7 +333,10 @@ class CVBooster: The best iteration of fitted model. """ - def __init__(self, model_file: Optional[Union[str, Path]] = None): + def __init__( + self, + model_file: Optional[Union[str, Path]] = None, + ): """Initialize the CVBooster. Parameters @@ -386,7 +404,10 @@ def model_from_string(self, model_str: str) -> "CVBooster": return self def model_to_string( - self, num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = "split" + self, + num_iteration: Optional[int] = None, + start_iteration: int = 0, + importance_type: str = "split", ) -> str: """Save CVBooster to JSON string. @@ -671,7 +692,11 @@ def cv( raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.") params = copy.deepcopy(params) - params = _choose_param_value(main_param_name="objective", params=params, default_value=None) + params = _choose_param_value( + main_param_name="objective", + params=params, + default_value=None, + ) fobj: Optional[_LGBM_CustomObjectiveFunction] = None if callable(params["objective"]): fobj = params["objective"] @@ -682,7 +707,11 @@ def cv( num_boost_round = params.pop(alias) params["num_iterations"] = num_boost_round # setting early stopping via global params should be possible - params = _choose_param_value(main_param_name="early_stopping_round", params=params, default_value=None) + params = _choose_param_value( + main_param_name="early_stopping_round", + params=params, + default_value=None, + ) if params["early_stopping_round"] is None: params.pop("early_stopping_round") first_metric_only = params.get("first_metric_only", False) @@ -729,10 +758,11 @@ def cv( callback.early_stopping( stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] first_metric_only=first_metric_only, - verbose=_choose_param_value(main_param_name="verbosity", params=params, default_value=1).pop( - "verbosity" - ) - > 0, + verbose=_choose_param_value( + main_param_name="verbosity", + params=params, + default_value=1, + ).pop("verbosity") > 0, ) ) From e326820f7de5b9c96fbb7f6f04b4fad901196889 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:31:31 -0600 Subject: [PATCH 08/15] misc --- python-package/lightgbm/engine.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 7bb3902bf59a..0084c6e28544 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -184,10 +184,10 @@ def train( params["num_iterations"] = num_boost_round # setting early stopping via global params should be possible params = _choose_param_value( - main_param_name="early_stopping_round", - params=params, + main_param_name="early_stopping_round", + params=params, default_value=None, - ) + ) if params["early_stopping_round"] is None: params.pop("early_stopping_round") first_metric_only = params.get("first_metric_only", False) @@ -245,7 +245,8 @@ def train( main_param_name="verbosity", params=params, default_value=1, - ).pop("verbosity")> 0, + ).pop("verbosity") + > 0, ) ) @@ -711,7 +712,7 @@ def cv( main_param_name="early_stopping_round", params=params, default_value=None, - ) + ) if params["early_stopping_round"] is None: params.pop("early_stopping_round") first_metric_only = params.get("first_metric_only", False) @@ -762,7 +763,8 @@ def cv( main_param_name="verbosity", params=params, default_value=1, - ).pop("verbosity") > 0, + ).pop("verbosity") + > 0, ) ) From de5421122ead54fcba526ffd7f994af1a388efed Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:43:41 -0600 Subject: [PATCH 09/15] more indentation --- python-package/lightgbm/basic.py | 189 +++++++++++++++++++++++++------ 1 file changed, 156 insertions(+), 33 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 2bf9ca955a77..2ba4b6f1ef90 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -572,14 +572,24 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) - _safe_call(_LIB.LGBM_DumpParamAliases(ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), ptr_string_buffer)) + _safe_call( + _LIB.LGBM_DumpParamAliases( + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer, + ) + ) actual_len = tmp_out_len.value # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call( - _LIB.LGBM_DumpParamAliases(ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer) + _LIB.LGBM_DumpParamAliases( + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer, + ) ) return json.loads( string_buffer.value.decode("utf-8"), object_hook=lambda obj: {k: [k] + v for k, v in obj.items()} @@ -983,7 +993,12 @@ def __init__( self.pred_parameter = _param_dict_to_str(pred_parameter) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) + _safe_call( + _LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class), + ) + ) self.num_class = out_num_class.value @classmethod @@ -1002,7 +1017,12 @@ def from_booster( Other parameters for the prediction. """ out_cur_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(booster._handle, ctypes.byref(out_cur_iter))) + _safe_call( + _LIB.LGBM_BoosterGetCurrentIteration( + booster._handle, + ctypes.byref(out_cur_iter), + ) + ) return cls( booster_handle=booster._handle, pandas_categorical=booster.pandas_categorical, @@ -1029,7 +1049,9 @@ def from_model_file( out_num_iterations = ctypes.c_int(0) _safe_call( _LIB.LGBM_BoosterCreateFromModelfile( - _c_str(str(model_file)), ctypes.byref(out_num_iterations), ctypes.byref(booster_handle) + _c_str(str(model_file)), + ctypes.byref(out_num_iterations), + ctypes.byref(booster_handle), ) ) return cls( @@ -1374,7 +1396,11 @@ def __create_sparse_native( # free the temporary native indptr, indices, and data _safe_call( _LIB.LGBM_BoosterFreePredictSparse( - out_ptr_indptr, out_ptr_indices, out_ptr_data, ctypes.c_int(indptr_type), ctypes.c_int(data_type) + out_ptr_indptr, + out_ptr_indices, + out_ptr_data, + ctypes.c_int(indptr_type), + ctypes.c_int(data_type), ) ) if len(cs_output_matrices) == 1: @@ -1499,7 +1525,10 @@ def __pred_for_csr( """Predict for a CSR data.""" if predict_type == _C_API_PREDICT_CONTRIB: return self.__inner_predict_csr_sparse( - csr=csr, start_iteration=start_iteration, num_iteration=num_iteration, predict_type=predict_type + csr=csr, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type, ) nrow = len(csr.indptr) - 1 if nrow > _MAX_INT32: @@ -1702,7 +1731,12 @@ def current_iteration(self) -> int: The index of the current iteration. """ out_cur_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(self._handle, ctypes.byref(out_cur_iter))) + _safe_call( + _LIB.LGBM_BoosterGetCurrentIteration( + self._handle, + ctypes.byref(out_cur_iter), + ) + ) return out_cur_iter.value @@ -2108,7 +2142,10 @@ def _lazy_init( self._handle = ctypes.c_void_p() _safe_call( _LIB.LGBM_DatasetCreateFromFile( - _c_str(str(data)), _c_str(params_str), ref_dataset, ctypes.byref(self._handle) + _c_str(str(data)), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle), ) ) elif isinstance(data, scipy.sparse.csr_matrix): @@ -2648,7 +2685,12 @@ def save_binary(self, filename: Union[str, Path]) -> "Dataset": self : Dataset Returns self. """ - _safe_call(_LIB.LGBM_DatasetSaveBinary(self.construct()._handle, _c_str(str(filename)))) + _safe_call( + _LIB.LGBM_DatasetSaveBinary( + self.construct()._handle, + _c_str(str(filename)), + ) + ) return self def _update_params(self, params: Optional[Dict[str, Any]]) -> "Dataset": @@ -2667,7 +2709,8 @@ def update(): update() elif params is not None: ret = _LIB.LGBM_DatasetUpdateParamChecking( - _c_str(_param_dict_to_str(self.params)), _c_str(_param_dict_to_str(params)) + _c_str(_param_dict_to_str(self.params)), + _c_str(_param_dict_to_str(params)), ) if ret != 0: # could be updated if data is not freed @@ -2774,7 +2817,11 @@ def set_field( raise TypeError("Input type error for set_field") _safe_call( _LIB.LGBM_DatasetSetField( - self._handle, _c_str(field_name), ptr_data, ctypes.c_int(len(data)), ctypes.c_int(type_data) + self._handle, + _c_str(field_name), + ptr_data, + ctypes.c_int(len(data)), + ctypes.c_int(type_data), ) ) self.version += 1 @@ -2806,7 +2853,11 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: ret = ctypes.POINTER(ctypes.c_void_p)() _safe_call( _LIB.LGBM_DatasetGetField( - self._handle, _c_str(field_name), ctypes.byref(tmp_out_len), ctypes.byref(ret), ctypes.byref(out_type) + self._handle, + _c_str(field_name), + ctypes.byref(tmp_out_len), + ctypes.byref(ret), + ctypes.byref(out_type), ) ) if out_type.value != _FIELD_TYPE_MAPPER[field_name]: @@ -2815,15 +2866,18 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: return None if out_type.value == _C_API_DTYPE_INT32: arr = _cint32_array_to_numpy( - cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), length=tmp_out_len.value + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), + length=tmp_out_len.value, ) elif out_type.value == _C_API_DTYPE_FLOAT32: arr = _cfloat32_array_to_numpy( - cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), length=tmp_out_len.value + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), + length=tmp_out_len.value, ) elif out_type.value == _C_API_DTYPE_FLOAT64: arr = _cfloat64_array_to_numpy( - cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), length=tmp_out_len.value + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), + length=tmp_out_len.value, ) else: raise TypeError("Unknown type") @@ -2962,7 +3016,9 @@ def set_feature_name(self, feature_name: _LGBM_FeatureNameConfiguration) -> "Dat c_feature_name = [_c_str(name) for name in feature_name] _safe_call( _LIB.LGBM_DatasetSetFeatureNames( - self._handle, _c_array(ctypes.c_char_p, c_feature_name), ctypes.c_int(len(feature_name)) + self._handle, + _c_array(ctypes.c_char_p, c_feature_name), + ctypes.c_int(len(feature_name)), ) ) return self @@ -3049,7 +3105,10 @@ def set_init_score( self.init_score = self.get_field("init_score") # original values can be modified at cpp side return self - def set_group(self, group: Optional[_LGBM_GroupType]) -> "Dataset": + def set_group( + self, + group: Optional[_LGBM_GroupType], + ) -> "Dataset": """Set group size of Dataset (used for ranking). Parameters @@ -3261,7 +3320,12 @@ def num_data(self) -> int: """ if self._handle is not None: ret = ctypes.c_int(0) - _safe_call(_LIB.LGBM_DatasetGetNumData(self._handle, ctypes.byref(ret))) + _safe_call( + _LIB.LGBM_DatasetGetNumData( + self._handle, + ctypes.byref(ret), + ) + ) return ret.value else: raise LightGBMError("Cannot get num_data before construct dataset") @@ -3276,7 +3340,12 @@ def num_feature(self) -> int: """ if self._handle is not None: ret = ctypes.c_int(0) - _safe_call(_LIB.LGBM_DatasetGetNumFeature(self._handle, ctypes.byref(ret))) + _safe_call( + _LIB.LGBM_DatasetGetNumFeature( + self._handle, + ctypes.byref(ret), + ) + ) return ret.value else: raise LightGBMError("Cannot get num_feature before construct dataset") @@ -3302,7 +3371,13 @@ def feature_num_bin(self, feature: Union[int, str]) -> int: else: feature_index = feature ret = ctypes.c_int(0) - _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self._handle, ctypes.c_int(feature_index), ctypes.byref(ret))) + _safe_call( + _LIB.LGBM_DatasetGetFeatureNumBin( + self._handle, + ctypes.c_int(feature_index), + ctypes.byref(ret), + ) + ) return ret.value else: raise LightGBMError("Cannot get feature_num_bin before construct dataset") @@ -3444,7 +3519,12 @@ def _dump_text(self, filename: Union[str, Path]) -> "Dataset": self : Dataset Returns self. """ - _safe_call(_LIB.LGBM_DatasetDumpText(self.construct()._handle, _c_str(str(filename)))) + _safe_call( + _LIB.LGBM_DatasetDumpText( + self.construct()._handle, + _c_str(str(filename)), + ) + ) return self @@ -3538,7 +3618,13 @@ def __init__( # copy the parameters from train_set params.update(train_set.get_params()) params_str = _param_dict_to_str(params) - _safe_call(_LIB.LGBM_BoosterCreate(train_set._handle, _c_str(params_str), ctypes.byref(self._handle))) + _safe_call( + _LIB.LGBM_BoosterCreate( + train_set._handle, + _c_str(params_str), + ctypes.byref(self._handle), + ) + ) # save reference to data self.train_set = train_set self.valid_sets: List[Dataset] = [] @@ -3546,9 +3632,19 @@ def __init__( self.__num_dataset = 1 self.__init_predictor = train_set._predictor if self.__init_predictor is not None: - _safe_call(_LIB.LGBM_BoosterMerge(self._handle, self.__init_predictor._handle)) + _safe_call( + _LIB.LGBM_BoosterMerge( + self._handle, + self.__init_predictor._handle, + ) + ) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) + _safe_call( + _LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class), + ) + ) self.__num_class = out_num_class.value # buffer for inner predict self.__inner_predict_buffer: List[Optional[np.ndarray]] = [None] @@ -3561,11 +3657,18 @@ def __init__( out_num_iterations = ctypes.c_int(0) _safe_call( _LIB.LGBM_BoosterCreateFromModelfile( - _c_str(str(model_file)), ctypes.byref(out_num_iterations), ctypes.byref(self._handle) + _c_str(str(model_file)), + ctypes.byref(out_num_iterations), + ctypes.byref(self._handle), ) ) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) + _safe_call( + _LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class), + ) + ) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(file_name=model_file) if params: @@ -3614,7 +3717,9 @@ def __setstate__(self, state: Dict[str, Any]) -> None: out_num_iterations = ctypes.c_int(0) _safe_call( _LIB.LGBM_BoosterLoadModelFromString( - _c_str(model_str), ctypes.byref(out_num_iterations), ctypes.byref(handle) + _c_str(model_str), + ctypes.byref(out_num_iterations), + ctypes.byref(handle), ) ) state["_handle"] = handle @@ -3627,7 +3732,10 @@ def _get_loaded_param(self) -> Dict[str, Any]: ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call( _LIB.LGBM_BoosterGetLoadedParam( - self._handle, ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), ptr_string_buffer + self._handle, + ctypes.c_int64(buffer_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer, ) ) actual_len = tmp_out_len.value @@ -3637,7 +3745,10 @@ def _get_loaded_param(self) -> Dict[str, Any]: ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call( _LIB.LGBM_BoosterGetLoadedParam( - self._handle, ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), ptr_string_buffer + self._handle, + ctypes.c_int64(actual_len), + ctypes.byref(tmp_out_len), + ptr_string_buffer, ) ) return json.loads(string_buffer.value.decode("utf-8")) @@ -3979,7 +4090,12 @@ def update( if train_set._predictor is not self.__init_predictor: raise LightGBMError("Replace training data failed, " "you should use same predictor for these data") self.train_set = train_set - _safe_call(_LIB.LGBM_BoosterResetTrainingData(self._handle, self.train_set.construct()._handle)) + _safe_call( + _LIB.LGBM_BoosterResetTrainingData( + self._handle, + self.train_set.construct()._handle, + ) + ) self.__inner_predict_buffer[0] = None self.train_set_version = self.train_set.version is_finished = ctypes.c_int(0) @@ -4072,7 +4188,12 @@ def current_iteration(self) -> int: The index of the current iteration. """ out_cur_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(self._handle, ctypes.byref(out_cur_iter))) + _safe_call( + _LIB.LGBM_BoosterGetCurrentIteration( + self._handle, + ctypes.byref(out_cur_iter), + ) + ) return out_cur_iter.value def num_model_per_iteration(self) -> int: @@ -4340,7 +4461,9 @@ def model_from_string(self, model_str: str) -> "Booster": out_num_iterations = ctypes.c_int(0) _safe_call( _LIB.LGBM_BoosterLoadModelFromString( - _c_str(model_str), ctypes.byref(out_num_iterations), ctypes.byref(self._handle) + _c_str(model_str), + ctypes.byref(out_num_iterations), + ctypes.byref(self._handle), ) ) out_num_class = ctypes.c_int(0) From 01c56bcbbd5b6bf4bbf9ab33008c9f30e1d0d903 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 20:48:06 -0600 Subject: [PATCH 10/15] more indentation --- python-package/lightgbm/sklearn.py | 61 ++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index a6a7554bdb62..745da04dd2d4 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -53,28 +53,52 @@ ] _LGBM_ScikitMatrixLike = Union[ - dt_DataTable, List[Union[List[float], List[int]]], np.ndarray, pd_DataFrame, scipy.sparse.spmatrix + dt_DataTable, + List[Union[List[float], List[int]]], + np.ndarray, + pd_DataFrame, + scipy.sparse.spmatrix, ] _LGBM_ScikitCustomObjectiveFunction = Union[ # f(labels, preds) - Callable[[Optional[np.ndarray], np.ndarray], Tuple[np.ndarray, np.ndarray]], + Callable[ + [Optional[np.ndarray], np.ndarray], + Tuple[np.ndarray, np.ndarray], + ], # f(labels, preds, weights) - Callable[[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], Tuple[np.ndarray, np.ndarray]], + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + Tuple[np.ndarray, np.ndarray], + ], # f(labels, preds, weights, group) Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], Tuple[np.ndarray, np.ndarray] + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + Tuple[np.ndarray, np.ndarray], ], ] _LGBM_ScikitCustomEvalFunction = Union[ # f(labels, preds) - Callable[[Optional[np.ndarray], np.ndarray], _LGBM_EvalFunctionResultType], - Callable[[Optional[np.ndarray], np.ndarray], List[_LGBM_EvalFunctionResultType]], + Callable[ + [Optional[np.ndarray], np.ndarray], + _LGBM_EvalFunctionResultType, + ], + Callable[ + [Optional[np.ndarray], np.ndarray], + List[_LGBM_EvalFunctionResultType], + ], # f(labels, preds, weights) - Callable[[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], _LGBM_EvalFunctionResultType], - Callable[[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], List[_LGBM_EvalFunctionResultType]], + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + _LGBM_EvalFunctionResultType, + ], + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + List[_LGBM_EvalFunctionResultType], + ], # f(labels, preds, weights, group) Callable[ - [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], _LGBM_EvalFunctionResultType + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + _LGBM_EvalFunctionResultType, ], Callable[ [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], @@ -82,7 +106,9 @@ ], ] _LGBM_ScikitEvalMetricType = Union[ - str, _LGBM_ScikitCustomEvalFunction, List[Union[str, _LGBM_ScikitCustomEvalFunction]] + str, + _LGBM_ScikitCustomEvalFunction, + List[Union[str, _LGBM_ScikitCustomEvalFunction]], ] _LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType] @@ -163,7 +189,11 @@ def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction): """ self.func = func - def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]: + def __call__( + self, + preds: np.ndarray, + dataset: Dataset, + ) -> Tuple[np.ndarray, np.ndarray]: """Call passed function with appropriate arguments. Parameters @@ -953,7 +983,14 @@ def predict( # and then overwrite them (considering aliases) with params that were passed directly in prediction predict_params = self._process_params(stage="predict") for alias in _ConfigAliases.get_by_alias( - "data", "X", "raw_score", "start_iteration", "num_iteration", "pred_leaf", "pred_contrib", *kwargs.keys() + "data", + "X", + "raw_score", + "start_iteration", + "num_iteration", + "pred_leaf", + "pred_contrib", + *kwargs.keys(), ): predict_params.pop(alias, None) predict_params.update(kwargs) From 88c121a0ba5d51477d2f2e3f79cc604271a56d38 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 21:05:37 -0600 Subject: [PATCH 11/15] more indentation --- python-package/lightgbm/basic.py | 93 +++++++++++++++++++++++++----- python-package/lightgbm/sklearn.py | 4 +- 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 2ba4b6f1ef90..c13927c5f14b 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3429,7 +3429,12 @@ def add_features_from(self, other: "Dataset") -> "Dataset": """ if self._handle is None or other._handle is None: raise ValueError("Both source and target Datasets must be constructed before adding features") - _safe_call(_LIB.LGBM_DatasetAddFeaturesFrom(self._handle, other._handle)) + _safe_call( + _LIB.LGBM_DatasetAddFeaturesFrom( + self._handle, + other._handle, + ) + ) was_none = self.data is None old_self_data_type = type(self.data).__name__ if other.data is None: @@ -4013,7 +4018,12 @@ def add_valid(self, data: Dataset, name: str) -> "Booster": raise TypeError(f"Validation data should be Dataset instance, met {type(data).__name__}") if data._predictor is not self.__init_predictor: raise LightGBMError("Add validation data failed, " "you should use same predictor for these data") - _safe_call(_LIB.LGBM_BoosterAddValidData(self._handle, data.construct()._handle)) + _safe_call( + _LIB.LGBM_BoosterAddValidData( + self._handle, + data.construct()._handle, + ) + ) self.valid_sets.append(data) self.name_valid_sets.append(name) self.__num_dataset += 1 @@ -4036,7 +4046,12 @@ def reset_parameter(self, params: Dict[str, Any]) -> "Booster": """ params_str = _param_dict_to_str(params) if params_str: - _safe_call(_LIB.LGBM_BoosterResetParameter(self._handle, _c_str(params_str))) + _safe_call( + _LIB.LGBM_BoosterResetParameter( + self._handle, + _c_str(params_str), + ) + ) self.params.update(params) return self @@ -4102,7 +4117,12 @@ def update( if fobj is None: if self.__set_objective_to_none: raise LightGBMError("Cannot update due to null objective function.") - _safe_call(_LIB.LGBM_BoosterUpdateOneIter(self._handle, ctypes.byref(is_finished))) + _safe_call( + _LIB.LGBM_BoosterUpdateOneIter( + self._handle, + ctypes.byref(is_finished), + ) + ) self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] return is_finished.value == 1 else: @@ -4205,7 +4225,12 @@ def num_model_per_iteration(self) -> int: The number of models per iteration. """ model_per_iter = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterNumModelPerIteration(self._handle, ctypes.byref(model_per_iter))) + _safe_call( + _LIB.LGBM_BoosterNumModelPerIteration( + self._handle, + ctypes.byref(model_per_iter), + ) + ) return model_per_iter.value def num_trees(self) -> int: @@ -4229,7 +4254,12 @@ def upper_bound(self) -> float: Upper bound value of the model. """ ret = ctypes.c_double(0) - _safe_call(_LIB.LGBM_BoosterGetUpperBoundValue(self._handle, ctypes.byref(ret))) + _safe_call( + _LIB.LGBM_BoosterGetUpperBoundValue( + self._handle, + ctypes.byref(ret), + ) + ) return ret.value def lower_bound(self) -> float: @@ -4241,7 +4271,12 @@ def lower_bound(self) -> float: Lower bound value of the model. """ ret = ctypes.c_double(0) - _safe_call(_LIB.LGBM_BoosterGetLowerBoundValue(self._handle, ctypes.byref(ret))) + _safe_call( + _LIB.LGBM_BoosterGetLowerBoundValue( + self._handle, + ctypes.byref(ret), + ) + ) return ret.value def eval( @@ -4467,7 +4502,12 @@ def model_from_string(self, model_str: str) -> "Booster": ) ) out_num_class = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumClasses(self._handle, ctypes.byref(out_num_class))) + _safe_call( + _LIB.LGBM_BoosterGetNumClasses( + self._handle, + ctypes.byref(out_num_class), + ) + ) self.__num_class = out_num_class.value self.pandas_categorical = _load_pandas_categorical(model_str=model_str) return self @@ -4796,7 +4836,12 @@ def refit( ) nrow, ncol = leaf_preds.shape out_is_linear = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetLinear(self._handle, ctypes.byref(out_is_linear))) + _safe_call( + _LIB.LGBM_BoosterGetLinear( + self._handle, + ctypes.byref(out_is_linear), + ) + ) new_params = _choose_param_value( main_param_name="linear_tree", params=self.params, @@ -4819,10 +4864,22 @@ def refit( new_params["refit_decay_rate"] = decay_rate new_booster = Booster(new_params, train_set) # Copy models - _safe_call(_LIB.LGBM_BoosterMerge(new_booster._handle, predictor._handle)) + _safe_call( + _LIB.LGBM_BoosterMerge( + new_booster._handle, + predictor._handle, + ) + ) leaf_preds = leaf_preds.reshape(-1) ptr_data, _, _ = _c_int_array(leaf_preds) - _safe_call(_LIB.LGBM_BoosterRefit(new_booster._handle, ptr_data, ctypes.c_int32(nrow), ctypes.c_int32(ncol))) + _safe_call( + _LIB.LGBM_BoosterRefit( + new_booster._handle, + ptr_data, + ctypes.c_int32(nrow), + ctypes.c_int32(ncol), + ) + ) new_booster._network = self._network return new_booster @@ -4887,7 +4944,12 @@ def num_feature(self) -> int: The number of features. """ out_num_feature = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterGetNumFeature(self._handle, ctypes.byref(out_num_feature))) + _safe_call( + _LIB.LGBM_BoosterGetNumFeature( + self._handle, + ctypes.byref(out_num_feature), + ) + ) return out_num_feature.value def feature_name(self) -> List[str]: @@ -5128,7 +5190,12 @@ def __get_eval_info(self) -> None: self.__need_reload_eval_info = False out_num_eval = ctypes.c_int(0) # Get num of inner evals - _safe_call(_LIB.LGBM_BoosterGetEvalCounts(self._handle, ctypes.byref(out_num_eval))) + _safe_call( + _LIB.LGBM_BoosterGetEvalCounts( + self._handle, + ctypes.byref(out_num_eval), + ) + ) self.__num_inner_eval = out_num_eval.value if self.__num_inner_eval > 0: # Get name of eval metrics diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 745da04dd2d4..5e0d51f4546d 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -273,7 +273,9 @@ def __init__(self, func: _LGBM_ScikitCustomEvalFunction): self.func = func def __call__( - self, preds: np.ndarray, dataset: Dataset + self, + preds: np.ndarray, + dataset: Dataset, ) -> Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]: """Call passed function with appropriate arguments. From 638621f77275becfc01e9ef241663ea0795c69b5 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 21:23:17 -0600 Subject: [PATCH 12/15] more indentation --- python-package/lightgbm/basic.py | 37 ++++++++++++++++++++++++----- python-package/lightgbm/callback.py | 8 +++++-- python-package/lightgbm/engine.py | 10 ++++++-- python-package/lightgbm/libpath.py | 4 +--- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index c13927c5f14b..476d19e9cfeb 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -4242,7 +4242,12 @@ def num_trees(self) -> int: The number of weak sub-models. """ num_trees = ctypes.c_int(0) - _safe_call(_LIB.LGBM_BoosterNumberOfTotalModel(self._handle, ctypes.byref(num_trees))) + _safe_call( + _LIB.LGBM_BoosterNumberOfTotalModel( + self._handle, + ctypes.byref(num_trees), + ) + ) return num_trees.value def upper_bound(self) -> float: @@ -4471,7 +4476,11 @@ def shuffle_models( Booster with shuffled models. """ _safe_call( - _LIB.LGBM_BoosterShuffleModels(self._handle, ctypes.c_int(start_iteration), ctypes.c_int(end_iteration)) + _LIB.LGBM_BoosterShuffleModels( + self._handle, + ctypes.c_int(start_iteration), + ctypes.c_int(end_iteration), + ) ) return self @@ -4647,7 +4656,12 @@ def dump_model( ) ) ret = json.loads(string_buffer.value.decode("utf-8"), object_hook=object_hook) - ret["pandas_categorical"] = json.loads(json.dumps(self.pandas_categorical, default=_json_default_with_numpy)) + ret["pandas_categorical"] = json.loads( + json.dumps( + self.pandas_categorical, + default=_json_default_with_numpy, + ) + ) return ret def predict( @@ -4900,7 +4914,12 @@ def get_leaf_output(self, tree_id: int, leaf_id: int) -> float: """ ret = ctypes.c_double(0) _safe_call( - _LIB.LGBM_BoosterGetLeafValue(self._handle, ctypes.c_int(tree_id), ctypes.c_int(leaf_id), ctypes.byref(ret)) + _LIB.LGBM_BoosterGetLeafValue( + self._handle, + ctypes.c_int(tree_id), + ctypes.c_int(leaf_id), + ctypes.byref(ret), + ) ) return ret.value @@ -4930,7 +4949,10 @@ def set_leaf_output( """ _safe_call( _LIB.LGBM_BoosterSetLeafValue( - self._handle, ctypes.c_int(tree_id), ctypes.c_int(leaf_id), ctypes.c_double(value) + self._handle, + ctypes.c_int(tree_id), + ctypes.c_int(leaf_id), + ctypes.c_double(value), ) ) return self @@ -5037,7 +5059,10 @@ def feature_importance( return result def get_split_value_histogram( - self, feature: Union[int, str], bins: Optional[Union[int, str]] = None, xgboost_style: bool = False + self, + feature: Union[int, str], + bins: Optional[Union[int, str]] = None, + xgboost_style: bool = False, ) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray, pd_DataFrame]: """Get split value histogram for the specified feature. diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 4254f1e76267..5b06eaec5690 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -26,9 +26,13 @@ ] _EvalResultDict = Dict[str, Dict[str, List[Any]]] -_EvalResultTuple = Union[_LGBM_BoosterEvalMethodResultType, _LGBM_BoosterEvalMethodResultWithStandardDeviationType] +_EvalResultTuple = Union[ + _LGBM_BoosterEvalMethodResultType, + _LGBM_BoosterEvalMethodResultWithStandardDeviationType, +] _ListOfEvalResultTuples = Union[ - List[_LGBM_BoosterEvalMethodResultType], List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType] + List[_LGBM_BoosterEvalMethodResultType], + List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType], ] diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 0084c6e28544..561349a44146 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -718,9 +718,15 @@ def cv( first_metric_only = params.get("first_metric_only", False) if isinstance(init_model, (str, Path)): - predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params) + predictor = _InnerPredictor.from_model_file( + model_file=init_model, + pred_parameter=params, + ) elif isinstance(init_model, Booster): - predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params)) + predictor = _InnerPredictor.from_booster( + booster=init_model, + pred_parameter=dict(init_model.params, **params), + ) else: predictor = None diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py index 49ec74d2f9b2..5dcd4fc2a4da 100644 --- a/python-package/lightgbm/libpath.py +++ b/python-package/lightgbm/libpath.py @@ -16,13 +16,11 @@ def find_lib_path() -> List[str]: List of all found library paths to LightGBM. """ curr_path = Path(__file__).absolute() - # fmt: off dll_path = [ curr_path.parents[1], curr_path.parents[0] / "bin", - curr_path.parents[0] / "lib" + curr_path.parents[0] / "lib", ] - # fmt: on if system() in ("Windows", "Microsoft"): dll_path.append(curr_path.parents[1] / "Release") dll_path.append(curr_path.parents[1] / "windows" / "x64" / "DLL") From cb6973bfc1332854f665372002ad9e2e5f42736c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 21 Feb 2024 21:25:36 -0600 Subject: [PATCH 13/15] move type-ignore comment back to the correct line --- python-package/lightgbm/basic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 476d19e9cfeb..fced1b0dc4e0 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2771,8 +2771,8 @@ def set_field( data = pa_chunked_array( [ chunk - for array in data.columns - for chunk in array.chunks # type: ignore + for array in data.columns # type: ignore + for chunk in array.chunks ] ) From 5a0b5475d0a94265296593af9ee148375902cd96 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 24 Feb 2024 01:07:42 -0600 Subject: [PATCH 14/15] formatting --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 04547abd70af..42f8c8d7d18c 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2803,7 +2803,7 @@ def set_field( "In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame." ) else: - if field_name in {'group', 'position'}: + if field_name in {"group", "position"}: dtype = np.int32 else: dtype = np.float32 From 25631586bc4f13782d9b2622e347e7c8d37a0aa3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 26 Feb 2024 15:44:44 -0600 Subject: [PATCH 15/15] change EarlyStoppingCallback indentation --- python-package/lightgbm/callback.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 5b06eaec5690..5947796dcb3f 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -478,5 +478,8 @@ def early_stopping( The callback that activates early stopping. """ return _EarlyStoppingCallback( - stopping_rounds=stopping_rounds, first_metric_only=first_metric_only, verbose=verbose, min_delta=min_delta + stopping_rounds=stopping_rounds, + first_metric_only=first_metric_only, + verbose=verbose, + min_delta=min_delta, )