Skip to content

Commit

Permalink
remove the dependence on Bitset of CTR, no SplitCategorical is needed
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Nov 14, 2019
1 parent 4d38d40 commit cb6e9f4
Show file tree
Hide file tree
Showing 17 changed files with 445 additions and 138 deletions.
31 changes: 29 additions & 2 deletions include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,33 @@ class BinMapper {
}
}

inline void set_ctr_info(int real_cat_fid, const std::vector<double>& ctr_values, const std::vector<int>& seen_cat_value) {
/*!
 * \brief Mark this bin mapper as a CTR feature and build its category -> CTR value lookup table
 * \param real_cat_fid Stored as ctr_info_.real_cat_fid
 * \param ctr_values CTR values, indexed by the bin that cat_bin_mapper assigns to a category
 * \param cat_bin_mapper Bin mapper used to list the seen categories and to map a category value to its bin
 *
 * Besides one entry per seen category, ctr_value_map receives three special keys:
 *   -1 : value taken from the bin returned by ValueToBin(-1) (NaN when missing_type_ is NaN)
 *   -2 : NaN when missing_type_ is NaN, otherwise 0.0
 *    0 : 0.0, only if category 0 has no entry yet
 *
 * NOTE(review): this hunk appears to show both sides of the diff. The assignment from
 * `seen_cat_value` below refers to a name that is not a parameter of this signature and is
 * immediately overwritten by the following line -- confirm it is the removed line.
 */
inline void set_ctr_info(const int real_cat_fid, const std::vector<double>& ctr_values, const std::unique_ptr<BinMapper>& cat_bin_mapper) {
ctr_info_.is_ctr = true;
ctr_info_.real_cat_fid = real_cat_fid;
ctr_info_.ctr_values = ctr_values;
ctr_info_.seen_cat_values = seen_cat_value;
ctr_info_.seen_cat_values = cat_bin_mapper->GetSeenCategories();
// translate every seen category into its bin, then copy that bin's CTR value into the lookup table
for(int cat_value : ctr_info_.seen_cat_values) {
uint32_t cat_bin = cat_bin_mapper->ValueToBin(cat_value);
double ctr_value = ctr_info_.ctr_values[cat_bin];
ctr_info_.ctr_value_map[cat_value] = ctr_value;
}
// the bin that category -1 maps to is required to be the last slot of ctr_values
uint32_t cat_bin = cat_bin_mapper->ValueToBin(-1);
//CHECK(missing_type_ != MissingType::NaN);
// NOTE(review): size() - 1 wraps around if ctr_values is empty -- confirm callers never pass an empty vector
CHECK(cat_bin == ctr_info_.ctr_values.size() - 1);
double ctr_value = missing_type_ == MissingType::NaN ? std::numeric_limits<double>::quiet_NaN() : ctr_info_.ctr_values[cat_bin];
//-1 for unseen values
ctr_info_.ctr_value_map[-1] = ctr_value;
//-2 for NaN
if(missing_type_ != MissingType::NaN) {
ctr_info_.ctr_value_map[-2] = 0.0;
}
else {
ctr_info_.ctr_value_map[-2] = std::numeric_limits<double>::quiet_NaN();
}
//handle zero: make sure key 0 always has an entry, defaulting to 0.0
if(ctr_info_.ctr_value_map.count(0) == 0) {
ctr_info_.ctr_value_map[0] = 0.0;
}
}

inline bool is_ctr() const {
Expand All @@ -222,6 +244,10 @@ class BinMapper {

std::vector<int> GetSeenCategories() const;

/*!
 * \brief Get the lookup table stored in ctr_info_.ctr_value_map
 * \return A copy of the table (returned by value)
 */
std::unordered_map<int, double> GetCTRInformation() const {
  const std::unordered_map<int, double>& value_map = ctr_info_.ctr_value_map;
  return value_map;
}

private:
/*! \brief Number of bins */
int num_bin_;
Expand Down Expand Up @@ -250,6 +276,7 @@ class BinMapper {
int real_cat_fid;
std::vector<double> ctr_values;
std::vector<int> seen_cat_values;
std::unordered_map<int, double> ctr_value_map;

CTRInfo() {
is_ctr = false;
Expand Down
3 changes: 3 additions & 0 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,9 @@ class LIGHTGBM_EXPORT Boosting {
* \return The boosting object
*/
static Boosting* CreateBoosting(const std::string& type, const char* filename);

virtual void FillInCTRValues(std::unordered_map<int, double>& map) const = 0;
virtual void FillInCTRValues(double* values) const = 0;
};

class GBDTBase : public Boosting {
Expand Down
79 changes: 69 additions & 10 deletions include/LightGBM/ctr_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,35 @@ class CTRProvider {

void PushCTRData(const int cat_fid, const int tid, const int real_index, const double value, Dataset* data, bool is_valid) const;

std::unordered_map<int, std::unordered_map<int, double>> GetCTRInformation(
const std::function<const BinMapper*(int)>& feature_index_to_bin_mapper, const int num_total_features) const;

private:
BinMapper* ConstructCTRBinMapper(const std::vector<double>& sample_values_one_column,
void ConstructCTRBinMapper(const std::vector<double>& sample_values_one_column,
const std::vector<int>& sample_indices_one_column,
const std::vector<double>& sample_labels,
const std::unique_ptr<BinMapper>& bin_mapper,
const int num_sample_data,
const std::vector<int>& all_sample_indices,
const int real_cat_fid,
std::vector<std::vector<double>>& out_fold_ctr_values,
std::vector<double>& out_ctr_values);
std::vector<double>& out_ctr_values,
std::vector<double>& out_count_values,
std::unique_ptr<BinMapper>& out_ctr_bin_mapper,
std::unique_ptr<BinMapper>& out_count_bin_mapper);

BinMapper* ConstructCTRBinMapperParallel(const std::vector<double>& sample_values_one_column,
void ConstructCTRBinMapperParallel(const std::vector<double>& sample_values_one_column,
const std::vector<int>& sample_indices_one_column,
const std::vector<double>& sample_labels,
const std::unique_ptr<BinMapper>& bin_mapper,
const int num_sample_data,
const std::vector<int>& all_sample_indices,
const int real_cat_fid,
std::vector<std::vector<double>>& out_fold_ctr_values,
std::vector<double>& out_ctr_values);
std::vector<double>& out_ctr_values,
std::vector<double>& out_count_values,
std::unique_ptr<BinMapper>& out_ctr_bin_mapper,
std::unique_ptr<BinMapper>& out_count_bin_mapper);

void GetCTRMetaInfo(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const int num_total_features,
Expand All @@ -58,6 +67,8 @@ class CTRProvider {

void GenRandomFoldPartition(const int num_sample_data, const std::vector<int>& all_sample_indices);

void RemoveCategoricalBinMappers(std::vector<std::vector<int>>& sample_indices, std::vector<std::unique_ptr<BinMapper>>& bin_mappers);

const int max_bin_;
const int min_data_in_bin_;
const bool use_missing_;
Expand All @@ -75,40 +86,88 @@ class CTRProvider {
std::vector<int> cat_fids_;
std::unordered_map<int, int> cat_fid_2_ctr_fid_;
std::unordered_map<int, int> ctr_fid_2_cat_fid_;
std::unordered_map<int, int> cat_fid_2_count_fid_;
std::unordered_map<int, int> count_fid_2_cat_fid_;
std::unordered_map<int, int> cat_fid_2_inner_cat_fid_;
std::vector<std::vector<std::vector<double>>> fold_ctr_values_;
std::vector<std::vector<double>> ctr_values_;
std::vector<std::vector<double>> count_values_;
std::vector<int> fold_ids_;
std::vector<std::unique_ptr<BinMapper>> categorical_bin_mappers_;
};

/*!
 * \brief Push the CTR value and the count value derived from one categorical value into a dataset
 * \param cat_fid Index of the categorical feature; nothing happens if it has no entry in cat_fid_2_ctr_fid_
 * \param tid Forwarded unchanged to Dataset::PushOneData
 * \param real_index Row index; also used to look up the row's fold in fold_ids_
 * \param value Raw categorical value of the row
 * \param data Dataset that receives the values
 * \param is_valid When false the per-fold table fold_ctr_values_ is used, when true the table ctr_values_
 *        (presumably false = training rows, true = validation rows -- confirm against callers)
 *
 * NOTE(review): this hunk appears to show both sides of the diff for a few lines
 * (e.g. `ctr_value = NaN;` next to the quiet_NaN() assignment, and the group_id/sub_feature_id
 * push next to the ctr_group_id/ctr_sub_feature_id push). Confirm which lines are live.
 */
inline void CTRProvider::PushCTRData(const int cat_fid, const int tid, const int real_index, const double value, Dataset* data, bool is_valid) const {
if(cat_fid_2_ctr_fid_.count(cat_fid)) {
if(!is_valid) {
const BinMapper* cat_bin_mapper = data->FeatureBinMapper(data->InnerFeatureIndex(cat_fid));
CHECK(cat_bin_mapper->bin_type() == BinType::CategoricalBin);
const bool value_seen_in_train = cat_bin_mapper->HasValueInCat(value);
CHECK(value_seen_in_train == true);
// despite its name, cat_value holds the bin index returned by ValueToBin
const int cat_value = static_cast<int>(cat_bin_mapper->ValueToBin(value));
const int fold_id = fold_ids_[real_index];
const int inner_cat_fid = cat_fid_2_inner_cat_fid_.at(cat_fid);
// CTR comes from the table of the fold this row belongs to; the count table is not split by fold
double ctr_value = fold_ctr_values_[inner_cat_fid][fold_id][cat_value];
double count_value = count_values_[inner_cat_fid][cat_value];

const int ctr_fid = cat_fid_2_ctr_fid_.at(cat_fid);
const int inner_ctr_fid = data->used_feature_map_[ctr_fid];
const int count_fid = cat_fid_2_count_fid_.at(cat_fid);
const int inner_count_fid = data->used_feature_map_[count_fid];
// NOTE(review): both mappers are fetched and dereferenced before the `>= 0` guards further down.
// If used_feature_map_ can hold a negative index for an unused feature, these lookups run with
// that negative index -- confirm, and consider moving them inside the guards.
const BinMapper* ctr_bin_mapper = data->FeatureBinMapper(inner_ctr_fid);
const BinMapper* count_bin_mapper = data->FeatureBinMapper(inner_count_fid);

CHECK(ctr_bin_mapper->missing_type() == cat_bin_mapper->missing_type());
CHECK(ctr_bin_mapper->missing_type() == count_bin_mapper->missing_type());
// with NaN missing handling, the last categorical bin is turned into NaN for both derived features
if(cat_bin_mapper->missing_type() == MissingType::NaN) {
if(cat_value == cat_bin_mapper->num_bin() - 1) {
ctr_value = NaN;
ctr_value = std::numeric_limits<double>::quiet_NaN();
count_value = std::numeric_limits<double>::quiet_NaN();
}
}

// only push into features that have a non-negative inner index
if(inner_ctr_fid >= 0) {
const int group_id = data->feature2group_[inner_ctr_fid];
const int sub_feature_id = data->feature2subfeature_[inner_ctr_fid];
data->PushOneData(tid, static_cast<data_size_t>(real_index), group_id, sub_feature_id, ctr_value);
const int ctr_group_id = data->feature2group_[inner_ctr_fid];
const int ctr_sub_feature_id = data->feature2subfeature_[inner_ctr_fid];
data->PushOneData(tid, static_cast<data_size_t>(real_index), ctr_group_id, ctr_sub_feature_id, ctr_value);
}

if(inner_count_fid >= 0) {
const int count_group_id = data->feature2group_[inner_count_fid];
const int count_sub_feature_id = data->feature2subfeature_[inner_count_fid];
data->PushOneData(tid, static_cast<data_size_t>(real_index), count_group_id, count_sub_feature_id, count_value);
}
}
else {
// is_valid == true: same steps as above, but the CTR value is read from ctr_values_
// (no fold lookup) and there is no HasValueInCat check on the raw value
const BinMapper* cat_bin_mapper = data->FeatureBinMapper(data->InnerFeatureIndex(cat_fid));
CHECK(cat_bin_mapper->bin_type() == BinType::CategoricalBin);
const int cat_bin = static_cast<int>(cat_bin_mapper->ValueToBin(value));
const int inner_cat_fid = cat_fid_2_inner_cat_fid_.at(cat_fid);
double ctr_value = ctr_values_[inner_cat_fid][cat_bin];
double count_value = count_values_[inner_cat_fid][cat_bin];
const int ctr_fid = cat_fid_2_ctr_fid_.at(cat_fid);
const int inner_ctr_fid = data->used_feature_map_[ctr_fid];
const int count_fid = cat_fid_2_count_fid_.at(cat_fid);
const int inner_count_fid = data->used_feature_map_[count_fid];
// NOTE(review): same ordering concern as in the branch above -- the mappers are used before the `>= 0` guards
const BinMapper* ctr_bin_mapper = data->FeatureBinMapper(inner_ctr_fid);
const BinMapper* count_bin_mapper = data->FeatureBinMapper(inner_count_fid);

CHECK(ctr_bin_mapper->missing_type() == cat_bin_mapper->missing_type());
CHECK(count_bin_mapper->missing_type() == cat_bin_mapper->missing_type());
if(cat_bin_mapper->missing_type() == MissingType::NaN) {
if(cat_bin == cat_bin_mapper->num_bin() - 1) {
ctr_value = std::numeric_limits<double>::quiet_NaN();
count_value = std::numeric_limits<double>::quiet_NaN();
}
}
if(inner_ctr_fid >= 0) {
const int ctr_group_id = data->feature2group_[inner_ctr_fid];
const int ctr_sub_feature_id = data->feature2subfeature_[inner_ctr_fid];
data->PushOneData(tid, static_cast<data_size_t>(real_index), ctr_group_id, ctr_sub_feature_id, ctr_value);
}
if(inner_count_fid >= 0) {
const int count_group_id = data->feature2group_[inner_count_fid];
const int count_sub_feature_id = data->feature2subfeature_[inner_count_fid];
data->PushOneData(tid, static_cast<data_size_t>(real_index), count_group_id, count_sub_feature_id, count_value);
}
}
}
}

Expand Down
8 changes: 6 additions & 2 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,8 @@ class Dataset {
inline std::vector<int> ValidFeatureIndices() const {
std::vector<int> ret;
for (int i = 0; i < num_total_features_; ++i) {
if (used_feature_map_[i] >= 0) {
//skip categorical features, since using CTR
if (used_feature_map_[i] >= 0 && FeatureBinMapper(used_feature_map_[i])->bin_type() != BinType::CategoricalBin) {
ret.push_back(i);
}
}
Expand Down Expand Up @@ -612,6 +613,8 @@ class Dataset {

void addFeaturesFrom(Dataset* other);

const std::unordered_map<int, std::unordered_map<int, double>>& GetCTRInformation() const;

private:
std::string data_filename_;
/*! \brief Store used features */
Expand Down Expand Up @@ -651,7 +654,8 @@ class Dataset {
bool use_missing_;
bool zero_as_missing_;
/*! \brief CTR provider, responsible for CTR calculation, and CTR bin mapping */
CTRProvider* ctr_provider_;
CTRProvider* ctr_provider_ = nullptr;
std::unordered_map<int, std::unordered_map<int, double>> ctr_information_;
};

} // namespace LightGBM
Expand Down
32 changes: 3 additions & 29 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,32 +87,6 @@ class Tree {
const uint32_t* threshold, int num_threshold, double left_value, double right_value,
int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type);

/*!
* \brief Performing a split on tree leaves, with categorical feature
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param real_feature Index of feature, the original index on data
* \param threshold_bin Threshold(bin) of split, use bitset to represent
* \param num_threshold_bin size of threshold_bin
* \param threshold Thresholds of real feature value, use bitset to represent
* \param num_threshold size of threshold
* \param seen_categories Categories met in training data
* \param num_seen_categories Number of seen categories
* \param left_value Model Left child output
* \param right_value Model Right child output
* \param left_cnt Count of left child
* \param right_cnt Count of right child
* \param left_weight Weight of left child
* \param right_weight Weight of right child
* \param gain Split gain
* \param missing_type missing type of the categorical feature
* \param missing_to_left whether missing value is split to left or right
* \return The index of new leaf.
*/
int SplitCTR(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin,
const uint32_t* threshold, int num_threshold, double left_value, double right_value,
int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type, bool missing_to_left);

/*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }

Expand Down Expand Up @@ -351,18 +325,18 @@ class Tree {
}

/*!
 * \brief Pick the decision routine for a raw feature value at the given node
 *
 * Dispatch is driven by the flags in decision_type_[node]: CategoricalDecision when
 * kCategoricalMask is set, NumericalDecision otherwise.
 *
 * NOTE(review): this hunk appears to show both sides of the diff. The first kCTRMask test
 * and the first `else if` line look like the removed lines; in the added form the
 * kCTRMask / CTRDecision branch is wrapped in a block comment -- confirm.
 */
inline int Decision(double fval, int node) const {
if(GetDecisionType(decision_type_[node], kCTRMask)) {
/*if(GetDecisionType(decision_type_[node], kCTRMask)) {
return CTRDecision(fval, node);
}
else if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
else */if (GetDecisionType(decision_type_[node], kCategoricalMask)) {
return CategoricalDecision(fval, node);
} else {
return NumericalDecision(fval, node);
}
}

inline int DecisionInner(uint32_t fval, int node, uint32_t default_bin, uint32_t max_bin) const {
if (GetDecisionType(decision_type_[node], kCategoricalMask) || GetDecisionType(decision_type_[node], kCTRMask)) {
if (GetDecisionType(decision_type_[node], kCategoricalMask)/* || GetDecisionType(decision_type_[node], kCTRMask)*/) {
return CategoricalDecisionInner(fval, node);
} else {
return NumericalDecisionInner(fval, node, default_bin, max_bin);
Expand Down
63 changes: 63 additions & 0 deletions include/LightGBM/utils/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,69 @@ inline static std::unordered_map<int, std::vector<uint32_t>> UnorderedIntVectorM
return map;
}

inline static std::string CTRInformationToString(const std::unordered_map<int, std::unordered_map<int, double>>& map, const int num_total_features) {
std::stringstream str_buf;
std::vector<std::string> pair_values;
const size_t buffer_len = 32;
std::vector<char> buffer(buffer_len);
str_buf << num_total_features << " ";
for(const auto& pair : map) {
pair_values.clear();
//skip empty items
if(pair.second.empty()) continue;
const int cat_fid = pair.first / num_total_features;
const int ctr_fid = pair.first % num_total_features;
str_buf << cat_fid << ":" << ctr_fid << " ";
for(const auto& inner_pair : pair.second) {
if(std::isnan(inner_pair.second)) {
pair_values.push_back(std::to_string(inner_pair.first) + std::string(":") + std::to_string(-1));
}
else {
double value = inner_pair.second;
DoubleToStr(value, buffer.data(), buffer_len);
pair_values.push_back(std::to_string(inner_pair.first) + std::string(":") + std::string(buffer.data()));
}
}
for(size_t i = 0; i < pair_values.size(); ++i) {
str_buf << pair_values[i];
if(i != pair_values.size() - 1) {
str_buf << " ";
}
}
str_buf << ",";
}
return str_buf.str();
}

/*!
 * \brief Parse the text produced by CTRInformationToString back into lookup tables
 *
 * Expected layout: "<num_total_features> " followed by records of the form
 *   "<cat_fid>:<ctr_fid> <key>:<value> <key>:<value>,"
 * Each record is stored under the outer key cat_fid * num_total_features + ctr_fid.
 * Any value below zero is read back as NaN (the writer emits -1 for NaN).
 *
 * An empty or non-numeric input yields an empty map. A record that is cut off after a ':'
 * is reported through CHECK instead of being stored with a default value.
 *
 * \param str Serialized CTR information
 * \return Outer key -> (category value -> CTR value)
 */
inline static std::unordered_map<int, std::unordered_map<int, double>> CTRInformationFromString(std::string str) {
  std::stringstream sin(str);
  std::unordered_map<int, std::unordered_map<int, double>> map;
  int num_total_features = 0;
  sin >> num_total_features;
  int fid = 0;
  while (sin >> fid) {
    // read the separator outside of CHECK so the stream is consumed regardless of how CHECK expands
    const int fid_separator = sin.get();
    CHECK(fid_separator == ':');
    int ctr_fid = 0;
    sin >> ctr_fid;
    CHECK(!sin.fail());
    const int key = fid * num_total_features + ctr_fid;
    int cat_value = 0;
    while (sin >> cat_value) {
      const int value_separator = sin.get();
      CHECK(value_separator == ':');
      double ctr_value = 0.0;
      sin >> ctr_value;
      // a failed extraction would otherwise be stored silently as 0.0
      CHECK(!sin.fail());
      if (ctr_value < 0.0) {
        map[key][cat_value] = std::numeric_limits<double>::quiet_NaN();
      } else {
        map[key][cat_value] = ctr_value;
      }
      // ',' closes the record; any other character (a space) means another item follows
      if (sin.get() == ',') {
        break;
      }
    }
  }
  return map;
}

template<typename T, bool is_float>
struct __StringToTHelper {
T operator()(const std::string& str) const {
Expand Down
Loading

0 comments on commit cb6e9f4

Please sign in to comment.