Skip to content

Commit

Permalink
- table and text preprocessing are now distinguished for easier readability
Browse files Browse the repository at this point in the history
  • Loading branch information
andreygetmanov committed Aug 31, 2022
1 parent 90ac72b commit 64d320d
Showing 1 changed file with 43 additions and 38 deletions.
81 changes: 43 additions & 38 deletions fedot/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,36 +187,39 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
if data.supplementary_data.was_preprocessed:
# Preprocessing was already done - return data
return data
# TODO target encoding must be obligatory for all data types
if data_type_is_text(data) or data_type_is_table(data):
# Fix tables / time series sizes
data = self._correct_shapes(data)
replace_inf_with_nans(data)
# Find incorrect features which must be removed
self._find_features_full_of_nans(data, source_name)
self.take_only_correct_features(data, source_name)
# TODO andreygetmanov to new class text preprocessing?
if data_type_is_text(data):
replace_nans_with_empty_strings(data)
data = self._drop_rows_with_nan_in_target(data)
if not (data_type_is_text(data) or data_type_is_table(data)):
# Time series or image - do nothing
return data
# Wrap indices in numpy array
data.idx = np.array(data.idx)

# Fix tables / time series sizes
data = self._correct_shapes(data)
replace_inf_with_nans(data)

# Column types processing - launch after correct features selection
self.types_correctors[source_name].convert_data_for_fit(data)
if self.types_correctors[source_name].target_converting_has_errors:
data = self._drop_rows_with_nan_in_target(data)
# Find incorrect features which must be removed
self._find_features_full_of_nans(data, source_name)
self.take_only_correct_features(data, source_name)
data = self._drop_rows_with_nan_in_target(data)

# Train Label Encoder for categorical target if necessary and apply it
self._train_target_encoder(data, source_name)
data.target = self._apply_target_encoding(data, source_name)
if data_type_is_table(data):
data = self._clean_extra_spaces(data)
# Wrap indices in numpy array
data.idx = np.array(data.idx)
# Column types processing - launch after correct features selection
self.types_correctors[source_name].convert_data_for_fit(data)
if self.types_correctors[source_name].target_converting_has_errors:
data = self._drop_rows_with_nan_in_target(data)

# Train Label Encoder for categorical target if necessary and apply it
self._train_target_encoder(data, source_name)
data.target = self._apply_target_encoding(data, source_name)

if data_type_is_table(data):
# Process binary categorical features
self.binary_categorical_processors[source_name].fit(data)
data = self.binary_categorical_processors[source_name].transform(data)
# TODO andreygetmanov target encoding must be obligatory for all data types
if data_type_is_text(data):
# TODO andreygetmanov to new class text preprocessing?
replace_nans_with_empty_strings(data)
elif data_type_is_table(data):
data = self._clean_extra_spaces(data)
# Process binary categorical features
self.binary_categorical_processors[source_name].fit(data)
data = self.binary_categorical_processors[source_name].transform(data)

return data

Expand All @@ -225,21 +228,23 @@ def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name:
if data.supplementary_data.was_preprocessed:
# Preprocessing was already done - return data
return data

if not (data_type_is_text(data) or data_type_is_table(data)):
return data
data = self._correct_shapes(data)
replace_inf_with_nans(data)

# Wrap indices in numpy array
data.idx = np.array(data.idx)
if data_type_is_table(data) or data_type_is_text(data):
replace_inf_with_nans(data)
self.take_only_correct_features(data, source_name)
if data_type_is_text(data):
replace_nans_with_empty_strings(data)
# Perform preprocessing for types - launch after correct features selection
self.types_correctors[source_name].convert_data_for_predict(data)
if data_type_is_table(data):
data = self._clean_extra_spaces(data)
data = self.binary_categorical_processors[source_name].transform(data)

# Perform preprocessing for types - launch after correct features selection
self.take_only_correct_features(data, source_name)
self.types_correctors[source_name].convert_data_for_predict(data)

if data_type_is_text(data):
replace_nans_with_empty_strings(data)
if data_type_is_table(data):
data = self._clean_extra_spaces(data)
data = self.binary_categorical_processors[source_name].transform(data)

return data

Expand Down

0 comments on commit 64d320d

Please sign in to comment.