Skip to content

Commit

Permalink
- added substitution of NaNs with '' in text features
Browse files Browse the repository at this point in the history
- refactoring of text preprocessing
- refactoring of multimodal data test
  • Loading branch information
andreygetmanov committed Aug 19, 2022
1 parent 4db1c9a commit a5c3751
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 21 deletions.
9 changes: 9 additions & 0 deletions fedot/core/data/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ def replace_inf_with_nans(input_data: InputData):
input_data.features = features_with_replaced_inf


def replace_nans_with_empty_strings(input_data: InputData):
    """
    Replace NaNs with empty strings in input_data.features
    """
    features = input_data.features
    # Build a boolean mask of missing entries once, then substitute ''
    # for every masked cell while leaving all other values untouched.
    missing_mask = pd.isna(features)
    input_data.features = np.where(missing_mask, '', features)


def convert_into_column(array: np.array):
""" Perform conversion for data if it is necessary """
if len(array.shape) == 1:
Expand Down
15 changes: 9 additions & 6 deletions fedot/core/data/multi_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def _column_contains_text(column: pd.Series) -> bool:
:return: True if column contains text
"""
if column.dtype == object and not _is_float_compatible(column):
return len(column.unique()) / len(column) > 0.95
return len(column.unique()) / len(column) > 0.95 if pd.isna(column).sum() == 0 \
else (len(column.unique()) - 1) / (len(column) - pd.isna(column).sum()) > 0.95
return False


Expand All @@ -214,11 +215,13 @@ def _is_float_compatible(column: pd.Series) -> bool:
:param column: pandas series with data
:return: True if column contains only float or nan values
"""
try:
column.astype(float)
return True
except ValueError:
return False
nans_number = column.isna().sum()
converted_column = pd.to_numeric(column, errors='coerce')
result_nans_number = converted_column.isna().sum()
failed_objects_number = result_nans_number - nans_number
non_nan_all_objects_number = len(column) - nans_number
failed_ratio = failed_objects_number / non_nan_all_objects_number
return failed_ratio < 0.5


def _prepare_multimodal_text_data(dataframe: pd.DataFrame, text_columns: List[str]) -> dict:
Expand Down
33 changes: 18 additions & 15 deletions fedot/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
data_has_categorical_features,
data_has_missing_values,
find_categorical_columns,
replace_inf_with_nans
replace_inf_with_nans,
replace_nans_with_empty_strings
)
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.log import default_log
Expand Down Expand Up @@ -190,12 +191,13 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
if data_type_is_text(data) or data_type_is_table(data):
# Fix tables / time series sizes
data = self._correct_shapes(data)
if data_type_is_table(data):
replace_inf_with_nans(data)
# Find incorrect features which must be removed
self._find_features_full_of_nans(data, source_name)
self.take_only_correct_features(data, source_name)

replace_inf_with_nans(data)
# Find incorrect features which must be removed
self._find_features_full_of_nans(data, source_name)
self.take_only_correct_features(data, source_name)
# TODO andreygetmanov to new class text preprocessing?
if data_type_is_text(data):
replace_nans_with_empty_strings(data)
data = self._drop_rows_with_nan_in_target(data)

# Column types processing - launch after correct features selection
Expand All @@ -211,9 +213,10 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
# Wrap indices in numpy array
data.idx = np.array(data.idx)

# Process binary categorical features
self.binary_categorical_processors[source_name].fit(data)
data = self.binary_categorical_processors[source_name].transform(data)
if data_type_is_table(data):
# Process binary categorical features
self.binary_categorical_processors[source_name].fit(data)
data = self.binary_categorical_processors[source_name].transform(data)

return data

Expand All @@ -228,15 +231,15 @@ def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name:
# Wrap indices in numpy array
data.idx = np.array(data.idx)
if data_type_is_table(data) or data_type_is_text(data):
if data_type_is_table(data):
replace_inf_with_nans(data)
self.take_only_correct_features(data, source_name)

replace_inf_with_nans(data)
self.take_only_correct_features(data, source_name)
if data_type_is_text(data):
replace_nans_with_empty_strings(data)
# Perform preprocessing for types - launch after correct features selection
self.types_correctors[source_name].convert_data_for_predict(data)
if data_type_is_table(data):
data = self._clean_extra_spaces(data)
data = self.binary_categorical_processors[source_name].transform(data)
data = self.binary_categorical_processors[source_name].transform(data)

return data

Expand Down
19 changes: 19 additions & 0 deletions test/data/multimodal_data_with_complicated_types.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0.0,,1.0,1.0,1.0,,monday,a ,True,,,1.0,0,a,no
,5.0,2.0,2.0,0.0,,tuesday,b,,,2.3,0.0,1,inf,yes
2.0,,3.0,3.0,,,3,c,False,,,1.0,?,c,yes
3.0,,4.0,4.0,3.0,,4, a ,True,,,0.0,2,d,yes
4.0,,5.0,5.0,0.0,,5, b ,,,,0.0,3,e,no
5.0,,6.0,6.0,0.0,,6, c ,False,,,0.0,4,f,no
6.0,inf,7.0,7.0,0.0,,7, a ,True,sample text,sample text,1.0,5,g,no
7.0,inf,8.0,8.0,1.0,,1, b ,,,4,0.0,6,h,no
inf,inf,9.0,9.0,2.0,,2,,True,,,1.0,7,i,no
9.0,inf,10.0,10.0,2.0,,3, c ,False,,,0.0,8,j,yes
10.0,,11.0,11.0,0.0,,4,c ,False,,0.0001,0.0,9,k,yes
11.0,,12.0,12.0,2.0,,5,,False,,,1.0,10,l,yes
12.0,,1.0,1.0,1.0,,6, b ,False,,,0.0,11,m,yes
13.0,,2.0,2.0,1.0,,7, c ,True,,,,12,n,yes
14.0,,3.0,3.0,2.0,,1,a,False,,,,error,o,no
15.0,,4.0,4.0,1.0,,2,a ,False,,,,13,p,no
16.0,2.0,5.0,12.0,0.0,,3, d ,True,,,1.0,16,r,yes
17.0,3.0,6.0,13.0,0.0,sample text,4, d ,False,another sample text,another sample text,0.0,17,s,no
20 changes: 20 additions & 0 deletions test/unit/data/test_multimodal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from pathlib import Path

from fedot.api.main import Fedot
from fedot.core.data.data import InputData
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.repository.dataset_types import DataTypesEnum
Expand Down Expand Up @@ -116,3 +117,22 @@ def test_table_data_only():
assert file_mm_data['data_source_table'].data_type is DataTypesEnum.table
assert file_mm_data['data_source_table'].features.all() == file_data.features.all()
assert file_mm_data['data_source_table'].target.all() == file_data.target.all()


def test_multimodal_data_with_complicated_types():
    """
    Checks that a multimodal CSV with a mix of table and text columns
    (including NaNs, inf values and type-inconsistent cells) is split into
    the expected data sources and can be fitted and predicted without errors.

    For a more detailed description of the table part
    of the dataset look at data_with_complicated_types.
    """
    # TODO check file content
    file_path = 'test/data/multimodal_data_with_complicated_types.csv'
    path = Path(fedot_project_root(), file_path)
    file_mm_data = MultiModalData.from_csv(path)
    # Cheap structural check first, so a wrong fixture fails fast
    # instead of after an expensive model fit.
    assert len(file_mm_data) == 3
    model = Fedot(problem='classification')
    model.fit(features=file_mm_data,
              target=file_mm_data.target,
              predefined_model='auto')
    model.predict(file_mm_data)

0 comments on commit a5c3751

Please sign in to comment.