Skip to content

Commit

Permalink
- added substitution of NaNs with '' in text features
Browse files Browse the repository at this point in the history
- refactoring of text preprocessing
- refactoring of multimodal data test
  • Loading branch information
andreygetmanov committed Aug 19, 2022
1 parent 4db1c9a commit a5c3751
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 21 deletions.
9 changes: 9 additions & 0 deletions fedot/core/data/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ def replace_inf_with_nans(input_data: InputData):
input_data.features = features_with_replaced_inf


def replace_nans_with_empty_strings(input_data: InputData):
    """
    Replace NaNs with empty strings in input_data.features
    """
    features = input_data.features
    # Build a boolean mask of missing entries once, then substitute ''
    # for every masked cell while leaving all other values untouched.
    missing_mask = pd.isna(features)
    input_data.features = np.where(missing_mask, '', features)


def convert_into_column(array: np.array):
""" Perform conversion for data if it is necessary """
if len(array.shape) == 1:
Expand Down
15 changes: 9 additions & 6 deletions fedot/core/data/multi_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ def _column_contains_text(column: pd.Series) -> bool:
:return: True if column contains text
"""
if column.dtype == object and not _is_float_compatible(column):
return len(column.unique()) / len(column) > 0.95
return len(column.unique()) / len(column) > 0.95 if pd.isna(column).sum() == 0 \
else (len(column.unique()) - 1) / (len(column) - pd.isna(column).sum()) > 0.95
return False


Expand All @@ -214,11 +215,13 @@ def _is_float_compatible(column: pd.Series) -> bool:
:param column: pandas series with data
:return: True if column contains only float or nan values
"""
try:
column.astype(float)
return True
except ValueError:
return False
nans_number = column.isna().sum()
converted_column = pd.to_numeric(column, errors='coerce')
result_nans_number = converted_column.isna().sum()
failed_objects_number = result_nans_number - nans_number
non_nan_all_objects_number = len(column) - nans_number
failed_ratio = failed_objects_number / non_nan_all_objects_number
return failed_ratio < 0.5


def _prepare_multimodal_text_data(dataframe: pd.DataFrame, text_columns: List[str]) -> dict:
Expand Down
33 changes: 18 additions & 15 deletions fedot/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
data_has_categorical_features,
data_has_missing_values,
find_categorical_columns,
replace_inf_with_nans
replace_inf_with_nans,
replace_nans_with_empty_strings
)
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.log import default_log
Expand Down Expand Up @@ -190,12 +191,13 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
if data_type_is_text(data) or data_type_is_table(data):
# Fix tables / time series sizes
data = self._correct_shapes(data)
if data_type_is_table(data):
replace_inf_with_nans(data)
# Find incorrect features which must be removed
self._find_features_full_of_nans(data, source_name)
self.take_only_correct_features(data, source_name)

replace_inf_with_nans(data)
# Find incorrect features which must be removed
self._find_features_full_of_nans(data, source_name)
self.take_only_correct_features(data, source_name)
# TODO andreygetmanov to new class text preprocessing?
if data_type_is_text(data):
replace_nans_with_empty_strings(data)
data = self._drop_rows_with_nan_in_target(data)

# Column types processing - launch after correct features selection
Expand All @@ -211,9 +213,10 @@ def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str
# Wrap indices in numpy array
data.idx = np.array(data.idx)

# Process binary categorical features
self.binary_categorical_processors[source_name].fit(data)
data = self.binary_categorical_processors[source_name].transform(data)
if data_type_is_table(data):
# Process binary categorical features
self.binary_categorical_processors[source_name].fit(data)
data = self.binary_categorical_processors[source_name].transform(data)

return data

Expand All @@ -228,15 +231,15 @@ def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name:
# Wrap indices in numpy array
data.idx = np.array(data.idx)
if data_type_is_table(data) or data_type_is_text(data):
if data_type_is_table(data):
replace_inf_with_nans(data)
self.take_only_correct_features(data, source_name)

replace_inf_with_nans(data)
self.take_only_correct_features(data, source_name)
if data_type_is_text(data):
replace_nans_with_empty_strings(data)
# Perform preprocessing for types - launch after correct features selection
self.types_correctors[source_name].convert_data_for_predict(data)
if data_type_is_table(data):
data = self._clean_extra_spaces(data)
data = self.binary_categorical_processors[source_name].transform(data)
data = self.binary_categorical_processors[source_name].transform(data)

return data

Expand Down
19 changes: 19 additions & 0 deletions test/data/multimodal_data_with_complicated_types.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0.0,,1.0,1.0,1.0,,monday,a ,True,,,1.0,0,a,no
,5.0,2.0,2.0,0.0,,tuesday,b,,,2.3,0.0,1,inf,yes
2.0,,3.0,3.0,,,3,c,False,,,1.0,?,c,yes
3.0,,4.0,4.0,3.0,,4, a ,True,,,0.0,2,d,yes
4.0,,5.0,5.0,0.0,,5, b ,,,,0.0,3,e,no
5.0,,6.0,6.0,0.0,,6, c ,False,,,0.0,4,f,no
6.0,inf,7.0,7.0,0.0,,7, a ,True,sample text,sample text,1.0,5,g,no
7.0,inf,8.0,8.0,1.0,,1, b ,,,4,0.0,6,h,no
inf,inf,9.0,9.0,2.0,,2,,True,,,1.0,7,i,no
9.0,inf,10.0,10.0,2.0,,3, c ,False,,,0.0,8,j,yes
10.0,,11.0,11.0,0.0,,4,c ,False,,0.0001,0.0,9,k,yes
11.0,,12.0,12.0,2.0,,5,,False,,,1.0,10,l,yes
12.0,,1.0,1.0,1.0,,6, b ,False,,,0.0,11,m,yes
13.0,,2.0,2.0,1.0,,7, c ,True,,,,12,n,yes
14.0,,3.0,3.0,2.0,,1,a,False,,,,error,o,no
15.0,,4.0,4.0,1.0,,2,a ,False,,,,13,p,no
16.0,2.0,5.0,12.0,0.0,,3, d ,True,,,1.0,16,r,yes
17.0,3.0,6.0,13.0,0.0,sample text,4, d ,False,another sample text,another sample text,0.0,17,s,no
20 changes: 20 additions & 0 deletions test/unit/data/test_multimodal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from pathlib import Path

from fedot.api.main import Fedot
from fedot.core.data.data import InputData
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.repository.dataset_types import DataTypesEnum
Expand Down Expand Up @@ -116,3 +117,22 @@ def test_table_data_only():
assert file_mm_data['data_source_table'].data_type is DataTypesEnum.table
assert file_mm_data['data_source_table'].features.all() == file_data.features.all()
assert file_mm_data['data_source_table'].target.all() == file_data.target.all()


def test_multimodal_data_with_complicated_types():
    """
    Checks that a multimodal CSV with a mix of table and text columns
    (including NaNs, inf values and type-inconsistent cells) is split into
    the expected data sources and can be fitted and predicted without errors.

    For a more detailed description of the table part
    of the dataset look at data_with_complicated_types.
    """
    # TODO check file content
    file_path = 'test/data/multimodal_data_with_complicated_types.csv'
    path = Path(fedot_project_root(), file_path)
    file_mm_data = MultiModalData.from_csv(path)
    # Cheap structural check first, so a wrong fixture fails fast
    # instead of after an expensive model fit.
    assert len(file_mm_data) == 3
    model = Fedot(problem='classification')
    model.fit(features=file_mm_data,
              target=file_mm_data.target,
              predefined_model='auto')
    model.predict(file_mm_data)

0 comments on commit a5c3751

Please sign in to comment.