- Tests for the MultiModalData class are finished

- A text column is now dropped when the fraction of NaN values in it exceeds ALLOWED_NAN_PERCENT (0.8)
aimclub · Aug 22, 2022 · e2554e7 · e2554e7
1 parent 8bb8f5a
commit e2554e7
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 29 deletions.
diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py
@@ -10,6 +10,8 @@
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 
+ALLOWED_NAN_PERCENT = 0.8
+
 
 class MultiModalData(dict):
     """ Dictionary with InputData as values and primary node names as keys """
@@ -167,10 +169,10 @@ def from_csv(cls,
         data_part_transformation_func = partial(_array_to_input_data, idx=idx,
                                                 target_array=target, task=task)
 
-        # create labels for text data sources
+        # create labels for text data sources and remove source if there are many nans
         sources = dict((_new_key_name_text(data_part_key),
                         data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text))
-                       for (data_part_key, data_part) in data_text.items())
+                       for (data_part_key, data_part) in data_text.items() if not _full_of_nans(data_part))
 
         # add table features if they exist
         if table_features.size != 0:
@@ -197,16 +199,19 @@ def _define_text_columns(data_frame: pd.DataFrame) -> List[str]:
 def _column_contains_text(column: pd.Series) -> bool:
     """
     Column contains text if:
-    1. it's not numerical or latent numerical
-    (e.g. ['1.2', '2.3', '3.4', ...] is numerical too)
-    2. fraction of unique values is more than 0.95
+    1. it's not float or float compatible
+    (e.g. ['1.2', '2.3', '3.4', ...] is float too)
+    2. fraction of unique values (except nans) is more than 0.95
 
     :param column: pandas series with data
     :return: True if column contains text
     """
     if column.dtype == object and not _is_float_compatible(column):
-        return len(column.unique()) / len(column) > 0.95 if pd.isna(column).sum() == 0 \
-            else (len(column.unique()) - 1) / (len(column) - pd.isna(column).sum()) > 0.95
+        unique_frac = 0.95
+        unique_num = len(column.unique())
+        nan_num = pd.isna(column).sum()
+        return unique_num / len(column) > unique_frac if nan_num == 0 \
+            else (unique_num - 1) / (len(column) - nan_num) > unique_frac
     return False
 
 
@@ -224,6 +229,12 @@ def _is_float_compatible(column: pd.Series) -> bool:
     return failed_ratio < 0.5
 
 
+def _full_of_nans(text_data: np.array) -> bool:
+    if np.sum(pd.isna(text_data)) / len(text_data) > ALLOWED_NAN_PERCENT:
+        return True
+    return False
+
+
 def _prepare_multimodal_text_data(dataframe: pd.DataFrame, text_columns: List[str]) -> dict:
     """ Prepares MultiModal text data in a form of dictionary
 

diff --git a/test/data/multimodal_data_with_complicated_types.csv b/test/data/multimodal_data_with_complicated_types.csv
@@ -1,19 +1,19 @@
-0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
-0.0,,1.0,1.0,1.0,,monday,a ,True,,,1.0,0,a,no
-,5.0,2.0,2.0,0.0,,tuesday,b,,,2.3,0.0,1,inf,yes
-2.0,,3.0,3.0,,,3,c,False,,,1.0,?,c,yes
-3.0,,4.0,4.0,3.0,,4,  a  ,True,,,0.0,2,d,yes
-4.0,,5.0,5.0,0.0,,5,   b ,,,,0.0,3,e,no
-5.0,,6.0,6.0,0.0,,6,   c  ,False,,,0.0,4,f,no
-6.0,inf,7.0,7.0,0.0,,7,    a  ,True,sample text,sample text,1.0,5,g,no
-7.0,inf,8.0,8.0,1.0,,1, b   ,,,4,0.0,6,h,no
-inf,inf,9.0,9.0,2.0,,2,,True,,,1.0,7,i,no
-9.0,inf,10.0,10.0,2.0,,3, c  ,False,,,0.0,8,j,yes
-10.0,,11.0,11.0,0.0,,4,c ,False,,0.0001,0.0,9,k,yes
-11.0,,12.0,12.0,2.0,,5,,False,,,1.0,10,l,yes
-12.0,,1.0,1.0,1.0,,6, b  ,False,,,0.0,11,m,yes
-13.0,,2.0,2.0,1.0,,7, c  ,True,,,,12,n,yes
-14.0,,3.0,3.0,2.0,,1,a,False,,,,error,o,no
-15.0,,4.0,4.0,1.0,,2,a  ,False,,,,13,p,no
-16.0,2.0,5.0,12.0,0.0,,3,   d       ,True,,,1.0,16,r,yes
-17.0,3.0,6.0,13.0,0.0,sample text,4,  d      ,False,another sample text,another sample text,0.0,17,s,no
+,0,1,2,3,4,5,6,7,8,9,10,11,12,13
+0,0.0,,1.0,1.0,1.0,Lorem Ipsum is simply dummy text of the printing and typesetting,monday,a ,True,,,1.0,0,no
+1,,5.0,2.0,2.0,0.0,industry. Lorem Ipsum has been the industry standard dummy text ever since,tuesday,b,,,2.3,0.0,1,yes
+2,2.0,,3.0,3.0,,"the 1500s, when an unknown printer took a galley of type and scrambled it to",3,c,False,,,1.0,?,yes
+3,3.0,,4.0,4.0,3.0,"make a type specimen book. It has survived not only five centuries, but also",4,  a  ,True,,,0.0,2,yes
+4,4.0,,5.0,5.0,0.0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5,   b ,,,,0.0,3,no
+5,5.0,,6.0,6.0,0.0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6,   c  ,False,,,0.0,4,no
+6,6.0,inf,7.0,7.0,0.0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7,    a  ,True,sample text,sample text,1.0,5,no
+7,7.0,inf,8.0,8.0,1.0,PageMaker including versions of Lorem Ipsum.,1, b   ,,,4,0.0,6,no
+8,inf,inf,9.0,9.0,2.0,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1.0,7,no
+9,9.0,inf,10.0,10.0,2.0,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c  ,False,,,0.0,8,yes
+10,10.0,,11.0,11.0,0.0,"old. Richard McClintock, a Latin professor at Hampden-Sydney College in ",4,c ,False,,0.0001,0.0,9,yes
+11,11.0,,12.0,12.0,2.0,"Virginia, looked up one of the more obscure Latin words, consectetur, from a ",5,,False,,,1.0,10,yes
+12,12.0,,1.0,1.0,1.0,"Lorem Ipsum passage, and going through the cites of the word in classical",6, b  ,False,,,0.0,11,yes
+13,13.0,,2.0,2.0,1.0,"literature, discovered the undoubtable source. Lorem Ipsum comes from sections ",7, c  ,True,,,,12,yes
+14,14.0,,3.0,3.0,2.0,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no
+15,15.0,,4.0,4.0,1.0,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a  ,False,,,,13,no
+16,16.0,2.0,5.0,12.0,0.0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3,   d       ,True,,,1.0,16,yes
+17,17.0,3.0,6.0,13.0,0.0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4,  d      ,False,another sample text,another sample text,0.0,17,no
diff --git a/test/unit/data/test_multimodal_data.py b/test/unit/data/test_multimodal_data.py
@@ -125,14 +125,15 @@ def test_multimodal_data_with_complicated_types():
     of dataset look at data_with_complicated_types.
     Combines complicated table data with some text columns.
     """
-    # TODO check file content
     file_path = 'test/data/multimodal_data_with_complicated_types.csv'
     path = Path(fedot_project_root(), file_path)
-    df = pd.read_csv(path)
     file_mm_data = MultiModalData.from_csv(path)
     model = Fedot(problem='classification')
     model.fit(features=file_mm_data,
               target=file_mm_data.target,
               predefined_model='auto')
     model.predict(file_mm_data)
-    assert len(file_mm_data) == 3
+
+    assert len(file_mm_data) == 2
+    assert 'data_source_text/5' in file_mm_data
+    assert file_mm_data['data_source_table'].features.shape == (18, 11)