Skip to content

Commit

Permalink
- tests of multimodal data class are finished
Browse files Browse the repository at this point in the history
- now, if a text column contains too many NaNs, it is dropped
  • Loading branch information
andreygetmanov committed Aug 22, 2022
1 parent 8bb8f5a commit e2554e7
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 29 deletions.
25 changes: 18 additions & 7 deletions fedot/core/data/multi_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

# Threshold for dropping a text data source: it is compared against the
# fraction (0..1, not a percentage despite the name) of NaN values in the source.
ALLOWED_NAN_PERCENT = 0.8


class MultiModalData(dict):
""" Dictionary with InputData as values and primary node names as keys """
Expand Down Expand Up @@ -167,10 +169,10 @@ def from_csv(cls,
data_part_transformation_func = partial(_array_to_input_data, idx=idx,
target_array=target, task=task)

# create labels for text data sources
# create labels for text data sources and remove source if there are many nans
sources = dict((_new_key_name_text(data_part_key),
data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text))
for (data_part_key, data_part) in data_text.items())
for (data_part_key, data_part) in data_text.items() if not _full_of_nans(data_part))

# add table features if they exist
if table_features.size != 0:
Expand All @@ -197,16 +199,19 @@ def _define_text_columns(data_frame: pd.DataFrame) -> List[str]:
def _column_contains_text(column: pd.Series) -> bool:
    """
    Column contains text if:
    1. it's not float or float compatible
    (e.g. ['1.2', '2.3', '3.4', ...] is float too)
    2. fraction of unique values (except nans) is more than 0.95

    :param column: pandas series with data
    :return: True if column contains text
    """
    # only object columns that cannot be interpreted as floats are text candidates
    if column.dtype != object or _is_float_compatible(column):
        return False

    unique_frac = 0.95
    # NaN-like values (None, nan) are excluded from both the unique count and
    # the total, so the ratio stays correct however many kinds of them occur
    not_nan_values = column.dropna()
    if len(not_nan_values) == 0:
        # empty or all-NaN column: there is no ratio to compute, so it is not text
        return False
    return not_nan_values.nunique() / len(not_nan_values) > unique_frac


Expand All @@ -224,6 +229,12 @@ def _is_float_compatible(column: pd.Series) -> bool:
return failed_ratio < 0.5


def _full_of_nans(text_data: np.ndarray) -> bool:
    """
    Checks whether text data consists mostly of missing values

    :param text_data: array with the values of a text data source
    :return: True if the fraction of NaN values exceeds ALLOWED_NAN_PERCENT
    """
    if len(text_data) == 0:
        # avoid the 0 / 0 division: empty data is not considered full of NaNs
        return False
    nan_fraction = np.sum(pd.isna(text_data)) / len(text_data)
    # cast so that a plain Python bool is returned instead of numpy.bool_
    return bool(nan_fraction > ALLOWED_NAN_PERCENT)


def _prepare_multimodal_text_data(dataframe: pd.DataFrame, text_columns: List[str]) -> dict:
""" Prepares MultiModal text data in a form of dictionary
Expand Down
38 changes: 19 additions & 19 deletions test/data/multimodal_data_with_complicated_types.csv
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0.0,,1.0,1.0,1.0,,monday,a ,True,,,1.0,0,a,no
,5.0,2.0,2.0,0.0,,tuesday,b,,,2.3,0.0,1,inf,yes
2.0,,3.0,3.0,,,3,c,False,,,1.0,?,c,yes
3.0,,4.0,4.0,3.0,,4, a ,True,,,0.0,2,d,yes
4.0,,5.0,5.0,0.0,,5, b ,,,,0.0,3,e,no
5.0,,6.0,6.0,0.0,,6, c ,False,,,0.0,4,f,no
6.0,inf,7.0,7.0,0.0,,7, a ,True,sample text,sample text,1.0,5,g,no
7.0,inf,8.0,8.0,1.0,,1, b ,,,4,0.0,6,h,no
inf,inf,9.0,9.0,2.0,,2,,True,,,1.0,7,i,no
9.0,inf,10.0,10.0,2.0,,3, c ,False,,,0.0,8,j,yes
10.0,,11.0,11.0,0.0,,4,c ,False,,0.0001,0.0,9,k,yes
11.0,,12.0,12.0,2.0,,5,,False,,,1.0,10,l,yes
12.0,,1.0,1.0,1.0,,6, b ,False,,,0.0,11,m,yes
13.0,,2.0,2.0,1.0,,7, c ,True,,,,12,n,yes
14.0,,3.0,3.0,2.0,,1,a,False,,,,error,o,no
15.0,,4.0,4.0,1.0,,2,a ,False,,,,13,p,no
16.0,2.0,5.0,12.0,0.0,,3, d ,True,,,1.0,16,r,yes
17.0,3.0,6.0,13.0,0.0,sample text,4, d ,False,another sample text,another sample text,0.0,17,s,no
,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,,1.0,1.0,1.0,Lorem Ipsum is simply dummy text of the printing and typesetting,monday,a ,True,,,1.0,0,no
1,,5.0,2.0,2.0,0.0,industry. Lorem Ipsum has been the industry standard dummy text ever since,tuesday,b,,,2.3,0.0,1,yes
2,2.0,,3.0,3.0,,"the 1500s, when an unknown printer took a galley of type and scrambled it to",3,c,False,,,1.0,?,yes
3,3.0,,4.0,4.0,3.0,"make a type specimen book. It has survived not only five centuries, but also",4, a ,True,,,0.0,2,yes
4,4.0,,5.0,5.0,0.0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5, b ,,,,0.0,3,no
5,5.0,,6.0,6.0,0.0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6, c ,False,,,0.0,4,no
6,6.0,inf,7.0,7.0,0.0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,sample text,sample text,1.0,5,no
7,7.0,inf,8.0,8.0,1.0,PageMaker including versions of Lorem Ipsum.,1, b ,,,4,0.0,6,no
8,inf,inf,9.0,9.0,2.0,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1.0,7,no
9,9.0,inf,10.0,10.0,2.0,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c ,False,,,0.0,8,yes
10,10.0,,11.0,11.0,0.0,"old. Richard McClintock, a Latin professor at Hampden-Sydney College in ",4,c ,False,,0.0001,0.0,9,yes
11,11.0,,12.0,12.0,2.0,"Virginia, looked up one of the more obscure Latin words, consectetur, from a ",5,,False,,,1.0,10,yes
12,12.0,,1.0,1.0,1.0,"Lorem Ipsum passage, and going through the cites of the word in classical",6, b ,False,,,0.0,11,yes
13,13.0,,2.0,2.0,1.0,"literature, discovered the undoubtable source. Lorem Ipsum comes from sections ",7, c ,True,,,,12,yes
14,14.0,,3.0,3.0,2.0,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no
15,15.0,,4.0,4.0,1.0,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a ,False,,,,13,no
16,16.0,2.0,5.0,12.0,0.0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3, d ,True,,,1.0,16,yes
17,17.0,3.0,6.0,13.0,0.0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,another sample text,another sample text,0.0,17,no
7 changes: 4 additions & 3 deletions test/unit/data/test_multimodal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,15 @@ def test_multimodal_data_with_complicated_types():
of dataset look at data_with_complicated_types.
Combines complicated table data with some text columns.
"""
# TODO check file content
file_path = 'test/data/multimodal_data_with_complicated_types.csv'
path = Path(fedot_project_root(), file_path)
df = pd.read_csv(path)
file_mm_data = MultiModalData.from_csv(path)
model = Fedot(problem='classification')
model.fit(features=file_mm_data,
target=file_mm_data.target,
predefined_model='auto')
model.predict(file_mm_data)
assert len(file_mm_data) == 3

assert len(file_mm_data) == 2
assert 'data_source_text/5' in file_mm_data
assert file_mm_data['data_source_table'].features.shape == (18, 11)

0 comments on commit e2554e7

Please sign in to comment.