API run of multimodal cases
- added checks for text and image features in the data when building preprocessing
- multi_modal_pipeline.py now runs via the API
- the preprocessing builder can now process each source of multimodal data separately
andreygetmanov committed Apr 29, 2022
1 parent cd62826 commit cb8bc5c
Showing 5 changed files with 72 additions and 19 deletions.
41 changes: 24 additions & 17 deletions examples/advanced/multi_modal_pipeline.py
@@ -1,6 +1,8 @@
import os
from typing import Union

from fedot.api.main import Fedot

from sklearn.metrics import f1_score as f1

from cases.dataset_preparation import unpack_archived_data
@@ -61,7 +63,7 @@ def prepare_multi_modal_data(files_path: str, task: Task, images_size: tuple = (
data_type=DataTypesEnum.text, is_multilabel=True, shuffle=False)

data = MultiModalData({
'data_source_img': data_img,
# 'data_source_img': data_img,
'data_source_table': data_num,
'data_source_text': data_text
})
@@ -86,18 +88,18 @@ def generate_initial_pipeline_and_data(data: Union[InputData, MultiModalData],
else:
num_classes = data.num_classes
# image
images_size = data['data_source_img'].features.shape[1:4]
ds_image = PrimaryNode('data_source_img')
image_node = SecondaryNode('cnn', nodes_from=[ds_image])
image_node.custom_params = {'image_shape': images_size,
'architecture_type': 'vgg16',
'num_classes': num_classes,
'epochs': 2,
'batch_size': 16,
'optimizer_parameters': {'loss': "binary_crossentropy",
'optimizer': "adam",
'metrics': 'categorical_crossentropy'}
}
# images_size = data['data_source_img'].features.shape[1:4]
# ds_image = PrimaryNode('data_source_img')
# image_node = SecondaryNode('cnn', nodes_from=[ds_image])
# image_node.custom_params = {'image_shape': images_size,
# 'architecture_type': 'simplified',
# 'num_classes': num_classes,
# 'epochs': 2,
# 'batch_size': 16,
# 'optimizer_parameters': {'loss': "binary_crossentropy",
# 'optimizer': "adam",
# 'metrics': 'categorical_crossentropy'}
# }

# table
ds_table = PrimaryNode('data_source_table')
@@ -110,7 +112,7 @@ def generate_initial_pipeline_and_data(data: Union[InputData, MultiModalData],
text_node.custom_params = {'ngram_range': (1, 3), 'min_df': 0.001, 'max_df': 0.9}

# combining all sources together
logit_node = SecondaryNode('logit', nodes_from=[image_node, numeric_node, text_node])
logit_node = SecondaryNode('logit', nodes_from=[numeric_node, text_node])
logit_node.custom_params = {'max_iter': 100000, 'random_state': 42}
pipeline = Pipeline(logit_node)

@@ -124,15 +126,20 @@ def generate_initial_pipeline_and_data(data: Union[InputData, MultiModalData],
return pipeline, fit_data, predict_data


def run_multi_modal_pipeline(files_path: str, is_visualise=False) -> float:
def run_multi_modal_pipeline(files_path: str, is_visualise=True) -> float:
task = Task(TaskTypesEnum.classification)
images_size = (224, 224)

data = prepare_multi_modal_data(files_path, task, images_size)

pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(data, with_split=True)
initial_pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(data, with_split=True)

automl_model = Fedot(problem='classification', timeout=0.1)
pipeline = automl_model.fit(features=fit_data,
target=fit_data.target,
predefined_model='auto')

pipeline.fit(input_data=fit_data)
# pipeline.fit(input_data=fit_data)

if is_visualise:
pipeline.show()
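For context, the example now hands fitting to the Fedot API object instead of calling pipeline.fit directly, and predefined_model='auto' tells the API to fit its own initial assumption rather than run a search. A minimal sketch of the new flow (the predict step is an assumption based on the rest of the example, not shown in this hunk):

from fedot.api.main import Fedot

# fit_data and predict_data come from generate_initial_pipeline_and_data(...)
automl_model = Fedot(problem='classification', timeout=0.1)
pipeline = automl_model.fit(features=fit_data,
                            target=fit_data.target,
                            predefined_model='auto')  # fit the initial assumption, no search
prediction = automl_model.predict(features=predict_data)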
19 changes: 18 additions & 1 deletion fedot/api/api_utils/assumptions/preprocessing_builder.py
@@ -1,7 +1,8 @@
from typing import Union, Optional

from fedot.core.data.data import InputData
from fedot.core.data.data_preprocessing import data_has_missing_values, data_has_categorical_features
from fedot.core.data.data_preprocessing import data_has_missing_values, data_has_categorical_features, \
data_has_text_features, data_has_image_features
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.pipelines.node import Node
from fedot.core.pipelines.pipeline import Pipeline
@@ -21,10 +22,16 @@ def builder_for_data(cls,
data: Union[InputData, MultiModalData],
*initial_nodes: Optional[Node]) -> PipelineBuilder:
preprocessing_builder = cls(task_type, *initial_nodes)
if isinstance(data, MultiModalData):
data = data[str(initial_nodes[0])]
if data_has_missing_values(data):
preprocessing_builder = preprocessing_builder.with_gaps()
if data_has_categorical_features(data):
preprocessing_builder = preprocessing_builder.with_categorical()
if data_has_text_features(data):
preprocessing_builder = preprocessing_builder.with_tfidf()
if data_has_image_features(data):
preprocessing_builder = preprocessing_builder.with_cnn()
return preprocessing_builder.to_builder()

def with_gaps(self):
@@ -41,6 +48,16 @@ def with_scaling(self):
self._builder.add_node('scaling')
return self

def with_tfidf(self):
if self.task_type != TaskTypesEnum.ts_forecasting:
self._builder.add_node('tfidf')
return self

def with_cnn(self):
if self.task_type != TaskTypesEnum.ts_forecasting:
self._builder.add_node('cnn')
return self

def to_builder(self) -> PipelineBuilder:
""" Return result as PipelineBuilder. Scaling is applied final by default. """
return self.with_scaling()._builder
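The MultiModalData branch in builder_for_data means each data-source node gets preprocessing selected from its own modality only, since data[str(initial_nodes[0])] narrows the data to that node's source. A hedged illustration of the intended call pattern (the loop, the PreprocessingBuilder class name, and reuse of the example's data/task objects are assumptions, not code from this commit):

from fedot.core.pipelines.node import PrimaryNode

# Each source of a MultiModalData is inspected independently: the text branch
# should pick up tf-idf, while the table branch gets gap-filling/encoding.
for source_name in ('data_source_table', 'data_source_text'):
    node = PrimaryNode(source_name)
    builder = PreprocessingBuilder.builder_for_data(task.task_type, data, node)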
27 changes: 26 additions & 1 deletion fedot/core/data/data_preprocessing.py
@@ -5,6 +5,7 @@

from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.data.multi_modal import MultiModalData


def data_type_is_suitable_preprocessing(data: InputData) -> bool:
@@ -127,7 +128,7 @@ def data_has_categorical_features(data: Union[InputData, 'MultiModalData']) -> b
Check data for categorical columns.
Return bool, whether data has categorical columns or not
"""
if data.data_type is not DataTypesEnum.table:
if data.data_type is not DataTypesEnum.table: # this method doesn't work when data is MultiModalData
return False
data_has_categorical_columns = False

@@ -144,3 +145,27 @@ def data_has_categorical_features(data: Union[InputData, 'MultiModalData']) -> b
data_has_categorical_columns = len(cat_ids) > 0

return data_has_categorical_columns

# TODO: make these checks more accurate and thorough


def data_has_text_features(data: InputData) -> bool:
    """
    Check data for text fields.
    Return bool, whether data has text fields or not
    """
    return data.data_type is DataTypesEnum.text


def data_has_image_features(data: InputData) -> bool:
    """
    Check data for image fields.
    Return bool, whether data has image fields or not
    """
    return data.data_type is DataTypesEnum.image
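Both helpers currently dispatch on data_type alone (hence the TODO above). A minimal usage sketch, with the dataset values invented for illustration:

import numpy as np

from fedot.core.data.data import InputData
from fedot.core.data.data_preprocessing import data_has_image_features, data_has_text_features
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

# A two-row text dataset; any InputData with data_type=text passes the check.
text_data = InputData(idx=np.arange(2),
                      features=np.array(['good film', 'dull film']),
                      target=np.array([1, 0]),
                      task=Task(TaskTypesEnum.classification),
                      data_type=DataTypesEnum.text)

assert data_has_text_features(text_data)
assert not data_has_image_features(text_data)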
@@ -161,6 +161,7 @@ def _reasonability_check(features):
# For every column in table make check
for column_id in range(0, columns_amount):
column = features[:, column_id] if columns_amount >= 1 else features
# was `columns_amount > 1`: a one-column 2D table fell through to the else branch
if len(np.unique(column)) > 2:
non_bool_ids.append(column_id)
else:
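The operator change matters for one-column tables: with the old columns_amount > 1, a 2D array of shape (n, 1) took the else branch and the whole 2D array was used as the column. A small numpy illustration (not from the commit; columns_amount is assumed to be the table's column count):

import numpy as np

features = np.array([[0], [1], [0]])  # a one-column 2D table
columns_amount = features.shape[1]    # 1

old_column = features[:, 0] if columns_amount > 1 else features   # shape (3, 1), whole table
new_column = features[:, 0] if columns_amount >= 1 else features  # shape (3,), the column itself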
3 changes: 3 additions & 0 deletions test/unit/preprocessing/test_preprocessing_though_api.py
@@ -129,6 +129,9 @@ def data_with_categorical_target(with_nan: bool = False):

return train_input

# TODO test data with text features
# TODO test data with image features
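A hedged sketch of what the first TODO could become (the test name, dataset values, and use of predefined_model='auto' are assumptions, not part of this commit):

import numpy as np

from fedot.api.main import Fedot
from fedot.core.data.data import InputData
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum


def test_api_text_data_preprocessing():
    # A tiny text dataset should route through the tf-idf preprocessing branch.
    train_input = InputData(idx=np.arange(4),
                            features=np.array(['good', 'bad', 'fine', 'awful']),
                            target=np.array([1, 0, 1, 0]),
                            task=Task(TaskTypesEnum.classification),
                            data_type=DataTypesEnum.text)
    model = Fedot(problem='classification', timeout=0.1)
    pipeline = model.fit(features=train_input, target=train_input.target,
                         predefined_model='auto')
    assert pipeline is not None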


def test_correct_api_dataset_preprocessing():
    """ Check that dataset preprocessing is performed correctly when launching via the API. """
