From 9b8085bf6857445d945a11102ecc67a8d836f171 Mon Sep 17 00:00:00 2001 From: andreygetmanov Date: Fri, 17 Jun 2022 17:48:01 +0300 Subject: [PATCH] - CNN node now is added by processing_builder, not by preprocessing --- .../assumptions/assumptions_builder.py | 2 +- .../api_utils/assumptions/operations_filter.py | 9 ++++++--- .../assumptions/preprocessing_builder.py | 18 +++++++----------- .../api_utils/assumptions/task_assumptions.py | 1 + fedot/core/data/data_preprocessing.py | 12 ------------ .../data/data_operation_repository.json | 2 +- .../core/repository/data/model_repository.json | 6 ------ test/unit/api/test_assumption_builder.py | 18 +++++++++--------- 8 files changed, 25 insertions(+), 43 deletions(-) diff --git a/fedot/api/api_utils/assumptions/assumptions_builder.py b/fedot/api/api_utils/assumptions/assumptions_builder.py index f21701af9d..6587c1de60 100644 --- a/fedot/api/api_utils/assumptions/assumptions_builder.py +++ b/fedot/api/api_utils/assumptions/assumptions_builder.py @@ -93,7 +93,7 @@ def to_builders(self, initial_node: Optional[Node] = None) -> List[PipelineBuild valid_builders = [] for processing in self.assumptions_generator.processing_builders(): candidate_builder = preprocessing.merge_with(processing) - if self.ops_filter.satisfies(candidate_builder.to_pipeline()): + if self.ops_filter.satisfies(candidate_builder.to_pipeline(), self.data_type): valid_builders.append(candidate_builder) return valid_builders or [self.assumptions_generator.fallback_builder(self.ops_filter)] diff --git a/fedot/api/api_utils/assumptions/operations_filter.py b/fedot/api/api_utils/assumptions/operations_filter.py index f2607ddc5e..166e956fb2 100644 --- a/fedot/api/api_utils/assumptions/operations_filter.py +++ b/fedot/api/api_utils/assumptions/operations_filter.py @@ -1,12 +1,15 @@ from random import choice -from typing import Optional, List, Iterable +from typing import Optional, Iterable from fedot.core.pipelines.pipeline import Pipeline +from fedot.core.repository.dataset_types import DataTypesEnum class OperationsFilter: - def satisfies(self, pipeline: Optional[Pipeline]) -> bool: + def satisfies(self, pipeline: Optional[Pipeline], data_type: DataTypesEnum) -> bool: """ Checks if all operations in a Pipeline satisify this filter. """ + if data_type is DataTypesEnum.image and 'cnn' not in [node.operation.operation_type for node in pipeline.nodes]: + return False return True def sample(self) -> str: @@ -23,7 +26,7 @@ def __init__(self, available_operations: Iterable[str], available_task_operation self._whitelist = tuple(available_operations) self._choice_operations = tuple(available_task_operations) if available_task_operations else self._whitelist - def satisfies(self, pipeline: Optional[Pipeline]) -> bool: + def satisfies(self, pipeline: Optional[Pipeline], data_type: DataTypesEnum) -> bool: def node_ok(node): return node.operation.operation_type in self._whitelist diff --git a/fedot/api/api_utils/assumptions/preprocessing_builder.py b/fedot/api/api_utils/assumptions/preprocessing_builder.py index 16d9289d14..4351c753e5 100644 --- a/fedot/api/api_utils/assumptions/preprocessing_builder.py +++ b/fedot/api/api_utils/assumptions/preprocessing_builder.py @@ -2,18 +2,20 @@ from fedot.core.data.data import InputData from fedot.core.data.data_preprocessing import data_has_missing_values, data_has_categorical_features, \ - data_has_text_features, data_has_image_features + data_has_text_features from fedot.core.data.multi_modal import MultiModalData from fedot.core.pipelines.node import Node from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.pipeline_builder import PipelineBuilder +from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum class PreprocessingBuilder: - def __init__(self, task_type: TaskTypesEnum, *initial_nodes: Node): + def __init__(self, task_type: TaskTypesEnum, data_type: DataTypesEnum, *initial_nodes: Node): self.task_type = task_type + self.data_type = data_type self._builder = PipelineBuilder(*initial_nodes) @classmethod @@ -21,20 +23,18 @@ def builder_for_data(cls, task_type: TaskTypesEnum, data: Union[InputData, MultiModalData], *initial_nodes: Optional[Node]) -> PipelineBuilder: - preprocessing_builder = cls(task_type, *initial_nodes) if isinstance(data, MultiModalData): # if the data is unimodal, initial_nodes = tuple of None # if the data is multimodal, initial_nodes = tuple of 1 element (current data_source node) # so the whole data is reduced to the current data_source for an easier preprocessing data = data[str(initial_nodes[0])] + preprocessing_builder = cls(task_type, data.data_type, *initial_nodes) if data_has_missing_values(data): preprocessing_builder = preprocessing_builder.with_gaps() if data_has_categorical_features(data): preprocessing_builder = preprocessing_builder.with_categorical() if data_has_text_features(data): preprocessing_builder = preprocessing_builder.with_text_vectorizer() - if data_has_image_features(data): - preprocessing_builder = preprocessing_builder.with_image() return preprocessing_builder.to_builder() def with_gaps(self): @@ -42,12 +42,12 @@ def with_gaps(self): return self def with_categorical(self): - if self.task_type != TaskTypesEnum.ts_forecasting: + if self.task_type is not TaskTypesEnum.ts_forecasting: self._builder.add_node('one_hot_encoding') return self def with_scaling(self): - if self.task_type != TaskTypesEnum.ts_forecasting: + if self.task_type is not TaskTypesEnum.ts_forecasting and self.data_type is not DataTypesEnum.image: self._builder.add_node('scaling') return self @@ -55,10 +55,6 @@ def with_text_vectorizer(self): self._builder.add_node('tfidf') return self - def with_image(self): - self._builder.add_node('cnn') - return self - def to_builder(self) -> PipelineBuilder: """ Return result as PipelineBuilder. Scaling is applied final by default. """ return self.with_scaling()._builder diff --git a/fedot/api/api_utils/assumptions/task_assumptions.py b/fedot/api/api_utils/assumptions/task_assumptions.py index 05bb1b96fe..4a004f8aec 100644 --- a/fedot/api/api_utils/assumptions/task_assumptions.py +++ b/fedot/api/api_utils/assumptions/task_assumptions.py @@ -103,6 +103,7 @@ class ClassificationAssumptions(TaskAssumptions): builders = { 'rf': PipelineBuilder().add_node('rf'), 'logit': PipelineBuilder().add_node('logit'), + 'cnn': PipelineBuilder().add_node('cnn'), } def ensemble_operation(self) -> str: diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 8db3b5c4d8..de033ec9a1 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -137,15 +137,3 @@ def data_has_text_features(data: InputData) -> bool: """ return isinstance(data.features[0], str) and len(data.features.shape) == 1 - - -# TODO: @andreygetmanov (should it be there or outside preprocessing) -def data_has_image_features(data: InputData) -> bool: - """ - Check data for text fields. - Return bool, whether data has text fields or not - """ - if data.data_type is DataTypesEnum.image: - return True - else: - return False diff --git a/fedot/core/repository/data/data_operation_repository.json b/fedot/core/repository/data/data_operation_repository.json index 20009e89af..95b1592241 100644 --- a/fedot/core/repository/data/data_operation_repository.json +++ b/fedot/core/repository/data/data_operation_repository.json @@ -333,7 +333,7 @@ "meta": "text_preprocessing_sklearn", "tags": ["non-default"] }, - "word2vec": { + "word2vec_pretrained": { "meta": "text_classification_gensim", "tags": ["non-default"] }, diff --git a/fedot/core/repository/data/model_repository.json b/fedot/core/repository/data/model_repository.json index ace517755a..ee733a55e1 100644 --- a/fedot/core/repository/data/model_repository.json +++ b/fedot/core/repository/data/model_repository.json @@ -420,12 +420,6 @@ "non_linear" ] }, - "word2vec_pretrained": { - "meta": "text_classification_gensim", - "tags": [ - "non-default", "text" - ] - }, "xgboost": { "meta": "sklearn_class", "presets": ["*tree"], diff --git a/test/unit/api/test_assumption_builder.py b/test/unit/api/test_assumption_builder.py index f16aa3c77b..7a0b708676 100644 --- a/test/unit/api/test_assumption_builder.py +++ b/test/unit/api/test_assumption_builder.py @@ -65,24 +65,24 @@ def preprocess(task_type: TaskTypesEnum, data: Union[InputData, MultiModalData]) def test_preprocessing_builder_no_data(): - assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression).to_pipeline(), 'scaling') - assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression).with_gaps().to_pipeline(), + assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).to_pipeline(), 'scaling') + assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).with_gaps().to_pipeline(), 'simple_imputation') - assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression).with_categorical().to_pipeline(), + assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).with_categorical().to_pipeline(), 'one_hot_encoding') assert pipeline_contains_all( - PreprocessingBuilder(TaskTypesEnum.regression).with_gaps().with_categorical().to_pipeline(), + PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).with_gaps().with_categorical().to_pipeline(), 'simple_imputation', 'one_hot_encoding') # have default preprocessing pipelines - assert PreprocessingBuilder(TaskTypesEnum.regression).to_pipeline() is not None - assert PreprocessingBuilder(TaskTypesEnum.classification).to_pipeline() is not None - assert PreprocessingBuilder(TaskTypesEnum.clustering).to_pipeline() is not None + assert PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).to_pipeline() is not None + assert PreprocessingBuilder(TaskTypesEnum.classification, DataTypesEnum.table).to_pipeline() is not None + assert PreprocessingBuilder(TaskTypesEnum.clustering, DataTypesEnum.table).to_pipeline() is not None # have no default preprocessing pipelines without additional options - assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting).to_pipeline() is None + assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting, DataTypesEnum.ts).to_pipeline() is None # with additional options ok - assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting).with_gaps().to_pipeline() is not None + assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting, DataTypesEnum.ts).with_gaps().to_pipeline() is not None def test_preprocessing_builder_with_data():