Skip to content

Commit

Permalink
- CNN node is now added by processing_builder, not by preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
andreygetmanov committed Jun 17, 2022
1 parent b2a94ce commit 9b8085b
Show file tree
Hide file tree
Showing 8 changed files with 25 additions and 43 deletions.
2 changes: 1 addition & 1 deletion fedot/api/api_utils/assumptions/assumptions_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def to_builders(self, initial_node: Optional[Node] = None) -> List[PipelineBuild
valid_builders = []
for processing in self.assumptions_generator.processing_builders():
candidate_builder = preprocessing.merge_with(processing)
if self.ops_filter.satisfies(candidate_builder.to_pipeline()):
if self.ops_filter.satisfies(candidate_builder.to_pipeline(), self.data_type):
valid_builders.append(candidate_builder)
return valid_builders or [self.assumptions_generator.fallback_builder(self.ops_filter)]

Expand Down
9 changes: 6 additions & 3 deletions fedot/api/api_utils/assumptions/operations_filter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
from random import choice
from typing import Optional, List, Iterable
from typing import Optional, Iterable

from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.repository.dataset_types import DataTypesEnum


class OperationsFilter:
def satisfies(self, pipeline: Optional[Pipeline]) -> bool:
def satisfies(self, pipeline: Optional[Pipeline], data_type: DataTypesEnum) -> bool:
""" Checks if all operations in a Pipeline satisify this filter. """
if data_type is DataTypesEnum.image and 'cnn' not in [node.operation.operation_type for node in pipeline.nodes]:
return False
return True

def sample(self) -> str:
Expand All @@ -23,7 +26,7 @@ def __init__(self, available_operations: Iterable[str], available_task_operation
self._whitelist = tuple(available_operations)
self._choice_operations = tuple(available_task_operations) if available_task_operations else self._whitelist

def satisfies(self, pipeline: Optional[Pipeline]) -> bool:
def satisfies(self, pipeline: Optional[Pipeline], data_type: DataTypesEnum) -> bool:
def node_ok(node):
return node.operation.operation_type in self._whitelist

Expand Down
18 changes: 7 additions & 11 deletions fedot/api/api_utils/assumptions/preprocessing_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,63 +2,59 @@

from fedot.core.data.data import InputData
from fedot.core.data.data_preprocessing import data_has_missing_values, data_has_categorical_features, \
data_has_text_features, data_has_image_features
data_has_text_features
from fedot.core.data.multi_modal import MultiModalData
from fedot.core.pipelines.node import Node
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.pipelines.pipeline_builder import PipelineBuilder
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import TaskTypesEnum


class PreprocessingBuilder:

def __init__(self, task_type: TaskTypesEnum, *initial_nodes: Node):
def __init__(self, task_type: TaskTypesEnum, data_type: DataTypesEnum, *initial_nodes: Node):
self.task_type = task_type
self.data_type = data_type
self._builder = PipelineBuilder(*initial_nodes)

@classmethod
def builder_for_data(cls,
task_type: TaskTypesEnum,
data: Union[InputData, MultiModalData],
*initial_nodes: Optional[Node]) -> PipelineBuilder:
preprocessing_builder = cls(task_type, *initial_nodes)
if isinstance(data, MultiModalData):
# if the data is unimodal, initial_nodes = tuple of None
# if the data is multimodal, initial_nodes = tuple of 1 element (current data_source node)
# so the whole data is reduced to the current data_source for an easier preprocessing
data = data[str(initial_nodes[0])]
preprocessing_builder = cls(task_type, data.data_type, *initial_nodes)
if data_has_missing_values(data):
preprocessing_builder = preprocessing_builder.with_gaps()
if data_has_categorical_features(data):
preprocessing_builder = preprocessing_builder.with_categorical()
if data_has_text_features(data):
preprocessing_builder = preprocessing_builder.with_text_vectorizer()
if data_has_image_features(data):
preprocessing_builder = preprocessing_builder.with_image()
return preprocessing_builder.to_builder()

def with_gaps(self):
self._builder.add_node('simple_imputation')
return self

def with_categorical(self):
if self.task_type != TaskTypesEnum.ts_forecasting:
if self.task_type is not TaskTypesEnum.ts_forecasting:
self._builder.add_node('one_hot_encoding')
return self

def with_scaling(self):
if self.task_type != TaskTypesEnum.ts_forecasting:
if self.task_type is not TaskTypesEnum.ts_forecasting and self.data_type is not DataTypesEnum.image:
self._builder.add_node('scaling')
return self

def with_text_vectorizer(self):
self._builder.add_node('tfidf')
return self

def with_image(self):
self._builder.add_node('cnn')
return self

def to_builder(self) -> PipelineBuilder:
""" Return result as PipelineBuilder. Scaling is applied final by default. """
return self.with_scaling()._builder
Expand Down
1 change: 1 addition & 0 deletions fedot/api/api_utils/assumptions/task_assumptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ class ClassificationAssumptions(TaskAssumptions):
builders = {
'rf': PipelineBuilder().add_node('rf'),
'logit': PipelineBuilder().add_node('logit'),
'cnn': PipelineBuilder().add_node('cnn'),
}

def ensemble_operation(self) -> str:
Expand Down
12 changes: 0 additions & 12 deletions fedot/core/data/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,15 +137,3 @@ def data_has_text_features(data: InputData) -> bool:
"""

return isinstance(data.features[0], str) and len(data.features.shape) == 1


# TODO: @andreygetmanov (should it be there or outside preprocessing)
def data_has_image_features(data: InputData) -> bool:
"""
Check data for text fields.
Return bool, whether data has text fields or not
"""
if data.data_type is DataTypesEnum.image:
return True
else:
return False
2 changes: 1 addition & 1 deletion fedot/core/repository/data/data_operation_repository.json
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@
"meta": "text_preprocessing_sklearn",
"tags": ["non-default"]
},
"word2vec": {
"word2vec_pretrained": {
"meta": "text_classification_gensim",
"tags": ["non-default"]
},
Expand Down
6 changes: 0 additions & 6 deletions fedot/core/repository/data/model_repository.json
Original file line number Diff line number Diff line change
Expand Up @@ -420,12 +420,6 @@
"non_linear"
]
},
"word2vec_pretrained": {
"meta": "text_classification_gensim",
"tags": [
"non-default", "text"
]
},
"xgboost": {
"meta": "sklearn_class",
"presets": ["*tree"],
Expand Down
18 changes: 9 additions & 9 deletions test/unit/api/test_assumption_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,24 +65,24 @@ def preprocess(task_type: TaskTypesEnum, data: Union[InputData, MultiModalData])


def test_preprocessing_builder_no_data():
assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression).to_pipeline(), 'scaling')
assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression).with_gaps().to_pipeline(),
assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).to_pipeline(), 'scaling')
assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).with_gaps().to_pipeline(),
'simple_imputation')
assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression).with_categorical().to_pipeline(),
assert pipeline_contains_all(PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).with_categorical().to_pipeline(),
'one_hot_encoding')
assert pipeline_contains_all(
PreprocessingBuilder(TaskTypesEnum.regression).with_gaps().with_categorical().to_pipeline(),
PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).with_gaps().with_categorical().to_pipeline(),
'simple_imputation', 'one_hot_encoding')

# have default preprocessing pipelines
assert PreprocessingBuilder(TaskTypesEnum.regression).to_pipeline() is not None
assert PreprocessingBuilder(TaskTypesEnum.classification).to_pipeline() is not None
assert PreprocessingBuilder(TaskTypesEnum.clustering).to_pipeline() is not None
assert PreprocessingBuilder(TaskTypesEnum.regression, DataTypesEnum.table).to_pipeline() is not None
assert PreprocessingBuilder(TaskTypesEnum.classification, DataTypesEnum.table).to_pipeline() is not None
assert PreprocessingBuilder(TaskTypesEnum.clustering, DataTypesEnum.table).to_pipeline() is not None

# have no default preprocessing pipelines without additional options
assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting).to_pipeline() is None
assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting, DataTypesEnum.ts).to_pipeline() is None
# with additional options ok
assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting).with_gaps().to_pipeline() is not None
assert PreprocessingBuilder(TaskTypesEnum.ts_forecasting, DataTypesEnum.ts).with_gaps().to_pipeline() is not None


def test_preprocessing_builder_with_data():
Expand Down

0 comments on commit 9b8085b

Please sign in to comment.