From b2a94ce62d9f34e5a24a0408ea96340095940f23 Mon Sep 17 00:00:00 2001 From: andreygetmanov Date: Thu, 9 Jun 2022 17:54:22 +0300 Subject: [PATCH] - change of multimodal example and case to be run by API --- cases/multi_modal_genre_prediction.py | 75 +++++------------------ examples/advanced/multi_modal_pipeline.py | 54 +--------------- 2 files changed, 18 insertions(+), 111 deletions(-) diff --git a/cases/multi_modal_genre_prediction.py b/cases/multi_modal_genre_prediction.py index 793b2bc3df..f55f72d469 100644 --- a/cases/multi_modal_genre_prediction.py +++ b/cases/multi_modal_genre_prediction.py @@ -1,77 +1,32 @@ -import datetime - -from sklearn.metrics import f1_score - -from examples.advanced.multi_modal_pipeline import calculate_validation_metric, \ - generate_initial_pipeline_and_data, prepare_multi_modal_data -from fedot.core.composer.composer_builder import ComposerBuilder -from fedot.core.composer.gp_composer.gp_composer import PipelineComposerRequirements -from fedot.core.log import default_log -from fedot.core.optimisers.gp_comp.gp_optimiser import GPGraphOptimiserParameters, GeneticSchemeTypesEnum -from fedot.core.pipelines.tuning.unified import PipelineTuner -from fedot.core.repository.operation_types_repository import get_operations_for_task -from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum +from fedot.api.main import Fedot + +from examples.advanced.multi_modal_pipeline import calculate_validation_metric, prepare_multi_modal_data +from fedot.core.data.data_split import train_test_data_setup from fedot.core.repository.tasks import Task, TaskTypesEnum -def run_multi_modal_case(files_path, is_visualise=True, timeout=datetime.timedelta(minutes=1)): +def run_multi_modal_case(files_path, is_visualise=True): task = Task(TaskTypesEnum.classification) images_size = (224, 224) data = prepare_multi_modal_data(files_path, task, images_size) - initial_pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(data, with_split=True) - - # the search of the models provided by the framework that can be used as nodes in a pipeline for the selected task - available_model_types = get_operations_for_task(task=task, mode='model') - - # the choice of the metric for the pipeline quality assessment during composition - metric_function = ClassificationMetricsEnum.f1 - # the choice and initialisation of the GP search - composer_requirements = PipelineComposerRequirements( - primary=available_model_types, - secondary=available_model_types, max_arity=3, - max_depth=5, pop_size=5, num_of_generations=5, - crossover_prob=0.8, mutation_prob=0.8, timeout=timeout) - - # GP optimiser parameters choice - scheme_type = GeneticSchemeTypesEnum.parameter_free - optimiser_parameters = GPGraphOptimiserParameters(genetic_scheme_type=scheme_type) - - # Create builder for composer and set composer params - logger = default_log('FEDOT logger', verbose_level=4) - - # the multi modal template (with data sources) is passed as initial assumption for composer - builder = ComposerBuilder(task=task) \ - .with_requirements(composer_requirements) \ - .with_metrics(metric_function) \ - .with_optimiser_params(parameters=optimiser_parameters) \ - .with_logger(logger=logger) \ - .with_initial_pipelines([initial_pipeline]) \ - .with_cache('multi_modal_opt.cache') - - # Create GP-based composer - composer = builder.build() - - # the optimal pipeline generation by composition - the most time-consuming task - pipeline_evo_composed = composer.compose_pipeline(data=fit_data) - pipeline_evo_composed.print_structure() - - # tuning of the composed pipeline - pipeline_tuner = PipelineTuner(pipeline=pipeline_evo_composed, task=task, iterations=15) - tuned_pipeline = pipeline_tuner.tune_pipeline(input_data=fit_data, - loss_function=f1_score, - loss_params={'average': 'micro'}) - tuned_pipeline.print_structure() - tuned_pipeline.fit(input_data=fit_data) + fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.6) + + # tuner on image data is not implemented yet, timeout increase can cause unstable work + automl_model = Fedot(problem='classification', timeout=0.1) + pipeline = automl_model.fit(features=fit_data, + target=fit_data.target) if is_visualise: - tuned_pipeline.show() + pipeline.show() + + prediction = pipeline.predict(predict_data, output_mode='labels') - prediction = tuned_pipeline.predict(predict_data, output_mode='labels') err = calculate_validation_metric(predict_data, prediction) print(f'F1 micro for validation sample is {err}') + return err diff --git a/examples/advanced/multi_modal_pipeline.py b/examples/advanced/multi_modal_pipeline.py index e098e88c84..545efac693 100644 --- a/examples/advanced/multi_modal_pipeline.py +++ b/examples/advanced/multi_modal_pipeline.py @@ -6,8 +6,6 @@ from sklearn.metrics import f1_score as f1 from cases.dataset_preparation import unpack_archived_data -from fedot.core.pipelines.pipeline import Pipeline -from fedot.core.pipelines.node import PrimaryNode, SecondaryNode from fedot.core.data.data import InputData, OutputData from fedot.core.data.data_split import train_test_data_setup from fedot.core.data.multi_modal import MultiModalData @@ -71,62 +69,16 @@ def prepare_multi_modal_data(files_path: str, task: Task, images_size: tuple = ( return data -def generate_initial_pipeline_and_data(data: Union[InputData, MultiModalData], - with_split=True) -> tuple: - """ - Generates initial pipeline for data from 3 different sources (table, images and text) - Each source is the primary node for its subpipeline - - :param data: multimodal data (from 3 different sources: table, text, image) - :param with_split: if True, splits the sample on train/test - :return: pipeline object, 2 multimodal data objects (fit and predict) - """ - - # image - ds_image = PrimaryNode('data_source_img') - image_node = SecondaryNode('cnn', nodes_from=[ds_image]) - image_node.custom_params = {'architecture_type': 'simplified', - 'epochs': 2, - 'batch_size': 16, - 'optimizer_parameters': {'loss': "binary_crossentropy", - 'optimizer': "adam", - 'metrics': 'categorical_crossentropy'} - } - - # table - ds_table = PrimaryNode('data_source_table') - numeric_node = SecondaryNode('scaling', nodes_from=[ds_table]) - - # text - ds_text = PrimaryNode('data_source_text') - node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text]) - text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean]) - text_node.custom_params = {'ngram_range': (1, 3), 'min_df': 0.001, 'max_df': 0.9} - - # combining all sources together - logit_node = SecondaryNode('logit', nodes_from=[numeric_node, text_node, image_node]) - logit_node.custom_params = {'max_iter': 100000, 'random_state': 42} - pipeline = Pipeline(logit_node) - - # train/test ratio - ratio = 0.6 - if with_split: - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=ratio) - else: - fit_data, predict_data = data, data - - return pipeline, fit_data, predict_data - - def run_multi_modal_pipeline(files_path: str, is_visualise=True) -> float: task = Task(TaskTypesEnum.classification) images_size = (224, 224) data = prepare_multi_modal_data(files_path, task, images_size) - initial_pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(data, with_split=True) + fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.6) - automl_model = Fedot(problem='classification', timeout=5) + # tuner on image data is not implemented yet, timeout increase can cause unstable work + automl_model = Fedot(problem='classification', timeout=0.1) pipeline = automl_model.fit(features=fit_data, target=fit_data.target)