Skip to content

Commit

Permalink
minor changes:
Browse files Browse the repository at this point in the history
- added docstring for _column_contains_text
- multimodal_wine dataset is moved to a more appropriate place
- helper funcs of multi_modal.py are now protected (renamed with a leading underscore)
  • Loading branch information
andreygetmanov committed Jul 21, 2022
1 parent c11ba62 commit 6844f9e
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 31 deletions.
8 changes: 4 additions & 4 deletions examples/advanced/multimodal_text_num_example.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from pathlib import Path

from fedot.api.main import Fedot

Expand All @@ -8,10 +9,9 @@
from fedot.core.utils import fedot_project_root


def run_multi_modal_example(files_path: str, is_visualise=True) -> float:
def run_multi_modal_example(file_path: str, is_visualise=True) -> float:
task = Task(TaskTypesEnum.classification)

path = os.path.join(str(fedot_project_root()), files_path, 'multimodal_wine.csv')
path = Path(fedot_project_root(), file_path)
data = MultiModalData.from_csv(file_path=path, task=task, target_columns='variety', index_col=None)
fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7)

Expand All @@ -31,4 +31,4 @@ def run_multi_modal_example(files_path: str, is_visualise=True) -> float:


if __name__ == '__main__':
run_multi_modal_example(files_path='examples/data/multimodal_wine', is_visualise=True)
run_multi_modal_example(file_path='examples/data/multimodal_wine.csv', is_visualise=True)
File renamed without changes.
59 changes: 32 additions & 27 deletions fedot/core/data/multi_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,17 +102,17 @@ def from_csv_time_series(task: Task,
'Multivariate predict not supported in this function yet.')
else:
train_data, _ = \
prepare_multimodal_ts_data(dataframe=df,
features=var_names,
forecast_length=0)
_prepare_multimodal_ts_data(dataframe=df,
features=var_names,
forecast_length=0)

if target_column is not None:
target = np.array(df[target_column])
else:
target = np.array(df[df.columns[-1]])

# create labels for data sources
data_part_transformation_func = partial(array_to_input_data, idx=idx,
data_part_transformation_func = partial(_array_to_input_data, idx=idx,
target_array=target, task=task,
data_type=DataTypesEnum.ts)

Expand All @@ -123,6 +123,7 @@ def from_csv_time_series(task: Task,

return input_data

@staticmethod
def from_csv(file_path=None,
delimiter=',',
task: Task = Task(TaskTypesEnum.classification),
Expand All @@ -135,7 +136,7 @@ def from_csv(file_path=None,
:param columns_to_drop: the names of columns that should be dropped
:param delimiter: the delimiter to separate the columns
:param task: the task that should be solved with data
:param text_columns: name of columns that contain text data
:param text_columns: names of columns that contain text data
:param target_columns: name of target column (last column if empty and no target if None)
:param index_col: column name or index to use as the Data.idx;
if None then arrange new unique index
Expand All @@ -149,13 +150,13 @@ def from_csv(file_path=None,
idx = data_frame.index.to_numpy()

if not text_columns:
text_columns = define_text_columns(data_frame)
text_columns = _define_text_columns(data_frame)

data_text = prepare_multimodal_text_data(data_frame, text_columns)
data_text = _prepare_multimodal_text_data(data_frame, text_columns)
data_frame_table = data_frame.drop(columns=text_columns)
table_features, target = process_target_and_features(data_frame_table, target_columns)

data_part_transformation_func = partial(array_to_input_data, idx=idx,
data_part_transformation_func = partial(_array_to_input_data, idx=idx,
target_array=target, task=task)

# create labels for data sources
Expand All @@ -173,29 +174,34 @@ def from_csv(file_path=None,
return multi_modal_data


def define_text_columns(data_frame: pd.DataFrame) -> List[str]:
def _define_text_columns(data_frame: pd.DataFrame) -> List[str]:
"""
:param data_frame: pandas dataframe with data
:return: list of text columns' names
"""
text_columns = []
for column in data_frame.columns:
if column_contains_text(data_frame[column]):
text_columns.append(column)
for column_name in data_frame.columns:
if _column_contains_text(data_frame[column_name]):
text_columns.append(column_name)
return text_columns


def column_contains_text(column: pd.Series) -> bool:
def _column_contains_text(column: pd.Series) -> bool:
"""
Column contains text if:
1. it's not numerical or latent numerical
(e.g. ['1.2', '2.3', '3.4', ...] is numerical too)
2. fraction of unique values is more than 0.95
:param column: pandas series with data
:return: True if column contains text
"""
if column.dtype == object and not isfloat(column):
if column.dtype == object and not _isfloat(column):
return len(column.unique()) / len(column) > 0.95
return False


def isfloat(column: pd.Series) -> bool:
def _isfloat(column: pd.Series) -> bool:
"""
:param column: pandas series with data
:return: True if column contains only float or nan values
Expand All @@ -207,25 +213,24 @@ def isfloat(column: pd.Series) -> bool:
return False


def prepare_multimodal_text_data(dataframe: pd.DataFrame, text_columns: List[str]) -> dict:
def _prepare_multimodal_text_data(dataframe: pd.DataFrame, text_columns: List[str]) -> dict:
""" Prepares MultiModal text data in a form of dictionary
:param dataframe: pandas DataFrame to process
:param text_columns: list of text columns names
:param text_columns: list of text columns' names
:return multimodal_text_data: dictionary with numpy arrays of text data
"""
multi_modal_text_data = {}

for column in text_columns:

text_feature = np.array(dataframe[column])
multi_modal_text_data.update({column: text_feature})
for column_name in text_columns:
text_feature = np.array(dataframe[column_name])
multi_modal_text_data.update({column_name: text_feature})

return multi_modal_text_data


def prepare_multimodal_ts_data(dataframe: pd.DataFrame, features: list, forecast_length: int) -> dict:
def _prepare_multimodal_ts_data(dataframe: pd.DataFrame, features: list, forecast_length: int) -> dict:
""" Prepare MultiModal data for time series forecasting task in a form of
dictionary
Expand Down Expand Up @@ -256,11 +261,11 @@ def prepare_multimodal_ts_data(dataframe: pd.DataFrame, features: list, forecast
return multi_modal_train, multi_modal_test


def array_to_input_data(features_array: np.array,
target_array: np.array,
idx: Optional[np.array] = None,
task: Task = Task(TaskTypesEnum.classification),
data_type: DataTypesEnum = DataTypesEnum.table):
def _array_to_input_data(features_array: np.array,
target_array: np.array,
idx: Optional[np.array] = None,
task: Task = Task(TaskTypesEnum.classification),
data_type: DataTypesEnum = DataTypesEnum.table):
"""
Transforms numpy array to InputData object
"""
Expand Down

0 comments on commit 6844f9e

Please sign in to comment.