Add unit tests for ALL InputData and Data classes methods #1200 #1251

Open · wants to merge 5 commits into base: master

Changes from 2 commits
2 changes: 1 addition & 1 deletion docs/source/advanced/cli_call.rst
@@ -69,4 +69,4 @@ problems decision are presented.

The command below runs a classification problem from the console:

``python --problem classification --train ../../test/data/simple_classification.csv --test ../../test/data/simple_classification.csv --target Y --timeout 0.1``
``python --problem classification --train ../../test/data/classification/simple_classification.csv --test ../../test/data/classification/simple_classification.csv --target Y --timeout 0.1``
@@ -1,3 +1,3 @@
set python_path = "DEFAULT"
cd ../../fedot/api
%python_path% fedot_cli.py --problem classification --train ../../test/data/simple_classification.csv --test ../../test/data/simple_classification.csv --target Y --timeout 0.1
%python_path% fedot_cli.py --problem classification --train ../../test/data/classification/simple_classification.csv --test ../../test/data/classification/simple_classification.csv --target Y --timeout 0.1
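
Aside: both snippets above resolve to the same console call against the relocated dataset. A minimal sketch of driving that call from Python (assumptions: it is launched from a directory where fedot/api/fedot_cli.py and the relative test-data path resolve as in the batch file above; the flags themselves are taken verbatim from the diff):

import subprocess

data_path = "../../test/data/classification/simple_classification.csv"
cmd = [
    "python", "fedot_cli.py",
    "--problem", "classification",
    "--train", data_path,
    "--test", data_path,
    "--target", "Y",
    "--timeout", "0.1",
]
# Runs the FEDOT CLI with the same arguments as the documentation and run script;
# check=True raises CalledProcessError if the classification run exits with an error.
subprocess.run(cmd, check=True)
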
Binary file not shown.
Binary file not shown.
62 changes: 62 additions & 0 deletions test/data/regression/simple_regression.csv
@@ -0,0 +1,62 @@
Unnamed: 0,Asphalt,N-par,Iso-par,AROM UV,P,V,S,target
41,0.21,71.09,18.5,2.13,839.3,13.44,0.09,0.223
8,1.2,63.47,20.44,7.53,891.8,42.49,0.61,0.168
43,2.37,61.27,20.41,9.0,906.8,62.9,1.01,0.11
21,4.16,59.83,19.72,9.79,916.5,81.55,1.55,0.07
27,1.26,68.27,18.77,4.04,858.6,19.59,0.47,0.99
51,0.98,68.44,18.9,3.96,857.5,19.19,0.39,1.75
20,4.24,59.62,19.75,9.93,917.9,85.16,1.58,0.05
42,2.3,59.63,21.0,10.17,918.2,87.75,1.04,0.03
24,3.29,62.36,19.44,8.07,899.1,50.44,1.23,0.34
53,1.59,65.0,19.66,4.11,881.2,32.51,0.67,1.21
59,2.93,60.93,20.16,4.03,909.1,66.68,1.18,0.242
10,1.05,64.52,20.18,6.78,884.5,35.43,0.53,0.196
45,2.51,64.44,19.24,6.66,884.9,35.24,0.94,1.03
22,4.09,60.05,19.7,9.64,915.0,78.14,1.53,0.102
60,3.45,60.61,19.93,2.43,911.2,70.47,1.33,0.252
14,0.22,70.36,18.74,2.66,844.3,14.71,0.11,0.239
29,0.31,71.04,18.45,2.16,839.6,13.53,0.12,0.35
55,3.28,60.71,20.0,2.96,910.5,69.17,1.28,0.223
32,2.07,61.45,20.55,8.88,905.5,60.93,0.92,0.03
6,1.31,62.75,20.61,8.03,896.7,48.26,0.66,0.072
57,1.86,61.6,20.65,7.35,904.7,59.6,0.85,0.087
37,1.76,64.89,19.59,6.43,881.9,33.0,0.72,0.618
54,1.63,66.53,19.11,2.03,870.5,25.36,0.63,2.06
33,2.97,60.06,20.43,9.77,915.0,79.26,1.21,0.123
38,1.3,66.73,19.26,5.15,869.2,24.7,0.53,0.874
12,0.69,67.1,19.54,4.97,866.8,23.53,0.35,0.465
58,2.37,61.27,20.41,5.76,906.8,62.86,1.01,0.178
28,0.62,70.13,18.55,2.78,845.8,15.21,0.23,0.68
31,2.21,60.53,20.77,9.53,911.9,72.93,0.99,0.02
46,2.59,64.39,19.21,6.68,885.2,35.5,0.97,1.726
25,2.59,64.39,19.21,6.68,885.2,35.5,0.97,1.726
7,1.28,62.93,20.57,7.91,895.5,46.73,0.65,0.134
48,0.72,62.53,24.37,5.1,877.2,28.25,0.37,0.28
9,1.18,63.64,20.39,7.4,890.6,41.19,0.6,0.169
39,0.85,68.51,18.95,3.92,859.0,19.0,0.35,0.59
52,1.51,63.44,20.24,7.51,891.9,42.51,0.7,0.341
49,0.93,65.39,19.96,6.17,878.5,30.7,0.47,0.452
11,0.93,65.39,19.96,6.17,878.5,30.7,0.47,0.452
18,4.39,59.19,19.8,10.23,920.9,93.0,1.64,0.03
35,2.47,62.04,20.09,8.4,901.5,54.18,1.01,0.322
47,0.85,58.08,28.54,5.7,890.6,37.14,0.43,0.19
4,1.36,62.39,20.7,8.29,899.2,51.54,0.69,0.019
15,0.11,71.15,18.54,2.1,838.9,13.36,0.06,0.108
36,2.23,63.0,19.92,7.73,894.8,45.5,0.91,0.65
1,1.97,58.1,21.76,11.32,928.7,122.8,0.99,0.02
56,3.17,59.1,20.63,5.42,921.8,97.04,1.3,0.02
17,4.76,58.09,19.92,10.97,928.4,117.2,1.78,0.02
23,4.02,60.26,19.68,9.5,913.5,74.9,1.5,0.119
0,2.24,56.15,22.24,12.7,942.2,193.1,1.14,0.02
3,1.43,61.85,20.84,8.67,902.9,57.04,0.73,0.01
34,2.72,61.06,20.26,9.08,908.2,65.18,1.11,0.123
50,0.93,66.95,19.43,5.04,867.8,24.0,0.42,0.66
30,2.35,59.6,21.0,10.19,918.3,88.19,1.05,0.02
44,2.44,62.88,19.82,7.8,895.7,46.43,0.98,0.62
13,0.45,68.75,19.14,3.8,855.4,18.47,0.23,0.39
26,1.92,66.36,18.98,5.34,871.7,25.95,0.72,2.657
40,0.42,70.24,18.65,2.72,845.1,15.0,0.17,0.51
19,4.31,59.4,19.77,10.08,919.4,89.0,1.61,0.04
2,1.7,60.0,21.29,9.98,915.6,81.93,0.86,0.015
16,5.53,55.86,20.18,12.5,943.7,194.9,2.07,0.02
5,1.33,62.57,20.66,8.16,897.9,49.87,0.67,0.047
Binary file added test/data/regression/simple_regression.npy
Binary file not shown.
2 changes: 1 addition & 1 deletion test/integration/api/test_api_cli_params.py
@@ -26,7 +26,7 @@ def test_cli_with_parameters():
f'--cv_folds 2 --target sea_height --train {ts_train_path} '
f'--test {ts_train_path} --for_len 10'
).split()
class_train_path = project_root_path.joinpath('test/data/simple_classification.csv')
class_train_path = project_root_path.joinpath('test/data/classification/simple_classification.csv')
class_call = (
f'--problem classification --train {class_train_path} --test {class_train_path} --target Y '
'--preset fast_train --timeout 0.1 --depth 3 --arity 3 '
4 changes: 2 additions & 2 deletions test/integration/api/test_api_utils.py
@@ -14,7 +14,7 @@
from fedot.preprocessing.preprocessing import DataPreprocessor
from test.data.datasets import get_cholesterol_dataset
from test.integration.api.test_main_api import get_dataset
from test.unit.tasks.test_classification import get_binary_classification_data
from test.unit.tasks.test_classification import get_binary_classification_data_from_csv


def test_compose_fedot_model_without_tuning():
@@ -32,7 +32,7 @@ def test_output_binary_classification_correct():

task_type = 'classification'

data = get_binary_classification_data()
data = get_binary_classification_data_from_csv()

train_data, test_data = train_test_data_setup(data, shuffle=True)

2 changes: 1 addition & 1 deletion test/integration/composer/test_history.py
@@ -68,7 +68,7 @@ def _test_individuals_in_history(history: OptHistory):

@pytest.mark.parametrize('n_jobs', [1, 2])
def test_newly_generated_history(n_jobs: int):
file_path_train = fedot_project_root().joinpath('test/data/simple_classification.csv')
file_path_train = fedot_project_root().joinpath('test/data/classification/simple_classification.csv')

num_of_gens = 2
auto_model = Fedot(problem='classification', seed=42,
2 changes: 1 addition & 1 deletion test/integration/pipelines/tuning/test_pipeline_tuning.py
@@ -40,7 +40,7 @@ def regression_dataset():
@pytest.fixture()
def classification_dataset():
test_file_path = str(os.path.dirname(__file__))
file = os.path.join(str(fedot_project_root()), 'test/data/simple_classification.csv')
file = os.path.join(str(fedot_project_root()), 'test/data/classification/simple_classification.csv')
return InputData.from_csv(os.path.join(test_file_path, file), task=Task(TaskTypesEnum.classification))


3 changes: 2 additions & 1 deletion test/integration/real_applications/test_real_cases.py
@@ -13,7 +13,8 @@


def test_credit_scoring_problem():
full_path_train = full_path_test = fedot_project_root().joinpath('test/data/simple_classification.csv')
full_path_train = full_path_test = \
fedot_project_root().joinpath('test/data/classification/simple_classification.csv')

roc_auc_test = run_credit_scoring_problem(full_path_train, full_path_test, timeout=5, target='Y', n_jobs=1)
assert roc_auc_test > 0.5
6 changes: 3 additions & 3 deletions test/unit/data/test_data.py
@@ -48,7 +48,7 @@ def test_data_subset_incorrect(data_setup):

def test_data_from_csv():
test_file_path = str(os.path.dirname(__file__))
file = '../../data/simple_classification.csv'
file = '../../data/classification/simple_classification.csv'
task = Task(TaskTypesEnum.classification)
df = pd.read_csv(os.path.join(test_file_path, file))
data_array = np.array(df).T
@@ -71,7 +71,7 @@

def test_with_custom_target():
test_file_path = str(os.path.dirname(__file__))
file = '../../data/simple_classification.csv'
file = '../../data/classification/simple_classification.csv'
file_custom = '../../data/simple_classification_with_custom_target.csv'

file_data = InputData.from_csv(
@@ -140,7 +140,7 @@ def test_target_data_from_csv_correct():

def test_table_data_shuffle():
test_file_path = str(os.path.dirname(__file__))
file = '../../data/simple_classification.csv'
file = '../../data/classification/simple_classification.csv'

data = InputData.from_csv(os.path.join(test_file_path, file))
shuffled_data = deepcopy(data)
2 changes: 1 addition & 1 deletion test/unit/data/test_multimodal_data.py
@@ -106,7 +106,7 @@ def test_text_data_only(data_type):
data_source_name = 'data_source_text/description'
elif data_type is DataTypesEnum.table:
# Case when there is no text data in csv, but MultiModalData.from_csv() is used
file_path = 'test/data/simple_classification.csv'
file_path = 'test/data/classification/simple_classification.csv'
data_source_name = 'data_source_table'

path = Path(fedot_project_root(), file_path)
2 changes: 1 addition & 1 deletion test/unit/optimizer/gp_operators/test_mutation.py
@@ -33,7 +33,7 @@ def get_requirements_and_params_for_task(task: TaskTypesEnum):


def file_data():
test_file_path = Path(__file__).parents[3].joinpath('data', 'simple_classification.csv')
test_file_path = Path(__file__).parents[3].joinpath('data', 'classification', 'simple_classification.csv')
input_data = InputData.from_csv(test_file_path)
input_data.idx = to_categorical_codes(categorical_ids=input_data.idx)
return input_data
2 changes: 1 addition & 1 deletion test/unit/pipelines/test_pipeline.py
@@ -47,7 +47,7 @@ def classification_dataset():
@pytest.fixture()
def file_data_setup():
test_file_path = str(os.path.dirname(__file__))
file = '../../data/simple_classification.csv'
file = '../../data/classification/simple_classification.csv'
input_data = InputData.from_csv(
os.path.join(test_file_path, file))
input_data.idx = to_categorical_codes(categorical_ids=input_data.idx)
102 changes: 96 additions & 6 deletions test/unit/tasks/test_classification.py
@@ -1,6 +1,9 @@
import os

import numpy as np
import pandas as pd
import pytest
from typing import Callable
from sklearn.datasets import load_iris, make_classification
from sklearn.metrics import roc_auc_score as roc_auc

@@ -60,9 +63,61 @@ def get_iris_data() -> InputData:
    return input_data


def get_binary_classification_data():
def get_binary_classification_data_from_numpy():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/simple_classification.csv'
    file = '../../data/classification/simple_classification.npy'
    numpy_data = np.load(os.path.join(test_file_path, file))
    features_array = numpy_data[:, :-1]
    target_array = numpy_data[:, -1]
    input_data = InputData.from_numpy(features_array=features_array,
                                      target_array=target_array)
    return input_data


def get_binary_classification_data_from_df():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/classification/simple_classification.csv'
    df_data = pd.read_csv(os.path.join(test_file_path, file))
    features_df = df_data.iloc[:, :-1]
    target_df = df_data.iloc[:, -1]
    input_data = InputData.from_dataframe(features_df=features_df,
                                          target_df=target_df)
    return input_data


def get_binary_classification_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/classification/simple_classification.csv'
    input_data = InputData.from_csv(
        os.path.join(test_file_path, file))
    return input_data


def get_multiclassification_data_from_numpy():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/classification/multiclass_classification.npy'
    numpy_data = np.load(os.path.join(test_file_path, file))
    features_array = numpy_data[:, :-1]
    target_array = numpy_data[:, -1]
    input_data = InputData.from_numpy(features_array=features_array,
                                      target_array=target_array)
    return input_data


def get_multiclassification_data_from_df():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/classification/multiclass_classification.csv'
    df_data = pd.read_csv(os.path.join(test_file_path, file))
    features_df = df_data.iloc[:, :-1]
    target_df = df_data.iloc[:, -1]
    input_data = InputData.from_dataframe(features_df=features_df,
                                          target_df=target_df)
    return input_data


def get_multiclassification_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/classification/multiclass_classification.csv'
    input_data = InputData.from_csv(
        os.path.join(test_file_path, file))
    return input_data
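
Aside: the helpers above cover three InputData constructors (from_numpy, from_dataframe, from_csv) against the bundled classification files. A minimal standalone sketch of the same pattern on synthetic data (assumptions: the usual FEDOT import paths fedot.core.data.data and fedot.core.data.data_split, with the task argument left to its default exactly as in the helpers above):

import numpy as np
import pandas as pd

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup

# Hypothetical stand-in for simple_classification.csv: 100 rows, 5 features, binary target.
features = np.random.rand(100, 5)
target = (features[:, 0] > 0.5).astype(int)

# Same constructors the new helpers exercise.
data_from_numpy = InputData.from_numpy(features_array=features, target_array=target)
data_from_df = InputData.from_dataframe(features_df=pd.DataFrame(features),
                                        target_df=pd.Series(target))

# Either object can then be split exactly as in the parametrized tests below.
train_data, test_data = train_test_data_setup(data_from_numpy, shuffle=True)
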
@@ -96,8 +151,43 @@ def get_image_classification_data(composite_flag: bool = True):
    return roc_auc_on_valid, dataset_to_train, dataset_to_validate


def test_multiclassification_pipeline_fit_correct():
    data = get_iris_data()
BINARY_CLASSIFICATION_DATA_SOURCES = [get_binary_classification_data_from_numpy,
                                      get_binary_classification_data_from_df,
                                      get_binary_classification_data_from_csv,
                                      # 'from_image',
                                      # 'from_text_meta_file',
                                      # 'from_text_files',
                                      # 'from_json_files',
                                      ]

MULTICLASSIFICATION_DATA_SOURCES = [get_multiclassification_data_from_numpy,
                                    get_multiclassification_data_from_df,
                                    get_multiclassification_data_from_csv,
                                    # 'from_image',
                                    # 'from_text_meta_file',
                                    # 'from_text_files',
                                    # 'from_json_files',
                                    ]


@pytest.mark.parametrize('get_classification_data', BINARY_CLASSIFICATION_DATA_SOURCES)
def test_binary_classification_pipeline_fit_correct(get_classification_data: Callable):
    data = get_classification_data()
    pipeline = pipeline_simple()
    train_data, test_data = train_test_data_setup(data, shuffle=True)

    pipeline.fit(input_data=train_data)
    results = pipeline.predict(input_data=test_data)

    roc_auc_on_test = roc_auc(y_true=test_data.target,
                              y_score=results.predict)

    assert roc_auc_on_test > 0.8


@pytest.mark.parametrize('get_classification_data', MULTICLASSIFICATION_DATA_SOURCES)
def test_multiclassification_pipeline_fit_correct(get_classification_data: Callable):
    data = get_classification_data()
    pipeline = pipeline_simple()
    train_data, test_data = train_test_data_setup(data, shuffle=True)

@@ -106,7 +196,7 @@ def test_multiclassification_pipeline_fit_correct():

    roc_auc_on_test = roc_auc(y_true=test_data.target,
                              y_score=results.predict,
                              multi_class='ovo',
                              multi_class='ovr',  # TODO: strange bug when ovo is chosen
                              average='macro')

    assert roc_auc_on_test > 0.95
@@ -154,7 +244,7 @@ def test_output_mode_labels():


def test_output_mode_full_probs():
    data = get_binary_classification_data()
    data = get_binary_classification_data_from_csv()
    pipeline = pipeline_simple()
    train_data, test_data = train_test_data_setup(data, shuffle=True)
