diff --git a/docs/source/benchmarks/amlb_res.csv b/docs/source/benchmarks/amlb_res.csv index 569f98431d..d8e0b56ece 100644 --- a/docs/source/benchmarks/amlb_res.csv +++ b/docs/source/benchmarks/amlb_res.csv @@ -1,33 +1,39 @@ -Dataset name,Metric name,AutoGluon,FEDOT,H2O,LAMA -APSFailure,auc,0.99,0.991,0.992,0.992 -Amazon_employee_access,auc,0.857,0.865,0.873,0.879 -Australian,auc,0.94,0.939,0.939,0.945 -Covertype,neg_logloss,-0.071,-0.117,-0.265, -Fashion-MNIST,neg_logloss,-0.329,-0.373,-0.38,-0.248 -Jannis,neg_logloss,-0.728,-0.737,-0.691,-0.664 -KDDCup09_appetency,auc,0.804,0.822,0.829,0.85 -MiniBooNE,auc,0.982,0.981,,0.988 -Shuttle,neg_logloss,-0.001,-0.001,-0.0,-0.001 -Volkert,neg_logloss,-0.917,-1.097,-0.976,-0.806 -adult,auc,0.91,0.925,0.931,0.932 -bank-marketing,auc,0.931,0.935,0.939,0.94 -blood-transfusion,auc,0.69,0.759,0.765,0.75 -car,neg_logloss,-0.117,-0.011,-0.004,-0.002 -christine,auc,0.804,0.812,0.823,0.83 -cnae-9,neg_logloss,-0.332,-0.211,-0.175,-0.156 -connect-4,neg_logloss,-0.502,-0.456,-0.338,-0.337 -credit-g,auc,0.795,0.778,0.789,0.796 -dilbert,neg_logloss,-0.148,-0.159,-0.05,-0.033 -fabert,neg_logloss,-0.788,-0.895,-0.752,-0.766 -guillermo,auc,0.9,0.891,,0.926 -jasmine,auc,0.883,0.888,0.887,0.88 -jungle chess,neg_logloss,-0.431,-0.193,-0.24,-0.149 -kc1,auc,0.822,0.843,,0.831 -kr-vs-kp,auc,0.999,1.0,,1.0 -mfeat-factors,neg_logloss,-0.161,-0.094,,-0.082 -nomao,auc,0.995,0.994,0.996,0.997 -numerai28_6,auc,0.517,0.529,0.531,0.531 -phoneme,auc,0.965,0.965,,0.965 -segment,neg_logloss,-0.094,-0.062,,-0.061 -sylvine,auc,0.985,0.988,,0.988 -vehicle,neg_logloss,-0.515,-0.354,,-0.404 +Dataset,Metric,AutoGluon,FEDOT,H2O,TPOT +adult,auc,0.9100126,0.91529255,0.9307700000000001,0.9272897999999999 +airlines,auc,0.7249085714285715,0.6537803999999999,0.7303896,0.693676 +albert,auc,0.739028,0.7276503,, +amazon_employee_access,auc,0.8571479999999999,0.8591113,0.8728077000000001,0.8662471 +apsfailure,auc,0.9906209,0.9899874210526317,0.9925172,0.990437 
+australian,auc,0.9395274,0.9378541,0.93857085,0.9360440999999999 +bank-marketing,auc,0.9312558,0.93245125,0.9385977000000001,0.9346086 +blood-transfusion,auc,0.6895855,0.72444385,0.75949435,0.7401904 +christine,auc,0.8042872000000001,0.8044556500000001,0.8193608421052632,0.8066902 +credit-g,auc,0.7952859,0.7845833,0.79357155,0.7938096 +guillermo,auc,0.8996748,0.89125215,,0.7833095714285714 +jasmine,auc,0.8831222000000001,0.88548405,0.8873440499999999,0.8903762000000001 +kc1,auc,0.8222621,0.8385662,,0.8448118000000001 +kddcup09_appetency,auc,0.8044676000000001,0.7877767,0.8291237,0.825562 +kr-vs-kp,auc,0.9988583999999999,0.9992477,0.9997232,0.9997627 +miniboone,auc,0.9821717,0.98101815,,0.9834643333333334 +nomao,auc,0.9948282,0.99419515,0.9959996,0.9953825 +numerai28_6,auc,0.5165548,0.5216116000000001,0.5305179, +phoneme,auc,0.9654223,0.9644835,0.9675107000000001,0.970699 +riccardo,auc,0.9997026,0.9979384,, +sylvine,auc,0.9847037999999999,0.9849627999999999,0.9893596,0.9933923 +car,neg_logloss,-0.11658660000000001,-0.088851992,-0.003471899925,-0.64257486468 +cnae-9,neg_logloss,-0.332075,-0.270096135,-0.21849159,-0.15368975 +connect-4,neg_logloss,-0.5015701,-0.47033240000000004,-0.33770059999999996,-0.3734921 +covertype,neg_logloss,-0.07139724444444445,-0.1409624,-0.2642175, +dilbert,neg_logloss,-0.14967388235294118,-0.24454559000000003,-0.07642755500000001,-0.168390625 +dionis,neg_logloss,-2.157603,,, +fabert,neg_logloss,-0.7878137,-0.9015242000000001,-0.77193945,-0.8915912 +fashion-mnist,neg_logloss,-0.3325671,-0.38379342857142856,-0.3832832,-0.535493 +helena,neg_logloss,-2.784965,-6.348634,-2.9801966666666666,-2.98157375 +jannis,neg_logloss,-0.7283778,-0.7619161,-0.691228,-0.703102 +jungle_chess_2pcs_raw_endgame_complete,neg_logloss,-0.43063529999999994,-0.270741845,-0.23951890000000003,-0.21872090000000002 +mfeat-factors,neg_logloss,-0.1611791,-0.17412199,-0.09295753,-0.10726150999999999 +robert,neg_logloss,-1.6843139999999999,-1.745091,, 
+segment,neg_logloss,-0.09418663,-0.096434561,-0.05962082,-0.07710542000000001 +shuttle,neg_logloss,-0.0008124975,-0.0010121353499999998,-0.00035519797666666667, +vehicle,neg_logloss,-0.5154588,-0.42775929999999995,-0.3313683,-0.3915049 +volkert,neg_logloss,-0.9200727000000001,-1.0448454545454544,-0.9779738888888888, diff --git a/docs/source/benchmarks/amlb_res.html b/docs/source/benchmarks/amlb_res.html deleted file mode 100644 index d0c976963d..0000000000 --- a/docs/source/benchmarks/amlb_res.html +++ /dev/null @@ -1,719 +0,0 @@ - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Classification statistics -
- - framework - - AutoGluon - - FEDOT - - H2O - - LAMA -
- Dataset name - - Metric name - - - - -
- APSFailure - - auc - - 0.990 - - 0.991 - - 0.992 - - 0.992 -
- Amazon_employee_access - - auc - - 0.857 - - 0.865 - - 0.873 - - 0.879 -
- Australian - - auc - - 0.940 - - 0.939 - - 0.938 - - 0.945 -
- Covertype - - neg_logloss - - -0.071 - - -0.117 - - -0.265 - - nan -
- Fashion-MNIST - - neg_logloss - - -0.329 - - -0.373 - - -0.380 - - -0.248 -
- Jannis - - neg_logloss - - -0.728 - - -0.737 - - -0.691 - - -0.664 -
- KDDCup09_appetency - - auc - - 0.804 - - 0.822 - - 0.829 - - 0.850 -
- MiniBooNE - - auc - - 0.982 - - 0.981 - - nan - - 0.988 -
- Shuttle - - neg_logloss - - -0.001 - - -0.001 - - -0.000 - - -0.001 -
- Volkert - - neg_logloss - - -0.917 - - -1.097 - - -0.976 - - -0.806 -
- adult - - auc - - 0.910 - - 0.925 - - 0.931 - - 0.932 -
- bank-marketing - - auc - - 0.931 - - 0.935 - - 0.939 - - 0.940 -
- blood-transfusion - - auc - - 0.690 - - 0.759 - - 0.754 - - 0.750 -
- car - - neg_logloss - - -0.117 - - -0.011 - - -0.003 - - -0.002 -
- christine - - auc - - 0.804 - - 0.812 - - 0.815 - - 0.830 -
- cnae-9 - - neg_logloss - - -0.332 - - -0.211 - - -0.262 - - -0.156 -
- connect-4 - - neg_logloss - - -0.502 - - -0.456 - - -0.338 - - -0.337 -
- credit-g - - auc - - 0.795 - - 0.778 - - 0.798 - - 0.796 -
- dilbert - - neg_logloss - - -0.148 - - -0.159 - - -0.103 - - -0.033 -
- fabert - - neg_logloss - - -0.788 - - -0.895 - - -0.792 - - -0.766 -
- guillermo - - auc - - 0.900 - - 0.891 - - nan - - 0.926 -
- jasmine - - auc - - 0.883 - - 0.888 - - 0.888 - - 0.880 -
- jungle chess - - neg_logloss - - -0.431 - - -0.193 - - -0.240 - - -0.149 -
- kc1 - - auc - - 0.822 - - 0.843 - - nan - - 0.831 -
- kr-vs-kp - - auc - - 0.999 - - 1.000 - - 1.000 - - 1.000 -
- mfeat-factors - - neg_logloss - - -0.161 - - -0.094 - - -0.093 - - -0.082 -
- nomao - - auc - - 0.995 - - 0.994 - - 0.996 - - 0.997 -
- numerai28_6 - - auc - - 0.517 - - 0.529 - - 0.531 - - 0.531 -
- phoneme - - auc - - 0.965 - - 0.965 - - 0.968 - - 0.965 -
- segment - - neg_logloss - - -0.094 - - -0.062 - - -0.060 - - -0.061 -
- sylvine - - auc - - 0.985 - - 0.988 - - 0.989 - - 0.988 -
- vehicle - - neg_logloss - - -0.515 - - -0.354 - - -0.331 - - -0.404 -
-
diff --git a/docs/source/benchmarks/forecasting.rst b/docs/source/benchmarks/forecasting.rst index d83a2880ae..35e696647e 100644 --- a/docs/source/benchmarks/forecasting.rst +++ b/docs/source/benchmarks/forecasting.rst @@ -55,3 +55,94 @@ Timeout for Fedot and other frameworks was set by 2 minutes on each series. For Additionally you can examine papers about Fedot performance on different time series forecasting tasks `[1] `__ , `[2] `__, `[3] `__, `[4] `__, `[5] `__, `[6] `__, `[7] `__. + + +More M4 benchmarking +~~~~~~~~~~~~~~~~~~~~ + +This benchmark is based on a unified benchmarking interface provided by the `pytsbe framework `__ (a tool for benchmarking automated time-series forecasting algorithms). +The `pytsbe` tool uses `subsample `__ from `M4 competition `__ (sample contains 998 series with daily, weekly, monthly, quarterly, yearly intervals). +The forecasting horizons for each series type are: 6 for yearly series, 8 for quarterly series, 18 for monthly series, 13 for weekly series, and 14 for daily series. +The estimation metric used is Symmetric Mean Absolute Percentage Error (SMAPE). 
+ + +-------------+----------+--------+--------+-----------+--------+---------+ + | Library | Quantile | Intervals | + + + +--------+--------+--------+-----------+--------+---------+ + | | | Daily | Weekly | Monthly | Quarterly | Yearly | Overall | + +=============+==========+========+========+========+===========+========+=========+ + | LagLlama | 10 | 1.457 | 3.258 | 5.303 | 5.713 | 11.665 | 2.64 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 50 | 4.513 | 11.167 | 18.534 | 20.027 | 33.141 | 13.036 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 90 | 13.123 | 28.268 | 62.091 | 48.793 | 73.565 | 48.056 | + +-------------+----------+--------+--------+--------+-----------+--------+---------+ + | NBEATS | 10 | 0.732 | 1.021 | 1.173 | 1.818 | 3.038 | 1.036 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 50 | 1.948 | 4.384 | 7.628 | 8.193 | 12.648 | 4.643 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 90 | 4.57 | 19.665 | 38.343 | 49.764 | 36.045 | 28.567 | + +-------------+----------+--------+--------+--------+-----------+--------+---------+ + | TimeGPT | 10 | 1.687 | 1.272 | 1.134 | 2.459 | 4.179 | 1.536 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 50 | 5.586 | 7.17 | 6.235 | 7.058 | 8.982 | 6.565 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 90 | 15.716 | 23.337 | 35.786 | 28.056 | 32.902 | 26.387 | + +-------------+----------+--------+--------+--------+-----------+--------+---------+ + | autogluon | 10 | 0.93 | 0.744 | 1.26 | 2.159 | 2.624 | 1.131 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 50 | 2.37 | 5.96 | 7.402 | 6.168 | 7.598 | 4.704 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 90 | 6.189 | 20.888 | 33.51 | 24.909 | 40.516 | 25.026 | + 
+-------------+----------+--------+--------+-----------+--------+---------+ + | Fedot | 10 | 0.97 | 0.733 | 1.342 | 1.771 | 2.892 | 1.064 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 50 | 2.326 | 4.95 | 7.123 | 6.786 | 8.682 | 4.655 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 90 | 5.398 | 19.131 | 43.519 | 36.36 | 41.147 | 30.29 | + +-------------+----------+--------+--------+--------+-----------+--------+---------+ + | repeat_last | 10 | 0.795 | 1.059 | 1.477 | 2.534 | 4.242 | 1.146 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 50 | 2.008 | 5.365 | 7.796 | 7.379 | 9.066 | 5.158 | + + +----------+--------+--------+--------+-----------+--------+---------+ + | | 90 | 4.66 | 22.38 | 37.294 | 27.215 | 33.074 | 25.79 | + +-------------+----------+--------+--------+--------+-----------+--------+---------+ + +For a clearer understanding, please refer to the mean values of the SMAPE metrics. +Here, as per usual, the best value is indicated in bold for each column (for each seasonal period). 
+ + +-------------+---------+---------+---------+-----------+---------+---------+ + | Library | Intervals | + + +---------+---------+---------+-----------+---------+---------+ + | | Daily | Weekly | Monthly | Quarterly | Yearly | Overall | + +=============+=========+=========+=========+===========+=========+=========+ + | LagLlama | 4.513 | 11.167 | 18.534 | 20.027 | 33.141 | 13.036 | + +-------------+---------+---------+---------+-----------+---------+---------+ + | NBEATS |**1.948**|**4.384**| 7.628 | 8.193 | 12.648 |**4.643**| + +-------------+---------+---------+---------+-----------+---------+---------+ + | TimeGPT | 5.586 | 7.17 |**6.235**| 7.058 | 8.982 | 6.565 | + +-------------+---------+---------+---------+-----------+---------+---------+ + | autogluon | 2.37 | 5.96 | 7.402 |**6.168** |**7.598**| 4.704 | + +-------------+---------+---------+---------+-----------+---------+---------+ + | Fedot | 2.326 | 4.95 | 7.123 | 6.786 | 8.682 | 4.655 | + +-------------+---------+---------+---------+-----------+---------+---------+ + | repeat_last | 2.008 | 5.365 | 7.796 | 7.379 | 9.066 | 5.158 | + +-------------+---------+---------+---------+-----------+---------+---------+ + + +The statistical analysis on SMAPE metrics was conducted using the Friedman t-test. +The results confirm that FEDOT's time series forecasting ability is statistically indistinguishable from +forecasting methods of the field leaders (represented by autogluon and NBEATS). 
+ + +------------+--------+----------+--------+---------+-----------+ + | | FEDOT | LAGLLAMA | NBEATS | TimeGPT | autogluon | + +============+========+==========+========+=========+===========+ + | FEDOT | | 0.044 | 0.613 | 0.613 | 0.971 | + +------------+--------+----------+--------+---------+-----------+ + | LAGLLAMA | 0.044 | | 0.121 | 0.121 | 0.048 | + +------------+--------+----------+--------+---------+-----------+ + | NBEATS | 0.613 | 0.121 | | 1.000 | 0.639 | + +------------+--------+----------+--------+---------+-----------+ + | TimeGPT | 0.613 | 0.121 | 1.000 | | 0.639 | + +------------+--------+----------+--------+---------+-----------+ + | autogluon | 0.971 | 0.048 | 0.639 | 0.639 | | + +------------+--------+----------+--------+---------+-----------+ diff --git a/docs/source/benchmarks/tabular.rst b/docs/source/benchmarks/tabular.rst index 8e7801c7ba..0ff6dd00a7 100644 --- a/docs/source/benchmarks/tabular.rst +++ b/docs/source/benchmarks/tabular.rst @@ -2,20 +2,53 @@ Tabular data ------------ Here are overall classification problem results across state-of-the-art AutoML frameworks -using `AutoMlBenchmark `__ test suite: +using `AMLB `__ test suite: -.. raw:: html - :file: amlb_res.html +.. csv-table:: + :header: Dataset, Metric, AutoGluon, FEDOT, H2O, TPOT -The results are obtained using sever based on Xeon Cascadelake (2900MHz) -with 12 cores and 24GB memory for experiments with the local infrastructure. 1h8c configuration was used for AMLB. - -Despite the obtained metrics being a bit different from AMLB's `paper `__ -the results confirm that FEDOT is competitive with SOTA solutions. 
+ adult, auc, 0.91001, 0.91529, **0.93077**, 0.92729 + airlines, auc, 0.72491, 0.65378, **0.73039**, 0.69368 + albert, auc, **0.73903**, 0.72765, nan, nan + amazon_employee_access, auc, 0.85715, 0.85911, **0.87281**, 0.86625 + apsfailure, auc, 0.99062, 0.98999, **0.99252**, 0.99044 + australian, auc, **0.93953**, 0.93785, 0.93857, 0.93604 + bank-marketing, auc, 0.93126, 0.93245, **0.93860**, 0.93461 + blood-transfusion, auc, 0.68959, 0.72444, **0.75949**, 0.74019 + christine, auc, 0.80429, 0.80446, **0.81936**, 0.80669 + credit-g, auc, **0.79529**, 0.78458, 0.79357, 0.79381 + guillermo, auc, **0.89967**, 0.89125, nan, 0.78331 + jasmine, auc, 0.88312, 0.88548, 0.88734, **0.89038** + kc1, auc, 0.82226, 0.83857, nan, **0.84481** + kddcup09_appetency, auc, 0.80447, 0.78778, **0.82912**, 0.82556 + kr-vs-kp, auc, 0.99886, 0.99925, 0.99972, **0.99976** + miniboone, auc, 0.98217, 0.98102, nan, **0.98346** + nomao, auc, 0.99483, 0.99420, **0.99600**, 0.99538 + numerai28_6, auc, 0.51655, 0.52161, **0.53052**, nan + phoneme, auc, 0.96542, 0.96448, 0.96751, **0.97070** + riccardo, auc, **0.99970**, 0.99794, nan, nan + sylvine, auc, 0.98470, 0.98496, 0.98936, **0.99339** + car, neg_logloss, -0.11659, -0.08885, **-0.00347**, -0.64257 + cnae-9, neg_logloss, -0.33208, -0.27010, -0.21849, **-0.15369** + connect-4, neg_logloss, -0.50157, -0.47033, **-0.33770**, -0.37349 + covertype, neg_logloss, **-0.07140**, -0.14096, -0.26422, nan + dilbert, neg_logloss, -0.14967, -0.24455, **-0.07643**, -0.16839 + dionis, neg_logloss, **-2.15760**, nan, nan, nan + fabert, neg_logloss, -0.78781, -0.90152, **-0.77194**, -0.89159 + fashion-mnist, neg_logloss, **-0.33257**, -0.38379, -0.38328, -0.53549 + helena, neg_logloss, **-2.78497**, -6.34863, -2.98020, -2.98157 + jannis, neg_logloss, -0.72838, -0.76192, **-0.69123**, -0.70310 + jungle_chess, neg_logloss, -0.43064, -0.27074, -0.23952, **-0.21872** + mfeat-factors, neg_logloss, -0.16118, -0.17412, **-0.09296**, -0.10726 + robert, neg_logloss, 
**-1.68431**, -1.74509, nan, nan + segment, neg_logloss, -0.09419, -0.09643, **-0.05962**, -0.07711 + shuttle, neg_logloss, -0.00081, -0.00101, **-0.00036**, nan + vehicle, neg_logloss, -0.51546, -0.42776, **-0.33137**, -0.39150 + volkert, neg_logloss, **-0.92007**, -1.04485, -0.97797, nan The statistical analysis was conducted using the Friedman t-test. The results of experiments and analysis confirm that FEDOT results are statistically indistinguishable -from SOTA competitors H2O, AutoGluon and LAMA (see below). +from SOTA competitors H2O, AutoGluon and TPOT (see below). -.. image:: img_benchmarks/stats.png +.. image:: img_benchmarks/stats.png \ No newline at end of file