
Commit 2b6d656

added-license-notebook
1 parent 99201e7 commit 2b6d656

17 files changed: +105 -49 lines changed

README.md
Lines changed: 1 addition & 1 deletion

@@ -186,7 +186,7 @@ We encourage you to read through [examples/global_daily.py](https://github.com/d
 
 ### Foundation Models
 
-Foundation time series models are transformer based models pretrained on millions or billions of time series. These models can produce analysis (i.e. forecasting, anomaly detection, classfication) on an unforeseen time series without training or tuning. We support open source models from multiple sources: [chronos](https://github.com/amazon-science/chronos-forecasting), [moirai](https://blog.salesforceairesearch.com/moirai/), and [moment](https://github.com/moment-timeseries-foundation-model/moment). Covariates (i.e. exogenous regressors) and fine-tuning are currently not yet supported. This is a rapidly changing field, and we are working on updating the supported models and new features as the field evolves.
+Foundation time series models are transformer based models pretrained on millions or billions of time points. These models can produce analysis (i.e. forecasting, anomaly detection, classification) on an unforeseen time series without training or tuning. We support open source models from multiple sources: [chronos](https://github.com/amazon-science/chronos-forecasting), [moirai](https://blog.salesforceairesearch.com/moirai/), and [moment](https://github.com/moment-timeseries-foundation-model/moment). Covariates (i.e. exogenous regressors) and fine-tuning are currently not yet supported. This is a rapidly changing field, and we are working on updating the supported models and new features as the field evolves.
 
 To get started, attach the [examples/foundation_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/foundation_daily.py) notebook to a cluster running [DBR 14.3 LTS for ML](https://docs.databricks.com/en/release-notes/runtime/index.html) or later versions. We recommend using a single-node cluster with multiple GPU instances such as [g4dn.12xlarge [T4]](https://aws.amazon.com/ec2/instance-types/g4/) on AWS or [Standard_NC64as_T4_v3](https://learn.microsoft.com/en-us/azure/virtual-machines/nct4-v3-series) on Azure. Multi-node setup is currently not supported.
 
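A note on what "without training or tuning" means in practice: the sketch below shows zero-shot forecasting with chronos, one of the supported models. It is a hedged illustration based on the chronos-forecasting package's public API, not code from this repo; the checkpoint name and input values are placeholders.

    # Minimal zero-shot forecast with Chronos (assumed API from
    # https://github.com/amazon-science/chronos-forecasting; illustrative only).
    import torch
    from chronos import ChronosPipeline

    pipeline = ChronosPipeline.from_pretrained(
        "amazon/chronos-t5-small",   # placeholder checkpoint name
        device_map="cuda",           # assumes a GPU instance such as g4dn.12xlarge
        torch_dtype=torch.bfloat16,
    )

    # Historical values of one series; no training or fine-tuning required.
    context = torch.tensor([112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0])
    forecast = pipeline.predict(context, prediction_length=12)  # [series, samples, horizon]
    median = forecast.quantile(0.5, dim=1)  # point forecast from the sampled paths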

examples/foundation_daily.py
Lines changed: 4 additions & 3 deletions

@@ -80,8 +80,9 @@ def transform_group(df):
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "m4" # Name of the schema we use to manage our assets (e.g. datasets)
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -147,7 +148,7 @@ def transform_group(df):
 dbutils.notebook.run(
     "run_daily",
     timeout_seconds=0,
-    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id})
+    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id, "user": user})
 
 # COMMAND ----------
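For context, the new "user" entry in arguments only takes effect if the called notebook reads it back. A minimal sketch of the receiving side, assuming run_daily follows the standard dbutils.widgets pattern (an assumption; the commit does not show that notebook):

    # Hypothetical first cell of run_daily: bind and read the passed arguments.
    dbutils.widgets.text("user", "")        # declare the widget so the argument binds
    catalog = dbutils.widgets.get("catalog")
    db = dbutils.widgets.get("db")
    user = dbutils.widgets.get("user")      # email address passed by the caller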

examples/foundation_monthly.py
Lines changed: 6 additions & 3 deletions

@@ -84,8 +84,11 @@ def transform_group(df):
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "m4" # Name of the schema we use to manage our assets (e.g. datasets)
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
+
+# COMMAND ----------
 
 # Making sure that the catalog and the schema exist
 _ = spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog}")
@@ -145,7 +148,7 @@ def transform_group(df):
 dbutils.notebook.run(
     "run_monthly",
     timeout_seconds=0,
-    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id})
+    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id, "user": user})
 
 # COMMAND ----------

examples/global_daily.py
Lines changed: 4 additions & 3 deletions

@@ -82,8 +82,9 @@ def transform_group(df):
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "m4" # Name of the schema we use to manage our assets (e.g. datasets)
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -152,7 +153,7 @@ def transform_group(df):
 dbutils.notebook.run(
     "run_daily",
     timeout_seconds=0,
-    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id})
+    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id, "user": user})
 
 # COMMAND ----------

examples/global_external_regressors_daily.py
Lines changed: 6 additions & 5 deletions

@@ -46,9 +46,10 @@
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
-volume = "rossmann" # Name of the volume where you have your rossmann dataset csv sotred
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "rossmann" # Name of the schema we use to manage our assets (e.g. datasets)
+volume = "csv" # Name of the volume where you have your rossmann dataset csv stored
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -65,7 +66,7 @@
 
 # Number of time series to sample
 sample = True
-size = 100
+size = 1000
 stores = sorted(random.sample(range(0, 1000), size))
 
 train = spark.read.csv(f"/Volumes/{catalog}/{db}/{volume}/train.csv", header=True, inferSchema=True)
@@ -136,7 +137,7 @@
 dbutils.notebook.run(
     "run_external_regressors_daily",
     timeout_seconds=0,
-    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id})
+    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id, "user": user})
 
 # COMMAND ----------
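The renamed catalog/schema/volume above imply a one-time Unity Catalog setup before train.csv can be read from /Volumes/mmf/rossmann/csv/. A hedged sketch of that setup using standard Unity Catalog SQL (the names come from the diff; the setup itself is not part of this commit):

    # One-time setup for the volume layout this notebook now expects.
    spark.sql("CREATE CATALOG IF NOT EXISTS mmf")
    spark.sql("CREATE SCHEMA IF NOT EXISTS mmf.rossmann")
    spark.sql("CREATE VOLUME IF NOT EXISTS mmf.rossmann.csv")
    # Upload train.csv to the volume so that
    # /Volumes/mmf/rossmann/csv/train.csv resolves for spark.read.csv.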

examples/global_monthly.py
Lines changed: 4 additions & 3 deletions

@@ -84,8 +84,9 @@ def transform_group(df):
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "m4" # Name of the schema we use to manage our assets (e.g. datasets)
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -148,7 +149,7 @@ def transform_group(df):
 dbutils.notebook.run(
     "run_monthly",
     timeout_seconds=0,
-    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id})
+    arguments={"catalog": catalog, "db": db, "model": model, "run_id": run_id, "user": user})
 
 # COMMAND ----------

examples/licenses.py
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC © 2024 Databricks, Inc. All rights reserved.
+# MAGIC
+# MAGIC The sources in all notebooks in this directory and the sub-directories are provided subject to the Databricks License. All included or referenced third party libraries are subject to the licenses set forth below.
+# MAGIC
+# MAGIC | library | description | license | source |
+# MAGIC |----------------------------------------|-------------------------|------------|-----------------------------------------------------|
+# MAGIC | rpy2 | Python interface to the R language (embedded R) | GNU General Public License v2 or later | https://pypi.org/project/rpy2/
+# MAGIC | kaleido | Static image export for web-based visualization libraries with zero dependencies | MIT | https://pypi.org/project/kaleido/
+# MAGIC | fugue | An abstraction layer for distributed computation | Apache 2.0 | https://pypi.org/project/fugue/
+# MAGIC | Jinja2 | A very fast and expressive template engine | BSD | https://pypi.org/project/Jinja2/
+# MAGIC | omegaconf | A flexible configuration library | BSD | https://pypi.org/project/omegaconf/
+# MAGIC | missingno | Missing data visualization module for Python | MIT | https://pypi.org/project/missingno/
+# MAGIC | datasetsforecast | Datasets for Time series forecasting | MIT | https://pypi.org/project/datasetsforecast/
+# MAGIC | statsforecast | Time series forecasting suite using statistical models | Apache 2.0 | https://pypi.org/project/statsforecast/
+# MAGIC | neuralforecast | Time series forecasting suite using deep learning models | Apache 2.0 | https://pypi.org/project/neuralforecast/
+# MAGIC | fable | Forecasting Models for Tidy Time Series | GPL-3 | https://cran.r-project.org/web/packages/fable/index.html
+# MAGIC | fabletools | Core Tools for Packages in the 'fable' Framework | GPL-3 | https://cran.r-project.org/web/packages/fabletools/index.html
+# MAGIC | feasts | Feature Extraction and Statistics for Time Series | GPL-3 | https://cran.r-project.org/web/packages/feasts/index.html
+# MAGIC | lazyeval | Lazy (Non-Standard) Evaluation | GPL-3 | https://cran.r-project.org/web/packages/lazyeval/index.html
+# MAGIC | tsibble | Tidy Temporal Data Frames and Tools | GPL-3 | https://cran.r-project.org/web/packages/tsibble/index.html
+# MAGIC | urca | Unit Root and Cointegration Tests for Time Series Data | GPL-3 | https://cran.r-project.org/web/packages/urca/index.html
+# MAGIC | sktime | A unified framework for machine learning with time series | BSD 3-Clause | https://pypi.org/project/sktime/
+# MAGIC | tbats | BATS and TBATS for time series forecasting | MIT | https://pypi.org/project/tbats/
+# MAGIC | lightgbm | LightGBM Python Package | MIT | https://pypi.org/project/lightgbm/
+# MAGIC | Chronos | Pretrained (Language) Models for Probabilistic Time Series Forecasting | Apache 2.0 | https://github.com/amazon-science/chronos-forecasting
+# MAGIC | Moirai | Unified Training of Universal Time Series Forecasting Transformers | Apache 2.0 | https://github.com/SalesforceAIResearch/uni2ts
+# MAGIC | Moment | A Family of Open Time-series Foundation Models | MIT | https://github.com/moment-timeseries-foundation-model/moment
+# MAGIC | TimesFM | A pretrained time-series foundation model developed by Google Research for time-series forecasting | Apache 2.0 | https://github.com/google-research/timesfm
+
+# COMMAND ----------
+

examples/local_univariate_daily.py
Lines changed: 5 additions & 4 deletions

@@ -86,8 +86,9 @@ def transform_group(df):
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "m4" # Name of the schema we use to manage our assets (e.g. datasets)
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -183,10 +184,10 @@ def transform_group(df):
     stride=10,
     metric="smape",
     train_predict_ratio=1,
-    data_quality_check=False,
+    data_quality_check=True,
     resample=False,
     active_models=active_models,
-    experiment_path=f"/Shared/mmf_experiment",
+    experiment_path=f"/Users/{user}/mmf/m4_daily",
     use_case_name="m4_daily",
 )
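The experiment_path change moves tracking from a shared workspace folder to a per-user one, which avoids permission clashes when several users run the example. A sketch of what the new path amounts to, assuming MMF hands experiment_path to MLflow (an assumption about the framework's internals, not shown in this commit):

    # Per-user MLflow experiment path, writable by the current user.
    import mlflow

    user = spark.sql("select current_user() as user").collect()[0]["user"]
    mlflow.set_experiment(f"/Users/{user}/mmf/m4_daily")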

examples/local_univariate_external_regressors_daily.py
Lines changed: 9 additions & 4 deletions

@@ -38,9 +38,10 @@
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
-volume = "rossmann" # Name of the volume where you have your rossmann dataset csv sotred
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "rossmann" # Name of the schema we use to manage our assets (e.g. datasets)
+volume = "csv" # Name of the volume where you have your rossmann dataset csv stored
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -156,7 +157,7 @@
     active_models=active_models,
     data_quality_check=False,
     resample=False,
-    experiment_path=f"/Shared/mmf_rossmann",
+    experiment_path=f"/Users/{user}/mmf/rossmann_daily",
     use_case_name="rossmann_daily",
 )
 
@@ -192,3 +193,7 @@
 # COMMAND ----------
 
 display(spark.sql(f"delete from {catalog}.{db}.rossmann_daily_scoring_output"))
+
+# COMMAND ----------
+
+

examples/local_univariate_monthly.py
Lines changed: 5 additions & 4 deletions

@@ -91,8 +91,9 @@ def transform_group(df):
 
 # COMMAND ----------
 
-catalog = "solacc_uc" # Name of the catalog we use to manage our assets
-db = "mmf" # Name of the schema we use to manage our assets (e.g. datasets)
+catalog = "mmf" # Name of the catalog we use to manage our assets
+db = "m4" # Name of the schema we use to manage our assets (e.g. datasets)
+user = spark.sql('select current_user() as user').collect()[0]['user'] # User email address
 
 # COMMAND ----------
 
@@ -181,10 +182,10 @@ def transform_group(df):
     stride=1,
     metric="smape",
     train_predict_ratio=1,
-    data_quality_check=False,
+    data_quality_check=True,
     resample=False,
     active_models=active_models,
-    experiment_path=f"/Shared/mmf_experiment_monthly",
+    experiment_path=f"/Users/{user}/mmf/m4_monthly",
     use_case_name="m4_monthly",
 )
