Skip to content

Commit

Permalink
[Backport 1.8.latest] Update the spark version to the current version (
Browse files Browse the repository at this point in the history
…#1055) (#1130)

* Update the spark version to the current version (#1055)

* update the spark version to the current version
* update pin for pydantic to resolve explosion/spaCy#12659
* exclude koalas dataframes from test

(cherry picked from commit 824ca0f)

* Backport prereleases.

* Fix numpy issue.

---------

Co-authored-by: VersusFacit <[email protected]>
  • Loading branch information
mikealfare and VersusFacit authored Oct 26, 2024
1 parent 96fe72f commit 28321d5
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 7 deletions.
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ def _get_plugin_version_dict():
include_package_data=True,
install_requires=[
"sqlparams>=3.0.0",
"dbt-common>=0.1.0a1,<2.0",
"dbt-adapters>=0.1.0a1,<2.0",
"dbt-common>=0.1.0,<2.0",
"dbt-adapters>=0.1.0,<2.0",
# add dbt-core to ensure backwards compatibility of installation, this is not a functional dependency
"dbt-core>=1.8.0a1",
"dbt-core>=1.8.0",
],
extras_require={
"ODBC": odbc_extras,
Expand Down
36 changes: 32 additions & 4 deletions tests/functional/adapter/test_python_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,22 @@ class TestPythonModelSpark(BasePythonModelTests):

@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
class TestPySpark(BasePySparkTests):
pass
def test_different_dataframes(self, project):
"""
Test that python models are supported using dataframes from:
- pandas
- pyspark
- pyspark.pandas (formerly databricks.koalas)
Note:
The CI environment is on Apache Spark >3.1, which includes koalas as pyspark.pandas.
The only Databricks runtime that supports Apache Spark <=3.1 is 9.1 LTS, which is EOL 2024-09-23.
For more information, see:
- https://github.com/databricks/koalas
- https://docs.databricks.com/en/release-notes/runtime/index.html
"""
results = run_dbt(["run", "--exclude", "koalas_df"])
assert len(results) == 3


@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
Expand All @@ -33,7 +48,7 @@ def test_incremental(self, project):


models__simple_python_model = """
import pandas
import pyspark.pandas as pandas
import torch
import spacy
Expand All @@ -42,7 +57,7 @@ def model(dbt, spark):
materialized='table',
submission_method='job_cluster',
job_cluster_config={
"spark_version": "7.3.x-scala2.12",
"spark_version": "12.2.x-scala2.12",
"node_type_id": "i3.xlarge",
"num_workers": 0,
"spark_conf": {
Expand All @@ -53,7 +68,7 @@ def model(dbt, spark):
"ResourceClass": "SingleNode"
}
},
packages=['spacy', 'torch', 'pydantic<1.10.3']
packages=['spacy', 'torch', 'pydantic>=1.10.3', 'numpy<2']
)
data = [[1,2]] * 10
return spark.createDataFrame(data, schema=['test', 'test2'])
Expand All @@ -72,6 +87,19 @@ def model(dbt, spark):

@pytest.mark.skip_profile("apache_spark", "spark_session", "databricks_sql_endpoint")
class TestChangingSchemaSpark:
"""
Confirm that we can set up a spot instance and parse required packages into the Databricks job.
Notes:
- This test generates a spot instance on demand using the settings from `job_cluster_config`
in `models__simple_python_model` above. It takes several minutes to run due to creating the cluster.
The job can be monitored via "Data Engineering > Job Runs" or "Workflows > Job Runs"
in the Databricks UI (instead of via the normal cluster).
- The `spark_version` argument will need to periodically be updated. It will eventually become
unsupported and start experiencing issues.
- See https://github.com/explosion/spaCy/issues/12659 for why we're pinning pydantic
"""

@pytest.fixture(scope="class")
def models(self):
return {"simple_python_model.py": models__simple_python_model}
Expand Down

0 comments on commit 28321d5

Please sign in to comment.