Skip to content

Commit

Permalink
Add unit tests for Dataproc GKE with mock GKE cluster
Browse files Browse the repository at this point in the history
Signed-off-by: Partho Sarthi <[email protected]>
  • Loading branch information
parthosa committed Oct 13, 2023
1 parent 5fd7299 commit f78c94b
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 6 deletions.
2 changes: 2 additions & 0 deletions user_tools/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ dependencies = [
"pygments==2.15.0",
# used to apply validator on objects and models
"pydantic==2.1.1",
# used to help pylint understand pydantic
"pylint-pydantic==0.3.0",
# used for common API to access remote filesystems like local/s3/gcs/hdfs
# this will include numpy
"pyarrow==12.0.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,35 @@
"""

from typing import ClassVar, Type
from pydantic import field_validator

from spark_rapids_tools.cloud.cluster import ClientCluster, register_client_cluster, ClusterPropMgr, register_cluster_prop_mgr
from spark_rapids_tools.cloud.cluster import ClientCluster, register_client_cluster, ClusterPropMgr, \
register_cluster_prop_mgr
from spark_rapids_tools.utils.propmanager import PropValidatorSchemaCamel, PropValidatorSchema


class DataprocClusterSchema(PropValidatorSchemaCamel):
class DataprocClusterSchema(PropValidatorSchemaCamel):
    """
    Schema used to validate the properties of a Dataproc cluster running on GCE.

    All four fields are required. The ``config`` field is additionally checked
    so that property files describing a Dataproc-on-GKE cluster are rejected by
    this schema.
    """
    cluster_name: str
    cluster_uuid: str
    project_id: str
    config: dict

    @field_validator('config')
    @classmethod
    def validate_config(cls, config: dict) -> dict:
        """
        Validates the cluster config to ensure it is for GCE instead of GKE.

        :param config: the value of the ``config`` field being validated.
        :return: the same ``config`` dictionary, unchanged.
        :raises ValueError: if ``config`` has no 'gceClusterConfig' key.
        """
        # Only the presence of the key is checked; its content is not inspected here.
        if 'gceClusterConfig' not in config:
            raise ValueError("'gceClusterConfig' key is missing in config.")
        return config


class DataprocGkeClusterSchema(PropValidatorSchemaCamel):
    """
    Schema used to validate the properties of a Dataproc cluster running on GKE.

    All five fields are required. Unlike the GCE schema, no extra validation is
    applied to ``config`` here; the GKE-specific settings are expected under
    ``virtual_cluster_config``.

    NOTE(review): presumably the base class maps these snake_case fields to
    camelCase keys (e.g. ``virtualClusterConfig``) -- confirm against
    PropValidatorSchemaCamel.
    """
    cluster_name: str
    cluster_uuid: str
    project_id: str
    config: dict
    virtual_cluster_config: dict


@register_cluster_prop_mgr('dataproc')
Expand All @@ -42,7 +54,7 @@ class DataprocClusterPropMgr(ClusterPropMgr):


@register_client_cluster('dataproc')
class DataprocClientCluster(ClientCluster): # pylint: disable=too-few-public-methods
class DataprocClientCluster(ClientCluster):  # pylint: disable=too-few-public-methods
    # Intentionally empty: this subclass defines no members of its own and
    # relies entirely on what ClientCluster provides.
    pass


Expand All @@ -52,5 +64,5 @@ class DataprocGkeClusterPropMgr(ClusterPropMgr):


@register_client_cluster('dataproc_gke')
class DataprocGkeClientCluster(ClientCluster): # pylint: disable=too-few-public-methods
class DataprocGkeClientCluster(ClientCluster):  # pylint: disable=too-few-public-methods
    # Intentionally empty: this subclass defines no members of its own and
    # relies entirely on what ClientCluster provides.
    pass
3 changes: 2 additions & 1 deletion user_tools/tests/spark_rapids_tools_ut/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def get_test_resources_path():
def gen_cpu_cluster_props():
return [
('dataproc', 'cluster/dataproc/cpu-00.yaml'),
('dataproc_gke', 'cluster/dataproc_gke/cpu-00.yaml'),
('emr', 'cluster/emr/cpu-00.json'),
('onprem', 'cluster/onprem/cpu-00.yaml'),
('databricks_aws', 'cluster/databricks/aws-cpu-00.json'),
Expand All @@ -43,7 +44,7 @@ def gen_cpu_cluster_props():
# all cpu_cluster_props except the onPrem
csp_cpu_cluster_props = [(e_1, e_2) for (e_1, e_2) in all_cpu_cluster_props if e_1 != 'onprem']
# all csps except onprem
csps = ['dataproc', 'emr', 'databricks_aws', 'databricks_azure']
csps = ['dataproc', 'dataproc_gke', 'emr', 'databricks_aws', 'databricks_azure']
all_csps = csps + ['onprem']


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
clusterName: dataproc-gke-test-nongpu-cluster
clusterUuid: 11111111-1111-1111-1111-111111111111
config:
softwareConfig: {}
labels:
goog-dataproc-cluster-name: dataproc-gke-test-nongpu-cluster
goog-dataproc-cluster-uuid: 11111111-1111-1111-1111-111111111111
goog-dataproc-location: us-central1
projectId: dataproc-gke-project
status:
state: RUNNING
stateStartTime: '2022-12-06T23:21:07.637345Z'
statusHistory:
- state: CREATING
stateStartTime: '2022-11-08T18:02:00.300481Z'
virtualClusterConfig:
auxiliaryServicesConfig:
sparkHistoryServerConfig:
dataprocCluster: projects/dataproc-gke-project/regions/us-central1/clusters/dataproc-phs-test
kubernetesClusterConfig:
gkeClusterConfig:
gkeClusterTarget: projects/dataproc-gke-project/regions/us-central1/clusters/dataproc-gke-test
nodePoolTarget:
- nodePool: projects/dataproc-gke-project/regions/us-central1/clusters/dataproc-gke-test/nodePools/controller-pool
roles:
- DEFAULT
- nodePool: projects/dataproc-gke-project/regions/us-central1/clusters/dataproc-gke-test/nodePools/driver-pool
roles:
- SPARK_DRIVER
- nodePool: projects/dataproc-gke-project/regions/us-central1/clusters/dataproc-gke-test/nodePools/executor-pool-cpu
roles:
- SPARK_EXECUTOR
kubernetesNamespace: dataproc-gke-test-nongpu-cluster
kubernetesSoftwareConfig:
componentVersion:
SPARK: 3.1-dataproc-14
properties:
dataproc:dataproc.gke.agent.google-service-account: [email protected]
dataproc:dataproc.gke.spark.driver.google-service-account: [email protected]
dataproc:dataproc.gke.spark.executor.google-service-account: [email protected]
dpgke:dpgke.unstable.outputOnly.endpoints.sparkHistoryServer: https://eeeeeeeeeeeeee-dot-us-central1.dataproc.googleusercontent.com/sparkhistory/?eventLogDirFilter=11111111-1111-1111-1111-111111111111
spark:spark.eventLog.dir: gs://dataproc-gke-test-bucket/11111111-1111-1111-1111-111111111111/spark-job-history
spark:spark.eventLog.enabled: 'true'
stagingBucket: dataproc-gke-test-bucket
2 changes: 1 addition & 1 deletion user_tools/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ commands =

[testenv:pylint]
deps = pylint
commands = pylint -d fixme --rcfile=../.pylintrc \
commands = pylint -d fixme --load-plugins pylint_pydantic --rcfile=../.pylintrc \
tests \
src

Expand Down

0 comments on commit f78c94b

Please sign in to comment.