
Commit

[BUG] Update user tools to use latest Databricks CLI version 0.200+ (#614)

* Updated implementation and docs for Databricks platforms to reflect the command changes in the new version of the Databricks CLI

Signed-off-by: cindyyuanjiang <[email protected]>

* use ID consistently in comments

Signed-off-by: cindyyuanjiang <[email protected]>

---------

Signed-off-by: cindyyuanjiang <[email protected]>
cindyyuanjiang authored Oct 13, 2023
1 parent 73ca0f0 commit eb61449
Showing 8 changed files with 115 additions and 96 deletions.
69 changes: 34 additions & 35 deletions user_tools/docs/user-tools-databricks-aws.md

Large diffs are not rendered by default.

63 changes: 31 additions & 32 deletions user_tools/docs/user-tools-databricks-azure.md

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
@@ -115,16 +115,24 @@ def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list
     def pull_cluster_props_by_args(self, args: dict) -> str:
         get_cluster_cmd = ['databricks', 'clusters', 'get']
         if 'Id' in args:
-            get_cluster_cmd.extend(['--cluster-id', args.get('Id')])
+            get_cluster_cmd.extend([args.get('Id')])
         elif 'cluster' in args:
-            get_cluster_cmd.extend(['--cluster-name', args.get('cluster')])
+            # TODO: currently, arguments '--cpu_cluster' or '--gpu_cluster' are processed and stored as
+            # 'cluster' (as cluster names), while they are actually cluster ids for databricks platforms
+            get_cluster_cmd.extend([args.get('cluster')])
         else:
-            self.logger.error('Invalid arguments to pull the cluster properties')
-        cluster_described = self.run_sys_cmd(get_cluster_cmd)
-        if cluster_described is not None:
-            raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
-            return json.dumps(raw_prop_container.props)
-        return cluster_described
+            self.logger.error('Unable to pull cluster id or cluster name information')
+
+        try:
+            cluster_described = self.run_sys_cmd(get_cluster_cmd)
+            if cluster_described is not None:
+                raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
+                return json.dumps(raw_prop_container.props)
+        except Exception as ex:
+            self.logger.error('Invalid arguments to pull the cluster properties: %s', ex)
+            raise ex
+
+        return None

     def _build_cmd_ssh_prefix_for_node(self, node: ClusterNode) -> str:
         port = self.env_vars.get('sshPort')
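For context, the heart of this change is the new argument style in Databricks CLI 0.200+: `databricks clusters get` takes the cluster ID as a positional argument instead of the old `--cluster-id`/`--cluster-name` flags. A minimal standalone sketch (not the repo's code) of the two command shapes, assuming the `databricks` CLI is on PATH and a profile is already configured:

# Standalone sketch of the old vs. new command shape; '<cluster-id>' is a placeholder.
import json
import subprocess

cluster_id = '<cluster-id>'

# Legacy CLI (< 0.200): the cluster was selected via flags (kept only for comparison).
legacy_cmd = ['databricks', 'clusters', 'get', '--cluster-id', cluster_id]

# CLI 0.200+: the cluster ID is passed positionally.
new_cmd = ['databricks', 'clusters', 'get', cluster_id]

result = subprocess.run(new_cmd, capture_output=True, text=True, check=True)
cluster_props = json.loads(result.stdout)
print(cluster_props.get('cluster_name'))

The docstring updates further down the page reflect the same shift from cluster names to cluster IDs for the Databricks platforms.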
20 changes: 16 additions & 4 deletions user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
@@ -122,12 +122,24 @@ def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list
     def pull_cluster_props_by_args(self, args: dict) -> str:
         get_cluster_cmd = ['databricks', 'clusters', 'get']
         if 'Id' in args:
-            get_cluster_cmd.extend(['--cluster-id', args.get('Id')])
+            get_cluster_cmd.extend([args.get('Id')])
         elif 'cluster' in args:
-            get_cluster_cmd.extend(['--cluster-name', args.get('cluster')])
+            # TODO: currently, arguments '--cpu_cluster' or '--gpu_cluster' are processed and stored as
+            # 'cluster' (as cluster names), while they are actually cluster ids for databricks platforms
+            get_cluster_cmd.extend([args.get('cluster')])
         else:
-            self.logger.error('Invalid arguments to pull the cluster properties')
-        return self.run_sys_cmd(get_cluster_cmd)
+            self.logger.error('Unable to pull cluster id or cluster name information')
+
+        try:
+            cluster_described = self.run_sys_cmd(get_cluster_cmd)
+            if cluster_described is not None:
+                raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
+                return json.dumps(raw_prop_container.props)
+        except Exception as ex:
+            self.logger.error('Invalid arguments to pull the cluster properties: %s', ex)
+            raise ex
+
+        return None

     def _build_cmd_ssh_prefix_for_node(self, node: ClusterNode) -> str:
         port = self.env_vars.get('sshPort')
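Both Databricks platform implementations now follow the same flow: run the command, normalize the output into JSON, and log and re-raise on failure. A rough standalone sketch of that flow using only the standard library, with run_cmd standing in for the repo's run_sys_cmd helper and plain json standing in for JSONPropertiesContainer:

import json
import logging
import subprocess

logger = logging.getLogger(__name__)

def run_cmd(cmd: list) -> str:
    # stand-in for the repo's run_sys_cmd helper
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

def pull_cluster_props(cluster_id: str) -> str:
    get_cluster_cmd = ['databricks', 'clusters', 'get', cluster_id]
    try:
        cluster_described = run_cmd(get_cluster_cmd)
        if cluster_described:
            # re-serialize so callers always receive normalized JSON
            return json.dumps(json.loads(cluster_described))
    except Exception as ex:
        logger.error('Invalid arguments to pull the cluster properties: %s', ex)
        raise
    return None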
2 changes: 1 addition & 1 deletion user_tools/src/spark_rapids_pytools/cloud_api/emr.py
@@ -141,7 +141,7 @@ def _list_inconsistent_configurations(self) -> list:
     def pull_cluster_props_by_args(self, args: dict) -> str:
         aws_cluster_id = args.get('Id')
         cluster_name = args.get('cluster')
-        if args.get('Id') is None:
+        if aws_cluster_id is None:
             # use cluster name to get the cluster values
             # we need to get the cluster_id from the list command first.
             list_cmd_res = self.exec_platform_list_cluster_by_name(cluster_name)
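The EMR change above is only a small cleanup (reuse the already-extracted aws_cluster_id), but the surrounding logic is the interesting part: when only a cluster name is given, the ID is resolved through a list call first. A hypothetical sketch of that lookup with boto3 (the real tool drives it through its own platform command helpers):

import boto3

def emr_cluster_id_by_name(cluster_name: str, region: str) -> str:
    # Resolve an EMR cluster name to its ID, mirroring the
    # "get the cluster_id from the list command first" step above.
    emr = boto3.client('emr', region_name=region)
    paginator = emr.get_paginator('list_clusters')
    for page in paginator.paginate(ClusterStates=['RUNNING', 'WAITING']):
        for cluster in page['Clusters']:
            if cluster['Name'] == cluster_name:
                return cluster['Id']
    raise ValueError(f'no active EMR cluster named {cluster_name!r}')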
@@ -52,12 +52,12 @@ def qualification(cpu_cluster: str = None,
 or query to GPU. The wrapper downloads dependencies and executes the analysis on the local
 dev machine.
 :param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI.
 :param eventlogs: Event log filenames or S3 storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
@@ -69,7 +69,7 @@ def qualification(cpu_cluster: str = None,
 :param remote_folder: An S3 folder where the output is uploaded at the end of execution.
 If no value is provided, the output will be only available on local disk.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications is planned to be migrated.
-The argument can be a Databricks-cluster or a valid path to the cluster's properties file
+The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
 (json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
 instances of the original cluster into databricks instances that support GPU acceleration.
 :param tools_jar: Path to a bundled jar including Rapids tool. The path is a local filesystem,
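The eventlogs default described above comes from `spark.eventLog.dir` inside the properties returned by `databricks clusters get CLUSTER_ID`. A hedged sketch of that lookup, assuming the Spark properties are exposed under the cluster JSON's spark_conf field:

import json
import subprocess

def default_eventlog_dir(cluster_id: str) -> str:
    # Pull the cluster properties and read spark.eventLog.dir from them.
    out = subprocess.run(['databricks', 'clusters', 'get', cluster_id],
                         capture_output=True, text=True, check=True).stdout
    props = json.loads(out)
    eventlog_dir = props.get('spark_conf', {}).get('spark.eventLog.dir')
    if eventlog_dir is None:
        raise ValueError('spark.eventLog.dir is not set on the cluster')
    return eventlog_dir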
@@ -158,15 +158,15 @@ def profiling(gpu_cluster: str = None,
 The Profiling tool analyzes both CPU or GPU generated event logs and generates information
 which can be used for debugging and profiling Apache Spark applications.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
 :param worker_info: A path pointing to a yaml file containing the system information of a
 worker node. It is assumed that all workers are homogenous.
 If missing, the wrapper pulls the worker info from the "gpu_cluster".
 :param eventlogs: Event log filenames or S3 storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
@@ -236,7 +236,7 @@ def diagnostic(cluster: str,
 Diagnostic tool to collect information from Databricks cluster, such as OS version, # of worker nodes,
 Yarn configuration, Spark version and error logs etc. Please note, some sensitive information might
 be collected by this tool, e.g. access secret configured in configuration files or dumped to log files.
-:param cluster: Name of the Databricks cluster running an accelerated computing instance.
+:param cluster: ID of the Databricks cluster running an accelerated computing instance.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
 :param output_folder: Local path where the archived result will be saved.
@@ -51,12 +51,12 @@ def qualification(cpu_cluster: str = None,
 or query to GPU. The wrapper downloads dependencies and executes the analysis on the local
 dev machine.
 :param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI.
 :param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param local_folder: Local work-directory path to store the output and to be used as root
@@ -67,7 +67,7 @@ def qualification(cpu_cluster: str = None,
 :param remote_folder: An ABFS (Azure Blob File System) folder where the output is uploaded at the end
 of execution. If no value is provided, the output will be only available on local disk.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications are planned to be migrated.
-The argument can be a Databricks-cluster or a valid path to the cluster's properties file
+The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
 (json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
 instances of the original cluster into databricks instances that support GPU acceleration.
 :param tools_jar: Path to a bundled jar including Rapids tool. The path is a local filesystem,
@@ -153,15 +153,15 @@ def profiling(gpu_cluster: str = None,
 The Profiling tool analyzes both CPU or GPU generated event logs and generates information
 which can be used for debugging and profiling Apache Spark applications.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
 :param worker_info: A path pointing to a yaml file containing the system information of a
 worker node. It is assumed that all workers are homogenous.
 If missing, the wrapper pulls the worker info from the "gpu_cluster".
 :param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param local_folder: Local work-directory path to store the output and to be used as root
@@ -228,7 +228,7 @@ def diagnostic(cluster: str,
 Diagnostic tool to collect information from Databricks cluster, such as OS version, # of worker nodes,
 Yarn configuration, Spark version and error logs etc. Please note, some sensitive information might
 be collected by this tool, e.g. access secret configured in configuration files or dumped to log files.
-:param cluster: Name of the Databricks cluster running an accelerated computing instance.
+:param cluster: ID of the Databricks cluster running an accelerated computing instance.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param output_folder: Local path where the archived result will be saved.
 Note that this argument only accepts local filesystem. If the argument is NONE,
9 changes: 5 additions & 4 deletions user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -63,7 +63,7 @@ def qualification(self,
 Skipping this argument requires that the cluster argument points to a valid
 cluster name on the CSP.
-:param cluster: Name of cluster or path to cluster-properties.
+:param cluster: Name or ID (for databricks platforms) of cluster or path to cluster-properties.
 :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
 and "databricks-azure".
 :param target_platform: Cost savings and speedup recommendation for comparable cluster in
@@ -139,8 +139,8 @@ def profiling(self,
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in the `cluster`.
 :param cluster: The cluster on which the Spark applications were executed. The argument
-can be a cluster name or a valid path to the cluster's properties file (json format)
-generated by the CSP SDK.
+can be a cluster name or ID (for databricks platforms) or a valid path to the cluster's
+properties file (json format) generated by the CSP SDK.
 :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
 and "databricks-azure".
 :param output_folder: path to store the output.
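Since the `cluster` argument can be either a name/ID or a path to a properties file, the dispatch a caller can picture looks roughly like the hypothetical sketch below; pull_props_from_csp stands in for the platform-specific pull shown earlier:

import json
import os

def resolve_cluster_props(cluster_arg: str, pull_props_from_csp) -> dict:
    # A local JSON file path is loaded directly; anything else is treated as a
    # cluster name (or a cluster ID on the Databricks platforms) to look up.
    if os.path.isfile(cluster_arg):
        with open(cluster_arg, encoding='utf-8') as fp:
            return json.load(fp)
    return json.loads(pull_props_from_csp(cluster_arg))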
@@ -173,7 +173,8 @@ def bootstrap(self,
 The tool will apply settings for the cluster assuming that jobs will run serially so that
 each job can use up all the cluster resources (CPU and GPU) when it is running.
-:param cluster: Name of the cluster running an accelerated computing instance class
+:param cluster: Name or ID (for databricks platforms) of the cluster running an accelerated
+computing instance class
 :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
 and "databricks-azure".
 :param output_folder: path where the final recommendations will be saved.
