
Commit

[BUG] Update user tools to use latest Databricks CLI version 0.200+ (#614)

* Updated implementation and docs for Databricks platforms to reflect the command changes in the new version of the Databricks CLI

Signed-off-by: cindyyuanjiang <[email protected]>

* use ID consistently in comments

Signed-off-by: cindyyuanjiang <[email protected]>

---------

Signed-off-by: cindyyuanjiang <[email protected]>
cindyyuanjiang authored Oct 13, 2023
1 parent 73ca0f0 commit eb61449
Showing 8 changed files with 115 additions and 96 deletions.
69 changes: 34 additions & 35 deletions user_tools/docs/user-tools-databricks-aws.md

Large diffs are not rendered by default.

63 changes: 31 additions & 32 deletions user_tools/docs/user-tools-databricks-azure.md

Large diffs are not rendered by default.

24 changes: 16 additions & 8 deletions user_tools/src/spark_rapids_pytools/cloud_api/databricks_aws.py
@@ -115,16 +115,24 @@ def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list
     def pull_cluster_props_by_args(self, args: dict) -> str:
         get_cluster_cmd = ['databricks', 'clusters', 'get']
         if 'Id' in args:
-            get_cluster_cmd.extend(['--cluster-id', args.get('Id')])
+            get_cluster_cmd.extend([args.get('Id')])
         elif 'cluster' in args:
-            get_cluster_cmd.extend(['--cluster-name', args.get('cluster')])
+            # TODO: currently, arguments '--cpu_cluster' or '--gpu_cluster' are processed and stored as
+            # 'cluster' (as cluster names), while they are actually cluster ids for databricks platforms
+            get_cluster_cmd.extend([args.get('cluster')])
         else:
-            self.logger.error('Invalid arguments to pull the cluster properties')
-        cluster_described = self.run_sys_cmd(get_cluster_cmd)
-        if cluster_described is not None:
-            raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
-            return json.dumps(raw_prop_container.props)
-        return cluster_described
+            self.logger.error('Unable to pull cluster id or cluster name information')
+
+        try:
+            cluster_described = self.run_sys_cmd(get_cluster_cmd)
+            if cluster_described is not None:
+                raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
+                return json.dumps(raw_prop_container.props)
+        except Exception as ex:
+            self.logger.error('Invalid arguments to pull the cluster properties: %s', ex)
+            raise ex
+
+        return None

     def _build_cmd_ssh_prefix_for_node(self, node: ClusterNode) -> str:
         port = self.env_vars.get('sshPort')
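For context, the heart of this change is the new argument style in Databricks CLI 0.200+: `databricks clusters get` takes the cluster ID as a positional argument instead of the old `--cluster-id`/`--cluster-name` flags. A minimal standalone sketch (not the repo's code) of the two command shapes, assuming the `databricks` CLI is on PATH and a profile is already configured:

# Standalone sketch of the old vs. new command shape; '<cluster-id>' is a placeholder.
import json
import subprocess

cluster_id = '<cluster-id>'

# Legacy CLI (< 0.200): the cluster was selected via flags (kept only for comparison).
legacy_cmd = ['databricks', 'clusters', 'get', '--cluster-id', cluster_id]

# CLI 0.200+: the cluster ID is passed positionally.
new_cmd = ['databricks', 'clusters', 'get', cluster_id]

result = subprocess.run(new_cmd, capture_output=True, text=True, check=True)
cluster_props = json.loads(result.stdout)
print(cluster_props.get('cluster_name'))

The docstring updates further down the page reflect the same shift from cluster names to cluster IDs for the Databricks platforms.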
20 changes: 16 additions & 4 deletions user_tools/src/spark_rapids_pytools/cloud_api/databricks_azure.py
@@ -122,12 +122,24 @@ def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list
     def pull_cluster_props_by_args(self, args: dict) -> str:
         get_cluster_cmd = ['databricks', 'clusters', 'get']
         if 'Id' in args:
-            get_cluster_cmd.extend(['--cluster-id', args.get('Id')])
+            get_cluster_cmd.extend([args.get('Id')])
         elif 'cluster' in args:
-            get_cluster_cmd.extend(['--cluster-name', args.get('cluster')])
+            # TODO: currently, arguments '--cpu_cluster' or '--gpu_cluster' are processed and stored as
+            # 'cluster' (as cluster names), while they are actually cluster ids for databricks platforms
+            get_cluster_cmd.extend([args.get('cluster')])
         else:
-            self.logger.error('Invalid arguments to pull the cluster properties')
-        return self.run_sys_cmd(get_cluster_cmd)
+            self.logger.error('Unable to pull cluster id or cluster name information')
+
+        try:
+            cluster_described = self.run_sys_cmd(get_cluster_cmd)
+            if cluster_described is not None:
+                raw_prop_container = JSONPropertiesContainer(prop_arg=cluster_described, file_load=False)
+                return json.dumps(raw_prop_container.props)
+        except Exception as ex:
+            self.logger.error('Invalid arguments to pull the cluster properties: %s', ex)
+            raise ex
+
+        return None

     def _build_cmd_ssh_prefix_for_node(self, node: ClusterNode) -> str:
         port = self.env_vars.get('sshPort')
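Both Databricks platform implementations now follow the same flow: run the command, normalize the output into JSON, and log and re-raise on failure. A rough standalone sketch of that flow using only the standard library, with run_cmd standing in for the repo's run_sys_cmd helper and plain json standing in for JSONPropertiesContainer:

import json
import logging
import subprocess

logger = logging.getLogger(__name__)

def run_cmd(cmd: list) -> str:
    # stand-in for the repo's run_sys_cmd helper
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

def pull_cluster_props(cluster_id: str) -> str:
    get_cluster_cmd = ['databricks', 'clusters', 'get', cluster_id]
    try:
        cluster_described = run_cmd(get_cluster_cmd)
        if cluster_described:
            # re-serialize so callers always receive normalized JSON
            return json.dumps(json.loads(cluster_described))
    except Exception as ex:
        logger.error('Invalid arguments to pull the cluster properties: %s', ex)
        raise
    return None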
2 changes: 1 addition & 1 deletion user_tools/src/spark_rapids_pytools/cloud_api/emr.py
@@ -141,7 +141,7 @@ def _list_inconsistent_configurations(self) -> list:
     def pull_cluster_props_by_args(self, args: dict) -> str:
         aws_cluster_id = args.get('Id')
         cluster_name = args.get('cluster')
-        if args.get('Id') is None:
+        if aws_cluster_id is None:
             # use cluster name to get the cluster values
             # we need to get the cluster_id from the list command first.
             list_cmd_res = self.exec_platform_list_cluster_by_name(cluster_name)
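The EMR change above is only a small cleanup (reuse the already-extracted aws_cluster_id), but the surrounding logic is the interesting part: when only a cluster name is given, the ID is resolved through a list call first. A hypothetical sketch of that lookup with boto3 (the real tool drives it through its own platform command helpers):

import boto3

def emr_cluster_id_by_name(cluster_name: str, region: str) -> str:
    # Resolve an EMR cluster name to its ID, mirroring the
    # "get the cluster_id from the list command first" step above.
    emr = boto3.client('emr', region_name=region)
    paginator = emr.get_paginator('list_clusters')
    for page in paginator.paginate(ClusterStates=['RUNNING', 'WAITING']):
        for cluster in page['Clusters']:
            if cluster['Name'] == cluster_name:
                return cluster['Id']
    raise ValueError(f'no active EMR cluster named {cluster_name!r}')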
@@ -52,12 +52,12 @@ def qualification(cpu_cluster: str = None,
 or query to GPU. The wrapper downloads dependencies and executes the analysis on the local
 dev machine.
 :param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI.
 :param eventlogs: Event log filenames or S3 storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
@@ -69,7 +69,7 @@ def qualification(cpu_cluster: str = None,
 :param remote_folder: An S3 folder where the output is uploaded at the end of execution.
 If no value is provided, the output will be only available on local disk.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications is planned to be migrated.
-The argument can be a Databricks-cluster or a valid path to the cluster's properties file
+The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
 (json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
 instances of the original cluster into databricks instances that support GPU acceleration.
 :param tools_jar: Path to a bundled jar including Rapids tool. The path is a local filesystem,
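The eventlogs default described above comes from `spark.eventLog.dir` inside the properties returned by `databricks clusters get CLUSTER_ID`. A hedged sketch of that lookup, assuming the Spark properties are exposed under the cluster JSON's spark_conf field:

import json
import subprocess

def default_eventlog_dir(cluster_id: str) -> str:
    # Pull the cluster properties and read spark.eventLog.dir from them.
    out = subprocess.run(['databricks', 'clusters', 'get', cluster_id],
                         capture_output=True, text=True, check=True).stdout
    props = json.loads(out)
    eventlog_dir = props.get('spark_conf', {}).get('spark.eventLog.dir')
    if eventlog_dir is None:
        raise ValueError('spark.eventLog.dir is not set on the cluster')
    return eventlog_dir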
@@ -158,15 +158,15 @@ def profiling(gpu_cluster: str = None,
 The Profiling tool analyzes both CPU or GPU generated event logs and generates information
 which can be used for debugging and profiling Apache Spark applications.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
 :param worker_info: A path pointing to a yaml file containing the system information of a
 worker node. It is assumed that all workers are homogenous.
 If missing, the wrapper pulls the worker info from the "gpu_cluster".
 :param eventlogs: Event log filenames or S3 storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
@@ -236,7 +236,7 @@ def diagnostic(cluster: str,
 Diagnostic tool to collect information from Databricks cluster, such as OS version, # of worker nodes,
 Yarn configuration, Spark version and error logs etc. Please note, some sensitive information might
 be collected by this tool, e.g. access secret configured in configuration files or dumped to log files.
-:param cluster: Name of the Databricks cluster running an accelerated computing instance.
+:param cluster: ID of the Databricks cluster running an accelerated computing instance.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param aws_profile: A named AWS profile to get the settings/credentials of the AWS account.
 :param output_folder: Local path where the archived result will be saved.
@@ -51,12 +51,12 @@ def qualification(cpu_cluster: str = None,
 or query to GPU. The wrapper downloads dependencies and executes the analysis on the local
 dev machine.
 :param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI.
 :param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param local_folder: Local work-directory path to store the output and to be used as root
@@ -67,7 +67,7 @@ def qualification(cpu_cluster: str = None,
 :param remote_folder: An ABFS (Azure Blob File System) folder where the output is uploaded at the end
 of execution. If no value is provided, the output will be only available on local disk.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications are planned to be migrated.
-The argument can be a Databricks-cluster or a valid path to the cluster's properties file
+The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
 (json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
 instances of the original cluster into databricks instances that support GPU acceleration.
 :param tools_jar: Path to a bundled jar including Rapids tool. The path is a local filesystem,
@@ -153,15 +153,15 @@ def profiling(gpu_cluster: str = None,
 The Profiling tool analyzes both CPU or GPU generated event logs and generates information
 which can be used for debugging and profiling Apache Spark applications.
 :param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
-can be a Databricks-cluster or a valid path to the cluster's properties file (json format)
+can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
 generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
 :param worker_info: A path pointing to a yaml file containing the system information of a
 worker node. It is assumed that all workers are homogenous.
 If missing, the wrapper pulls the worker info from the "gpu_cluster".
 :param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
-in the output of `databricks clusters get [--cluster-id CLUSTER_ID| --cluster-name CLUSTER_NAME]`.
+in the output of `databricks clusters get CLUSTER_ID [flags]`.
 Note that the wrapper will raise an exception if the property is not set.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param local_folder: Local work-directory path to store the output and to be used as root
@@ -228,7 +228,7 @@ def diagnostic(cluster: str,
 Diagnostic tool to collect information from Databricks cluster, such as OS version, # of worker nodes,
 Yarn configuration, Spark version and error logs etc. Please note, some sensitive information might
 be collected by this tool, e.g. access secret configured in configuration files or dumped to log files.
-:param cluster: Name of the Databricks cluster running an accelerated computing instance.
+:param cluster: ID of the Databricks cluster running an accelerated computing instance.
 :param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
 :param output_folder: Local path where the archived result will be saved.
 Note that this argument only accepts local filesystem. If the argument is NONE,
9 changes: 5 additions & 4 deletions user_tools/src/spark_rapids_tools/cmdli/tools_cli.py
@@ -63,7 +63,7 @@ def qualification(self,
 Skipping this argument requires that the cluster argument points to a valid
 cluster name on the CSP.
-:param cluster: Name of cluster or path to cluster-properties.
+:param cluster: Name or ID (for databricks platforms) of cluster or path to cluster-properties.
 :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
 and "databricks-azure".
 :param target_platform: Cost savings and speedup recommendation for comparable cluster in
@@ -139,8 +139,8 @@ def profiling(self,
 containing event logs (comma separated). If missing, the wrapper reads the Spark's
 property `spark.eventLog.dir` defined in the `cluster`.
 :param cluster: The cluster on which the Spark applications were executed. The argument
-can be a cluster name or a valid path to the cluster's properties file (json format)
-generated by the CSP SDK.
+can be a cluster name or ID (for databricks platforms) or a valid path to the cluster's
+properties file (json format) generated by the CSP SDK.
 :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
 and "databricks-azure".
 :param output_folder: path to store the output.
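Since the `cluster` argument can be either a name/ID or a path to a properties file, the dispatch a caller can picture looks roughly like the hypothetical sketch below; pull_props_from_csp stands in for the platform-specific pull shown earlier:

import json
import os

def resolve_cluster_props(cluster_arg: str, pull_props_from_csp) -> dict:
    # A local JSON file path is loaded directly; anything else is treated as a
    # cluster name (or a cluster ID on the Databricks platforms) to look up.
    if os.path.isfile(cluster_arg):
        with open(cluster_arg, encoding='utf-8') as fp:
            return json.load(fp)
    return json.loads(pull_props_from_csp(cluster_arg))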
@@ -173,7 +173,8 @@ def bootstrap(self,
 The tool will apply settings for the cluster assuming that jobs will run serially so that
 each job can use up all the cluster resources (CPU and GPU) when it is running.
-:param cluster: Name of the cluster running an accelerated computing instance class
+:param cluster: Name or ID (for databricks platforms) of the cluster running an accelerated
+computing instance class
 :param platform: defines one of the following "onprem", "emr", "dataproc", "databricks-aws",
 and "databricks-azure".
 :param output_folder: path where the final recommendations will be saved.
