diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py index 921dcc9b5..d6b8417a7 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py +++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -555,9 +555,9 @@ def __update_apps_with_prediction_info(self, # Rename the source column to the destination column result_df.rename(columns={src_col: dst_col}, errors='ignore', inplace=True) # if the qualx does not have a speedup value, default to 1.0 - result_df['Estimated GPU Speedup'].fillna(1.0, inplace=True) + result_df.fillna({'Estimated GPU Speedup': 1.0}, inplace=True) # if the qualx does not have a duration value, default to App Duration - result_df['Estimated GPU Duration'].fillna(result_df['App Duration'], inplace=True) + result_df.fillna({'Estimated GPU Duration': result_df['App Duration']}, inplace=True) # We need to be careful about other columns that depend on remapped columns result_df['Estimated GPU Time Saved'] = result_df['App Duration'] - result_df['Estimated GPU Duration'] return result_df diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py index a47b45d73..ff8835cc1 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -810,7 +810,7 @@ def scan_tbl( if not app_info.empty: app_info['appName'] = app_name - app_info['sparkVersion'].fillna('Unknown', inplace=True) + app_info.fillna({'sparkVersion': 'Unknown'}, inplace=True) # Get jar versions: cudf_version = '-' diff --git a/user_tools/src/spark_rapids_tools/tools/top_candidates.py b/user_tools/src/spark_rapids_tools/tools/top_candidates.py index fb207406b..63512bcb3 100644 --- a/user_tools/src/spark_rapids_tools/tools/top_candidates.py +++ b/user_tools/src/spark_rapids_tools/tools/top_candidates.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -101,13 +101,14 @@ def _generate_output_table_internal(self, output_df: pd.DataFrame) -> pd.DataFra """ # Create and append 'Speedup Category Order' column to output_df for sorting order speedup_category_order = self.props.get('ineligibleCategory') + self.props.get('eligibleCategories') - output_df['Speedup Category Order'] = \ - output_df['Estimated GPU Speedup Category'].map({name: i for i, name in enumerate(speedup_category_order)}) + df = output_df.copy() + df['Speedup Category Order'] = \ + df['Estimated GPU Speedup Category'].map({name: i for i, name in enumerate(speedup_category_order)}) # Sort columns and select output columns output_columns = self.props.get('outputColumns') sorting_columns = self.props.get('sortingColumns') - valid_output_columns = list(output_df.columns.intersection(output_columns)) - res_df = output_df.sort_values(by=sorting_columns, ascending=False)[valid_output_columns] + valid_output_columns = list(df.columns.intersection(output_columns)) + res_df = df.sort_values(by=sorting_columns, ascending=False)[valid_output_columns] # this is a bit weird since hardcoding, but we don't want this to have ** for csv output if 'Estimated GPU Speedup Category' in res_df: res_df.rename(columns={'Estimated GPU Speedup Category': 'Estimated GPU Speedup Category**'},