diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/model.py b/user_tools/src/spark_rapids_tools/tools/qualx/model.py index 9f9470a1f..4491545f2 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/model.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/model.py @@ -347,7 +347,7 @@ def extract_model_features( default_df = default_df.loc[~default_df.appName.str.startswith(f'{ds_name}:')] modified_default_df = default_split_fn(default_df) if modified_default_df.index.equals(default_df.index): - cpu_aug_tbl.update(default_df) + cpu_aug_tbl.update(modified_default_df) cpu_aug_tbl.astype(df_schema) else: raise ValueError('Default split_function unexpectedly modified row indices.') diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py index d203bbc7f..320615b7c 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py @@ -30,7 +30,9 @@ get_logger, get_dataset_platforms, load_plugin, - run_profiler_tool, log_fallback, + log_fallback, + run_profiler_tool, + RegexPattern ) PREPROCESSED_FILE = 'preprocessed.parquet' @@ -269,10 +271,12 @@ def infer_app_meta(eventlogs: List[str]) -> Mapping[str, Mapping]: app_meta_inner = {} for e in eventlog_list: parts = Path(e).parts - app_id_inner = parts[-1] + app_id_part = parts[-1] + match = RegexPattern.app_id.search(app_id_part) + app_id = match.group() if match else app_id_part run_type = parts[-2].upper() job_name = parts[-4] - app_meta_inner[app_id_inner] = { + app_meta_inner[app_id] = { 'jobName': job_name, 'runType': run_type, 'scaleFactor': 1, diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/util.py b/user_tools/src/spark_rapids_tools/tools/qualx/util.py index 831721ff3..9bb8ee1a5 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/util.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/util.py @@ -45,9 +45,9 @@ def get_logger(name: str) -> logging.Logger: @dataclass class RegexPattern: - app_id = re.compile(r'^app.*[_-][0-9]+[_-][0-9]+$') - profile = re.compile(r'^prof_[0-9]+_[0-9a-zA-Z]+$') - qual_tool = re.compile(r'^qual_[0-9]+_[0-9a-zA-Z]+$') + app_id = re.compile(r'app.*[_-][0-9]+[_-][0-9]+') + profile = re.compile(r'prof_[0-9]+_[0-9a-zA-Z]+') + qual_tool = re.compile(r'qual_[0-9]+_[0-9a-zA-Z]+') rapids_profile = re.compile(r'rapids_4_spark_profile') rapids_qual = re.compile(r'rapids_4_spark_qualification_output') qual_tool_metrics = re.compile(r'raw_metrics')