From 63f126d6e8c2b44c08f7b988ba4053efbe1b92dd Mon Sep 17 00:00:00 2001 From: David Baines Date: Tue, 27 May 2025 12:30:51 +0100 Subject: [PATCH 01/10] Add multi-threading to speed up IO bound operations --- silnlp/common/clean_projects.py | 112 ++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 21 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 6031c244..b1335735 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse +import concurrent.futures import logging import shutil import sys @@ -320,6 +321,19 @@ def execute_cleanup(self): self._log_info("Cleanup execution finished.") +# --- Helper for concurrent project cleaning --- +def process_single_project_for_cleaning( + project_path: Path, current_args: argparse.Namespace +) -> str: + """ + Creates a ProjectCleaner instance, analyzes, and cleans a single project. + Returns the project name upon successful completion. + """ + cleaner = ProjectCleaner(project_path, current_args) + cleaner.analyze_project_contents() + cleaner.execute_cleanup() + return project_path.name # Return name for logging/tracking + # --- Main Function --- def main(): parser = argparse.ArgumentParser( @@ -368,10 +382,9 @@ def main(): file_handler.setLevel(logging.INFO) logger.addHandler(file_handler) - if args.verbose > 0: - print(f"Starting cleanup process for projects in: {args.projects_root}") - if args.dry_run: - print("DRY RUN mode enabled.") + print(f"Starting cleanup process for projects in: {args.projects_root}") + if args.dry_run: + print("DRY RUN mode enabled.") logger.info( f"Starting cleanup process. Projects root: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." ) @@ -381,21 +394,53 @@ def main(): print(f"Error: Projects root folder not found: {args.projects_root}") sys.exit(1) - all_folders = [folder for folder in projects_root_path.iterdir() if folder.is_dir()] - found_total_msg = f"Found {len(all_folders)} folders in {args.projects_root}." + # Initial scan for all items to determine directories + initial_items = list(projects_root_path.glob("*")) + all_folders = [] + if args.verbose > 0: + print(f"Scanning {len(initial_items)} items in {args.projects_root} to find directories...") + + for item in tqdm(initial_items, desc=f"Scanning {args.projects_root}", unit="item", disable=args.verbose > 0): + if item.is_dir(): + all_folders.append(item) + + + found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." logger.info(found_total_msg) if args.verbose > 0: print(found_total_msg) project_folders = [] non_project_folders = [] - for folder in tqdm( - all_folders, desc="Scanning folders", unit="folder", disable=args.verbose > 0 - ): - if has_settings_file(folder): - project_folders.append(folder) - else: - non_project_folders.append(folder) + + # Use a ThreadPoolExecutor for concurrent I/O-bound tasks + max_workers = 10 + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + + # Submit tasks for each folder + future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} + + # Iterate over completed tasks using tqdm, add mininterval for smoother updates + # if individual has_settings_file calls are very fast. 
+ for future in tqdm(concurrent.futures.as_completed(future_to_folder), + total=len(all_folders), + desc="Identifying project folders", + unit="folder", + disable=args.verbose > 0): + folder = future_to_folder[future] + try: + is_project = future.result() + if is_project: + project_folders.append(folder) + else: + non_project_folders.append(folder) + except Exception as exc: + logger.error(f"Error checking folder {folder}: {exc}") + if args.verbose > 0: + print(f"Error checking folder {folder}: {exc}") + non_project_folders.append(folder) + found_msg = f"Found {len(project_folders)} project folders." logger.info(found_msg) @@ -422,14 +467,39 @@ def main(): print(no_projects_msg) return - for project_path in tqdm(project_folders, desc="Cleaning projects", unit="project"): - cleaner = ProjectCleaner(project_path, args) - cleaner.analyze_project_contents() - cleaner.execute_cleanup() - if args.verbose > 0: - print(f"--- Finished processing project: {project_path.name} ---") - elif args.verbose == 0: - logger.info(f"Finished processing project: {project_path.name}") + # Concurrently process each project folder for cleaning + # Re-use max_workers from the previous section, or define a new one if desired. + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Store future to project_path to retrieve the original Path object for robust error messages + future_to_project_path_map = { + executor.submit(process_single_project_for_cleaning, project_path, args): project_path + for project_path in project_folders + } + + for future in tqdm( + concurrent.futures.as_completed(future_to_project_path_map), + total=len(project_folders), + desc="Cleaning projects", + unit="project", + disable=args.verbose > 0, # tqdm is disabled if verbose output is on + mininterval=0.01 # More frequent updates, similar to the folder identification step + ): + processed_project_path = future_to_project_path_map[future] + try: + # future.result() will re-raise exceptions from the worker function + # and return the project name. + project_name = future.result() + + # Log completion for this project + if args.verbose > 0: + print(f"--- Finished processing project: {project_name} ---") + # Log to file even if tqdm is active (args.verbose == 0) + logger.info(f"Finished processing project: {project_name}") + + except Exception as exc: + logger.error(f"Error cleaning project {processed_project_path.name}: {exc}") + if args.verbose > 0: # Also print to console if verbose + print(f"Error cleaning project {processed_project_path.name}: {exc}") final_msg = "\nCleanup process completed." 
logger.info(final_msg) From 90dc99b7bc3d9d8193d84ccff11f9f3c571102d4 Mon Sep 17 00:00:00 2001 From: David Baines Date: Tue, 27 May 2025 16:44:11 +0100 Subject: [PATCH 02/10] Updates to clean_projects.py --- silnlp/common/clean_projects.py | 127 ++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index b1335735..12e68d3e 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -110,21 +110,18 @@ def __init__(self, project_path: Path, args): self.files_to_delete = set() self.folders_to_delete = set() self.parsing_errors = [] + self.log_buffer: list[str] = [] # Buffer to store log messages for this project self.log_prefix = f"[{self.project_path.name}] " def _log_info(self, message: str): full_message = f"{self.log_prefix}{message}" - logger.info(full_message) - if self.args.verbose > 0: - print(full_message) + self.log_buffer.append(full_message) def _log_action(self, action: str, item_path: Path): full_message = ( f"{self.log_prefix}{action}: {item_path.relative_to(self.project_path)}" ) - logger.info(full_message) - if self.args.verbose > 0: - print(full_message) + self.log_buffer.append(full_message) def _parse_settings(self): settings_file_path = self.project_path / SETTINGS_FILENAME @@ -132,7 +129,7 @@ def _parse_settings(self): settings_file_path = self.project_path / SETTINGS_FILENAME.lower() if not settings_file_path.exists(): warning_msg = f"Warning: {SETTINGS_FILENAME} not found." - if self.args.verbose: + if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) self.parsing_errors.append(f"{SETTINGS_FILENAME} not found.") return @@ -141,36 +138,44 @@ def _parse_settings(self): parser = FileParatextProjectSettingsParser(str(self.project_path)) project_settings = parser.parse() self.project_settings = project_settings - - full_suffix = project_settings.file_name_suffix.upper() - self.scripture_file_extension = Path(full_suffix).suffix - if not self.scripture_file_extension: - self.scripture_file_extension = "" + + # Log raw settings related to file naming now that self.project_settings is assigned. 
self._log_info( - f"Determined scripture file extension: {self.scripture_file_extension}" + f"Settings - FileNamePrePart:'{self.project_settings.file_name_prefix}' " + f"PostPart:'{self.project_settings.file_name_suffix}' " + f"BookNameForm:'{self.project_settings.file_name_form}'" ) - if project_settings.biblical_terms_file_name: - terms_file_path = ( - self.project_path / project_settings.biblical_terms_file_name - ) - if terms_file_path.is_file(): - self.biblical_terms_files.add(terms_file_path) - self._log_info( - f"Found BiblicalTermsListSetting file: {terms_file_path.name}" - ) - else: - warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" - if self.args.verbose: - self._log_info(warning_msg) - self.parsing_errors.append( - f"BiblicalTermsListSetting file not found: {terms_file_path.name}" - ) except Exception as e: error_msg = f"Error parsing {SETTINGS_FILENAME}: {e}" - if self.args.verbose: + if self.args.verbose > 0: # Condition to buffer this error message self._log_info(error_msg) self.parsing_errors.append(error_msg) + # Log that specific settings details could not be retrieved + self._log_info( + f"Settings - Could not log file naming details (PrePart, PostPart, BookNameForm) due to parsing error: {e}" + ) + + # The following code correctly uses self.project_settings, + # which will be None if parsing failed, and thus these blocks will be skipped. + + if project_settings.biblical_terms_file_name: + terms_file_path = ( + self.project_path / project_settings.biblical_terms_file_name + ) + if terms_file_path.is_file(): + self.biblical_terms_files.add(terms_file_path) + self._log_info( + f"Found BiblicalTermsListSetting file: {terms_file_path.name}" + ) + else: + warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" + if self.args.verbose > 0: # Condition to buffer this warning + self._log_info(warning_msg) + self.parsing_errors.append( + f"BiblicalTermsListSetting file not found: {terms_file_path.name}" + ) + def analyze_project_contents(self): self._parse_settings() @@ -324,15 +329,15 @@ def execute_cleanup(self): # --- Helper for concurrent project cleaning --- def process_single_project_for_cleaning( project_path: Path, current_args: argparse.Namespace -) -> str: +) -> tuple[str, list[str], list[str]]: """ Creates a ProjectCleaner instance, analyzes, and cleans a single project. - Returns the project name upon successful completion. + Returns the project name, a list of log messages, and a list of parsing errors. """ cleaner = ProjectCleaner(project_path, current_args) cleaner.analyze_project_contents() cleaner.execute_cleanup() - return project_path.name # Return name for logging/tracking + return project_path.name, cleaner.log_buffer, cleaner.parsing_errors # --- Main Function --- def main(): @@ -404,6 +409,14 @@ def main(): if item.is_dir(): all_folders.append(item) + test = True + + if test: + all_folders = all_folders[:200] + # Use a single ThreadPoolExecutor for concurrent I/O-bound tasks + max_workers = 1 + else: + max_workers = 10 found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." 
logger.info(found_total_msg) @@ -413,8 +426,7 @@ def main(): project_folders = [] non_project_folders = [] - # Use a ThreadPoolExecutor for concurrent I/O-bound tasks - max_workers = 10 + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -467,6 +479,8 @@ def main(): print(no_projects_msg) return + processed_project_data: list[tuple[str, list[str], list[str], Path]] = [] + # Concurrently process each project folder for cleaning # Re-use max_workers from the previous section, or define a new one if desired. with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -486,20 +500,37 @@ def main(): ): processed_project_path = future_to_project_path_map[future] try: - # future.result() will re-raise exceptions from the worker function - # and return the project name. - project_name = future.result() - - # Log completion for this project - if args.verbose > 0: - print(f"--- Finished processing project: {project_name} ---") - # Log to file even if tqdm is active (args.verbose == 0) - logger.info(f"Finished processing project: {project_name}") - + project_name, project_logs, project_errors = future.result() + processed_project_data.append((project_name, project_logs, project_errors, processed_project_path)) except Exception as exc: - logger.error(f"Error cleaning project {processed_project_path.name}: {exc}") - if args.verbose > 0: # Also print to console if verbose - print(f"Error cleaning project {processed_project_path.name}: {exc}") + # Log critical errors during processing immediately, as they might prevent log collection + crit_error_msg = f"Critical error during processing of project {processed_project_path.name}: {exc}" + logger.error(crit_error_msg) + if args.verbose > 0: + print(crit_error_msg) + # Store a placeholder for sorted output + processed_project_data.append((processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path)) + + # Sort all collected data by project name + processed_project_data.sort(key=lambda x: x[0]) + + # Log the collected and sorted data + for project_name, project_logs, project_parsing_errors, _project_path in processed_project_data: + # Log messages collected by the cleaner + for log_msg_from_buffer in project_logs: + logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner + if args.verbose > 0: # Print to console if verbose + print(log_msg_from_buffer) + + # Log parsing errors, ensuring they are associated with the project + if project_parsing_errors: + for err_str in project_parsing_errors: + error_log_message = f"[{project_name}] Config Error: {err_str}" + logger.warning(error_log_message) # Use warning for parsing/config errors + if args.verbose > 0: + print(error_log_message) + + logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project final_msg = "\nCleanup process completed." 
logger.info(final_msg) From d3f182d0fd24d83c29b3d3acb991cdc5b11fafd1 Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 28 May 2025 14:07:11 +0100 Subject: [PATCH 03/10] Lower case TermRenderings.xml and turn off test --- silnlp/common/clean_projects.py | 118 ++++++++++++-------------------- 1 file changed, 44 insertions(+), 74 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 12e68d3e..1d0e42c2 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -12,11 +12,12 @@ from tqdm import tqdm # --- Global Constants --- -PROJECTS_FOLDER_DEFAULT = "M:/Paratext/projects" +PROJECTS_FOLDER_DEFAULT = "M:/Paratext/projects" logger = logging.getLogger(__name__) SETTINGS_FILENAME = "Settings.xml" # --- Configuration for Deletion/Keep Rules --- +# These are matched with lower cased versions of the filename, they must be listed in lower case here. FILES_TO_DELETE_BY_NAME_CI = { "allclustercorrections.txt", @@ -76,8 +77,7 @@ "bookNames.xml", "canons.xml", "lexicon.xml", - "TermRenderings.xml", - + "termrenderings.xml", } EXTENSIONS_TO_KEEP_CI = { @@ -94,9 +94,7 @@ def has_settings_file(project_folder: Path) -> bool: - return (project_folder / SETTINGS_FILENAME).is_file() or ( - project_folder / SETTINGS_FILENAME.lower() - ).is_file() + return (project_folder / SETTINGS_FILENAME).is_file() or (project_folder / SETTINGS_FILENAME.lower()).is_file() class ProjectCleaner: @@ -118,9 +116,7 @@ def _log_info(self, message: str): self.log_buffer.append(full_message) def _log_action(self, action: str, item_path: Path): - full_message = ( - f"{self.log_prefix}{action}: {item_path.relative_to(self.project_path)}" - ) + full_message = f"{self.log_prefix}{action}: {item_path.relative_to(self.project_path)}" self.log_buffer.append(full_message) def _parse_settings(self): @@ -129,7 +125,7 @@ def _parse_settings(self): settings_file_path = self.project_path / SETTINGS_FILENAME.lower() if not settings_file_path.exists(): warning_msg = f"Warning: {SETTINGS_FILENAME} not found." - if self.args.verbose > 0: # Condition to buffer this warning + if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) self.parsing_errors.append(f"{SETTINGS_FILENAME} not found.") return @@ -138,7 +134,7 @@ def _parse_settings(self): parser = FileParatextProjectSettingsParser(str(self.project_path)) project_settings = parser.parse() self.project_settings = project_settings - + # Log raw settings related to file naming now that self.project_settings is assigned. self._log_info( f"Settings - FileNamePrePart:'{self.project_settings.file_name_prefix}' " @@ -148,34 +144,27 @@ def _parse_settings(self): except Exception as e: error_msg = f"Error parsing {SETTINGS_FILENAME}: {e}" - if self.args.verbose > 0: # Condition to buffer this error message + if self.args.verbose > 0: # Condition to buffer this error message self._log_info(error_msg) self.parsing_errors.append(error_msg) # Log that specific settings details could not be retrieved self._log_info( - f"Settings - Could not log file naming details (PrePart, PostPart, BookNameForm) due to parsing error: {e}" + f"Settings - Couldn't log naming details (PrePart, PostPart, BookNameForm) due to parsing error: {e}" ) # The following code correctly uses self.project_settings, # which will be None if parsing failed, and thus these blocks will be skipped. 
if project_settings.biblical_terms_file_name: - terms_file_path = ( - self.project_path / project_settings.biblical_terms_file_name - ) + terms_file_path = self.project_path / project_settings.biblical_terms_file_name if terms_file_path.is_file(): self.biblical_terms_files.add(terms_file_path) - self._log_info( - f"Found BiblicalTermsListSetting file: {terms_file_path.name}" - ) + self._log_info(f"Found BiblicalTermsListSetting file: {terms_file_path.name}") else: warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" - if self.args.verbose > 0: # Condition to buffer this warning + if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) - self.parsing_errors.append( - f"BiblicalTermsListSetting file not found: {terms_file_path.name}" - ) - + self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {terms_file_path.name}") def analyze_project_contents(self): self._parse_settings() @@ -201,32 +190,22 @@ def analyze_project_contents(self): # Scripture files are identified using ParatextProjectSettings.get_book_id() if self.project_settings: - for ( - item - ) in ( - self.project_path.iterdir() - ): # Scripture files are typically at the project root + for item in self.project_path.iterdir(): # Scripture files are typically at the project root if item.is_file(): book_id = self.project_settings.get_book_id(item.name) if book_id is not None: self.files_to_keep.add(item) if self.args.verbose > 1: - self._log_info( - f"Kept scripture file (via get_book_id): {item.name}" - ) + self._log_info(f"Kept scripture file (via get_book_id): {item.name}") elif self.args.verbose > 0: - self._log_info( - "Project settings not available; cannot use get_book_id for scripture identification." - ) + self._log_info("Project settings not available; cannot use get_book_id for scripture identification.") for item in all_items_in_project: if item.is_file() and item.suffix.lower() in EXTENSIONS_TO_KEEP_CI: self.files_to_keep.add(item) if self.args.verbose > 1: - self._log_info( - f"Identified {len(self.files_to_keep)} files to keep initially." 
- ) + self._log_info(f"Identified {len(self.files_to_keep)} files to keep initially.") # --- Pass 2: Identify files to DELETE --- for item_path in all_items_in_project: @@ -241,17 +220,12 @@ def analyze_project_contents(self): if item_name_lower in FILES_TO_DELETE_BY_NAME_CI: delete_file = True reason = "specific name" - elif any( - item_path.match(pattern) for pattern in FILES_TO_DELETE_BY_PATTERN - ): + elif any(item_path.match(pattern) for pattern in FILES_TO_DELETE_BY_PATTERN): delete_file = True reason = "pattern match" - elif any( - sub_str in item_name_lower - for sub_str in FILENAME_SUBSTRINGS_TO_DELETE_CI - ): + elif any(sub_str in item_name_lower for sub_str in FILENAME_SUBSTRINGS_TO_DELETE_CI): delete_file = True - reason = "substring match" + reason = "substring match" elif item_suffix_lower in EXTENSIONS_TO_DELETE_CI: delete_file = True reason = f"extension ({item_suffix_lower})" @@ -274,9 +248,7 @@ def analyze_project_contents(self): if delete_file: self.files_to_delete.add(item_path) if self.args.verbose > 1: - self._log_info( - f"Marked for deletion ({reason}): {item_path.relative_to(self.project_path)}" - ) + self._log_info(f"Marked for deletion ({reason}): {item_path.relative_to(self.project_path)}") # --- Pass 3: Identify folders to DELETE --- for item in self.project_path.iterdir(): @@ -339,6 +311,7 @@ def process_single_project_for_cleaning( cleaner.execute_cleanup() return project_path.name, cleaner.log_buffer, cleaner.parsing_errors + # --- Main Function --- def main(): parser = argparse.ArgumentParser( @@ -363,9 +336,7 @@ def main(): default=0, help="Increase output verbosity. -v for project-level info, -vv for file-level decisions.", ) - parser.add_argument( - "--log-file", help="Path to a file to log actions and verbose information." - ) + parser.add_argument("--log-file", help="Path to a file to log actions and verbose information.") args = parser.parse_args() # --- Configure Logging --- @@ -391,7 +362,7 @@ def main(): if args.dry_run: print("DRY RUN mode enabled.") logger.info( - f"Starting cleanup process. Projects root: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." + f"Starting cleanup process for: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." ) projects_root_path = Path(args.projects_root) @@ -409,7 +380,7 @@ def main(): if item.is_dir(): all_folders.append(item) - test = True + test = False if test: all_folders = all_folders[:200] @@ -426,20 +397,20 @@ def main(): project_folders = [] non_project_folders = [] - - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - + # Submit tasks for each folder future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} # Iterate over completed tasks using tqdm, add mininterval for smoother updates # if individual has_settings_file calls are very fast. 
- for future in tqdm(concurrent.futures.as_completed(future_to_folder), - total=len(all_folders), - desc="Identifying project folders", - unit="folder", - disable=args.verbose > 0): + for future in tqdm( + concurrent.futures.as_completed(future_to_folder), + total=len(all_folders), + desc="Identifying project folders", + unit="folder", + disable=args.verbose > 0, + ): folder = future_to_folder[future] try: is_project = future.result() @@ -450,19 +421,16 @@ def main(): except Exception as exc: logger.error(f"Error checking folder {folder}: {exc}") if args.verbose > 0: - print(f"Error checking folder {folder}: {exc}") + print(f"Error checking folder {folder}: {exc}") non_project_folders.append(folder) - found_msg = f"Found {len(project_folders)} project folders." logger.info(found_msg) if args.verbose > 0: print(found_msg) if non_project_folders: - non_project_msg = ( - f"Found {len(non_project_folders)} non-project folders (will be ignored):" - ) + non_project_msg = f"Found {len(non_project_folders)} non-project folders (will be ignored):" logger.info(non_project_msg) if args.verbose > 0: print(non_project_msg) @@ -496,7 +464,7 @@ def main(): desc="Cleaning projects", unit="project", disable=args.verbose > 0, # tqdm is disabled if verbose output is on - mininterval=0.01 # More frequent updates, similar to the folder identification step + mininterval=0.01, # More frequent updates, similar to the folder identification step ): processed_project_path = future_to_project_path_map[future] try: @@ -509,7 +477,9 @@ def main(): if args.verbose > 0: print(crit_error_msg) # Store a placeholder for sorted output - processed_project_data.append((processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path)) + processed_project_data.append( + (processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path) + ) # Sort all collected data by project name processed_project_data.sort(key=lambda x: x[0]) @@ -518,19 +488,19 @@ def main(): for project_name, project_logs, project_parsing_errors, _project_path in processed_project_data: # Log messages collected by the cleaner for log_msg_from_buffer in project_logs: - logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner - if args.verbose > 0: # Print to console if verbose + logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner + if args.verbose > 0: # Print to console if verbose print(log_msg_from_buffer) # Log parsing errors, ensuring they are associated with the project if project_parsing_errors: for err_str in project_parsing_errors: error_log_message = f"[{project_name}] Config Error: {err_str}" - logger.warning(error_log_message) # Use warning for parsing/config errors + logger.warning(error_log_message) # Use warning for parsing/config errors if args.verbose > 0: print(error_log_message) - - logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project + + logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project final_msg = "\nCleanup process completed." logger.info(final_msg) From ef092ffa59c51bdf0de4e186a9a51a7f2c503ea3 Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 28 May 2025 14:09:23 +0100 Subject: [PATCH 04/10] Remove test code. 
--- silnlp/common/clean_projects.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 1d0e42c2..f4defdd9 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -381,13 +381,7 @@ def main(): all_folders.append(item) test = False - - if test: - all_folders = all_folders[:200] - # Use a single ThreadPoolExecutor for concurrent I/O-bound tasks - max_workers = 1 - else: - max_workers = 10 + max_workers = 10 found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." logger.info(found_total_msg) From deba37f431c01b47f380a9fa50e90499e14070b2 Mon Sep 17 00:00:00 2001 From: David Baines Date: Mon, 2 Jun 2025 10:36:26 +0100 Subject: [PATCH 05/10] Remove the unused test parameter --- silnlp/common/clean_projects.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index f4defdd9..4cb0e82c 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -155,8 +155,8 @@ def _parse_settings(self): # The following code correctly uses self.project_settings, # which will be None if parsing failed, and thus these blocks will be skipped. - if project_settings.biblical_terms_file_name: - terms_file_path = self.project_path / project_settings.biblical_terms_file_name + if self.project_settings and self.project_settings.biblical_terms_file_name: + terms_file_path = self.project_path / self.project_settings.biblical_terms_file_name if terms_file_path.is_file(): self.biblical_terms_files.add(terms_file_path) self._log_info(f"Found BiblicalTermsListSetting file: {terms_file_path.name}") @@ -164,7 +164,7 @@ def _parse_settings(self): warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) - self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {terms_file_path.name}") + self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {self.project_settings.biblical_terms_file_name})") def analyze_project_contents(self): self._parse_settings() @@ -340,7 +340,9 @@ def main(): args = parser.parse_args() # --- Configure Logging --- - log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + #log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + log_formatter = logging.Formatter("2025-05-29 14:30:00,000 - %(levelname)s - %(message)s") + logger.setLevel(logging.INFO) console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(log_formatter) @@ -380,7 +382,6 @@ def main(): if item.is_dir(): all_folders.append(item) - test = False max_workers = 10 found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." 
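PATCH 05 above, besides dropping the unused test flag, also settles the settings-handling contract the rest of the series relies on: _parse_settings leaves self.project_settings as None whenever Settings.xml cannot be parsed, so every later use must be guarded. The following is a minimal sketch of that pattern, reusing only the machine.corpora calls already present in the script; the project path is a hypothetical example.

    import logging
    from pathlib import Path
    from typing import Optional

    from machine.corpora import FileParatextProjectSettingsParser, ParatextProjectSettings

    logger = logging.getLogger(__name__)

    def load_settings(project_path: Path) -> Optional[ParatextProjectSettings]:
        # Parse Settings.xml once; treat any failure as "no settings available".
        try:
            return FileParatextProjectSettingsParser(str(project_path)).parse()
        except Exception as exc:
            logger.warning(f"Could not parse settings for {project_path.name}: {exc}")
            return None

    # Guarded access: both the settings object and the attribute may be absent.
    settings = load_settings(Path("M:/Paratext/projects/ABC"))  # hypothetical path
    if settings and settings.biblical_terms_file_name:
        print(f"Biblical terms list: {settings.biblical_terms_file_name}")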
From 0c726d58a54d648e478a8dc29546534cfaec5378 Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 6 Aug 2025 11:34:54 +0100 Subject: [PATCH 06/10] Fix formatting format - remove hard coded date --- silnlp/common/clean_projects.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 4cb0e82c..63ba5e7e 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -340,8 +340,7 @@ def main(): args = parser.parse_args() # --- Configure Logging --- - #log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - log_formatter = logging.Formatter("2025-05-29 14:30:00,000 - %(levelname)s - %(message)s") + log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") logger.setLevel(logging.INFO) console_handler = logging.StreamHandler(sys.stdout) From 64b40ef574f0f7f3de93f253eee9ee3657c7742f Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 6 Aug 2025 12:15:22 +0100 Subject: [PATCH 07/10] Accept optional list of folder or .env, process ctrl+c --- silnlp/common/clean_projects.py | 192 ++++++++++++++++++-------------- 1 file changed, 107 insertions(+), 85 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 63ba5e7e..76136731 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -19,7 +19,7 @@ # --- Configuration for Deletion/Keep Rules --- # These are matched with lower cased versions of the filename, they must be listed in lower case here. -FILES_TO_DELETE_BY_NAME_CI = { +FILES_TO_DELETE_BY_NAME = { "allclustercorrections.txt", "keys.asc", } @@ -29,9 +29,9 @@ "readme", ] -FILENAME_SUBSTRINGS_TO_DELETE_CI = ["error", "hyphenatedwords", "note"] +FILENAME_SUBSTRINGS_TO_DELETE = ["error", "hyphenatedwords", "note"] -EXTENSIONS_TO_DELETE_CI = { +EXTENSIONS_TO_DELETE = { ".bak", ".css", ".csv", @@ -66,7 +66,7 @@ ".zip", } -FILES_TO_KEEP_BY_NAME_CI = { +FILES_TO_KEEP_BY_NAME = { "settings.xml", "autocorrect.txt", "copr.htm", @@ -80,7 +80,7 @@ "termrenderings.xml", } -EXTENSIONS_TO_KEEP_CI = { +EXTENSIONS_TO_KEEP = { ".cct", ".dic", ".ldml", @@ -88,7 +88,7 @@ } # All subfolders should be deleted -SUBFOLDERS_TO_PRESERVE_BY_NAME_CI = {} +SUBFOLDERS_TO_PRESERVE_BY_NAME = {} # --- Helper Functions --- @@ -181,7 +181,7 @@ def analyze_project_contents(self): self.files_to_keep.add(settings_path_lower) for item in self.project_path.iterdir(): - if item.is_file() and item.name.lower() in FILES_TO_KEEP_BY_NAME_CI: + if item.is_file() and item.name.lower() in FILES_TO_KEEP_BY_NAME: self.files_to_keep.add(item) for terms_file in self.biblical_terms_files: @@ -201,7 +201,7 @@ def analyze_project_contents(self): self._log_info("Project settings not available; cannot use get_book_id for scripture identification.") for item in all_items_in_project: - if item.is_file() and item.suffix.lower() in EXTENSIONS_TO_KEEP_CI: + if item.is_file() and item.suffix.lower() in EXTENSIONS_TO_KEEP: self.files_to_keep.add(item) if self.args.verbose > 1: @@ -217,16 +217,16 @@ def analyze_project_contents(self): delete_file = False reason = "" - if item_name_lower in FILES_TO_DELETE_BY_NAME_CI: + if item_name_lower in FILES_TO_DELETE_BY_NAME: delete_file = True reason = "specific name" elif any(item_path.match(pattern) for pattern in FILES_TO_DELETE_BY_PATTERN): delete_file = True reason = "pattern match" - elif any(sub_str in item_name_lower for sub_str in FILENAME_SUBSTRINGS_TO_DELETE_CI): + elif any(sub_str 
in item_name_lower for sub_str in FILENAME_SUBSTRINGS_TO_DELETE): delete_file = True reason = "substring match" - elif item_suffix_lower in EXTENSIONS_TO_DELETE_CI: + elif item_suffix_lower in EXTENSIONS_TO_DELETE: delete_file = True reason = f"extension ({item_suffix_lower})" elif item_name_lower.startswith(".") or item_name_lower.startswith("_"): @@ -253,7 +253,7 @@ def analyze_project_contents(self): # --- Pass 3: Identify folders to DELETE --- for item in self.project_path.iterdir(): if item.is_dir(): - if item.name.lower() not in SUBFOLDERS_TO_PRESERVE_BY_NAME_CI: + if item.name.lower() not in SUBFOLDERS_TO_PRESERVE_BY_NAME: self.folders_to_delete.add(item) elif self.args.verbose > 1: self._log_info(f"Preserving subfolder: {item.name}") @@ -319,10 +319,9 @@ def main(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( - "projects_root", - nargs="?", - default=PROJECTS_FOLDER_DEFAULT, - help="The root directory containing Paratext project folders.", + "folders", + nargs="*", + help="One or more Paratext project root directories to clean. If not specified, uses SIL_NLP_ENV.pt_projects_dir.", ) parser.add_argument( "--dry-run", @@ -339,9 +338,19 @@ def main(): parser.add_argument("--log-file", help="Path to a file to log actions and verbose information.") args = parser.parse_args() + # --- Import environment if needed --- + if not args.folders: + try: + from silnlp.common.environment import SIL_NLP_ENV + projects_root_paths = [SIL_NLP_ENV.pt_projects_dir] + except ImportError as e: + print("Could not import SIL_NLP_ENV from environment.py. Please specify at least one folder.") + sys.exit(1) + else: + projects_root_paths = [Path(folder) for folder in args.folders] + # --- Configure Logging --- log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - logger.setLevel(logging.INFO) console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(log_formatter) @@ -359,31 +368,31 @@ def main(): file_handler.setLevel(logging.INFO) logger.addHandler(file_handler) - print(f"Starting cleanup process for projects in: {args.projects_root}") - if args.dry_run: - print("DRY RUN mode enabled.") - logger.info( - f"Starting cleanup process for: {args.projects_root}. Dry run: {args.dry_run}. Verbose: {args.verbose}." - ) + for projects_root_path in projects_root_paths: + print(f"Starting cleanup process for projects in: {projects_root_path}") + if args.dry_run: + print("DRY RUN mode enabled.") + logger.info( + f"Starting cleanup process for: {projects_root_path}. Dry run: {args.dry_run}. Verbose: {args.verbose}." 
+ ) - projects_root_path = Path(args.projects_root) - if not projects_root_path.is_dir(): - print(f"Error: Projects root folder not found: {args.projects_root}") - sys.exit(1) + if not Path(projects_root_path).is_dir(): + print(f"Error: Projects root folder not found: {projects_root_path}") + sys.exit(1) # Initial scan for all items to determine directories initial_items = list(projects_root_path.glob("*")) all_folders = [] if args.verbose > 0: - print(f"Scanning {len(initial_items)} items in {args.projects_root} to find directories...") + print(f"Scanning {len(initial_items)} items in {projects_root_path} to find directories...") - for item in tqdm(initial_items, desc=f"Scanning {args.projects_root}", unit="item", disable=args.verbose > 0): + for item in tqdm(initial_items, desc=f"Scanning {projects_root_path}", unit="item", disable=args.verbose > 0): if item.is_dir(): all_folders.append(item) max_workers = 10 - found_total_msg = f"Found {len(all_folders)} total directories in {args.projects_root}." + found_total_msg = f"Found {len(all_folders)} total directories in {projects_root_path}." logger.info(found_total_msg) if args.verbose > 0: print(found_total_msg) @@ -391,33 +400,39 @@ def main(): project_folders = [] non_project_folders = [] - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - - # Submit tasks for each folder - future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} - - # Iterate over completed tasks using tqdm, add mininterval for smoother updates - # if individual has_settings_file calls are very fast. - for future in tqdm( - concurrent.futures.as_completed(future_to_folder), - total=len(all_folders), - desc="Identifying project folders", - unit="folder", - disable=args.verbose > 0, - ): - folder = future_to_folder[future] - try: - is_project = future.result() - if is_project: - project_folders.append(folder) - else: + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + + # Submit tasks for each folder + future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} + + # Iterate over completed tasks using tqdm, add mininterval for smoother updates + # if individual has_settings_file calls are very fast. + for future in tqdm( + concurrent.futures.as_completed(future_to_folder), + total=len(all_folders), + desc="Identifying project folders", + unit="folder", + disable=args.verbose > 0, + ): + folder = future_to_folder[future] + try: + is_project = future.result() + if is_project: + project_folders.append(folder) + else: + non_project_folders.append(folder) + except Exception as exc: + logger.error(f"Error checking folder {folder}: {exc}") + if args.verbose > 0: + print(f"Error checking folder {folder}: {exc}") non_project_folders.append(folder) - except Exception as exc: - logger.error(f"Error checking folder {folder}: {exc}") - if args.verbose > 0: - print(f"Error checking folder {folder}: {exc}") - non_project_folders.append(folder) - + except KeyboardInterrupt: + print("\nInterrupted by user. Attempting to shut down workers...") + logger.warning("Interrupted by user. Attempting to shut down workers...") + executor.shutdown(wait=False, cancel_futures=True) + sys.exit(1) + found_msg = f"Found {len(project_folders)} project folders." 
logger.info(found_msg) if args.verbose > 0: @@ -445,35 +460,42 @@ def main(): # Concurrently process each project folder for cleaning # Re-use max_workers from the previous section, or define a new one if desired. - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - # Store future to project_path to retrieve the original Path object for robust error messages - future_to_project_path_map = { - executor.submit(process_single_project_for_cleaning, project_path, args): project_path - for project_path in project_folders - } - - for future in tqdm( - concurrent.futures.as_completed(future_to_project_path_map), - total=len(project_folders), - desc="Cleaning projects", - unit="project", - disable=args.verbose > 0, # tqdm is disabled if verbose output is on - mininterval=0.01, # More frequent updates, similar to the folder identification step - ): - processed_project_path = future_to_project_path_map[future] - try: - project_name, project_logs, project_errors = future.result() - processed_project_data.append((project_name, project_logs, project_errors, processed_project_path)) - except Exception as exc: - # Log critical errors during processing immediately, as they might prevent log collection - crit_error_msg = f"Critical error during processing of project {processed_project_path.name}: {exc}" - logger.error(crit_error_msg) - if args.verbose > 0: - print(crit_error_msg) - # Store a placeholder for sorted output - processed_project_data.append( - (processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path) - ) + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Store future to project_path to retrieve the original Path object for robust error messages + future_to_project_path_map = { + executor.submit(process_single_project_for_cleaning, project_path, args): project_path + for project_path in project_folders + } + + for future in tqdm( + concurrent.futures.as_completed(future_to_project_path_map), + total=len(project_folders), + desc="Cleaning projects", + unit="project", + disable=args.verbose > 0, # tqdm is disabled if verbose output is on + mininterval=0.01, # More frequent updates, similar to the folder identification step + ): + processed_project_path = future_to_project_path_map[future] + try: + project_name, project_logs, project_errors = future.result() + processed_project_data.append((project_name, project_logs, project_errors, processed_project_path)) + except Exception as exc: + # Log critical errors during processing immediately, as they might prevent log collection + crit_error_msg = f"Critical error during processing of project {processed_project_path.name}: {exc}" + logger.error(crit_error_msg) + if args.verbose > 0: + print(crit_error_msg) + # Store a placeholder for sorted output + processed_project_data.append( + (processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path) + ) + except KeyboardInterrupt: + print("\nInterrupted by user. Attempting to shut down workers...") + logger.warning("Interrupted by user. 
Attempting to shut down workers...") + executor.shutdown(wait=False, cancel_futures=True) + sys.exit(1) # Sort all collected data by project name processed_project_data.sort(key=lambda x: x[0]) From cc023ab4aa7680b2c1c1cd772bac2ffe258766db Mon Sep 17 00:00:00 2001 From: David Baines Date: Tue, 19 Aug 2025 06:06:43 +0100 Subject: [PATCH 08/10] Fix issues identified by review --- silnlp/common/clean_projects.py | 311 +++++++++++++++++--------------- 1 file changed, 163 insertions(+), 148 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 76136731..27542321 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -2,6 +2,7 @@ import argparse import concurrent.futures +import fnmatch import logging import shutil import sys @@ -35,7 +36,9 @@ ".bak", ".css", ".csv", + ".cct", ".dbl", + ".dic", ".doc", ".docx", ".font", @@ -47,7 +50,6 @@ ".ini", ".json", ".kb2", - ".lds", ".map", ".md", ".old", @@ -81,12 +83,17 @@ } EXTENSIONS_TO_KEEP = { - ".cct", - ".dic", ".ldml", ".lds", } +extension_overlap = EXTENSIONS_TO_KEEP & EXTENSIONS_TO_DELETE +if extension_overlap: + raise ValueError( + "EXTENSIONS_TO_KEEP and EXTENSIONS_TO_DELETE must not overlap. Please check the code \ + for these extensions: {extension_overlap}" + ) + # All subfolders should be deleted SUBFOLDERS_TO_PRESERVE_BY_NAME = {} @@ -164,7 +171,9 @@ def _parse_settings(self): warning_msg = f"Warning: BiblicalTermsListSetting file not found at expected path: {terms_file_path}" if self.args.verbose > 0: # Condition to buffer this warning self._log_info(warning_msg) - self.parsing_errors.append(f"BiblicalTermsListSetting file not found: {self.project_settings.biblical_terms_file_name})") + self.parsing_errors.append( + f"BiblicalTermsListSetting file not found: {self.project_settings.biblical_terms_file_name}" + ) def analyze_project_contents(self): self._parse_settings() @@ -220,7 +229,9 @@ def analyze_project_contents(self): if item_name_lower in FILES_TO_DELETE_BY_NAME: delete_file = True reason = "specific name" - elif any(item_path.match(pattern) for pattern in FILES_TO_DELETE_BY_PATTERN): + elif any( + fnmatch.fnmatch(item_path.name.lower(), pattern.lower()) for pattern in FILES_TO_DELETE_BY_PATTERN + ): delete_file = True reason = "pattern match" elif any(sub_str in item_name_lower for sub_str in FILENAME_SUBSTRINGS_TO_DELETE): @@ -321,7 +332,8 @@ def main(): parser.add_argument( "folders", nargs="*", - help="One or more Paratext project root directories to clean. If not specified, uses SIL_NLP_ENV.pt_projects_dir.", + help="One or more Paratext project root directories to clean. If not specified, \ + uses SIL_NLP_ENV.pt_projects_dir.", ) parser.add_argument( "--dry-run", @@ -333,7 +345,8 @@ def main(): "--verbose", action="count", default=0, - help="Increase output verbosity. -v for project-level info, -vv for file-level decisions.", + help="Increase output verbosity. -v for project-level info, -vv for file-level decisions\ + and -vvv for debug-level details.", ) parser.add_argument("--log-file", help="Path to a file to log actions and verbose information.") args = parser.parse_args() @@ -342,9 +355,10 @@ def main(): if not args.folders: try: from silnlp.common.environment import SIL_NLP_ENV - projects_root_paths = [SIL_NLP_ENV.pt_projects_dir] + + projects_root_path = [Path(SIL_NLP_ENV.pt_projects_dir)] except ImportError as e: - print("Could not import SIL_NLP_ENV from environment.py. 
Please specify at least one folder.") + print(f"Could not import SIL_NLP_ENV from environment.py. {e}") sys.exit(1) else: projects_root_paths = [Path(folder) for folder in args.folders] @@ -355,11 +369,11 @@ def main(): console_handler = logging.StreamHandler(sys.stdout) console_handler.setFormatter(log_formatter) if args.verbose == 0: - console_handler.setLevel(logging.CRITICAL + 1) + console_handler.setLevel(logging.WARNING) elif args.verbose == 1: console_handler.setLevel(logging.INFO) - else: - console_handler.setLevel(logging.INFO) + else: # args.verbose > 2 + console_handler.setLevel(logging.DEBUG) logger.addHandler(console_handler) if args.log_file: @@ -369,159 +383,160 @@ def main(): logger.addHandler(file_handler) for projects_root_path in projects_root_paths: - print(f"Starting cleanup process for projects in: {projects_root_path}") + logger.info(f"Starting cleanup process for projects in: {projects_root_path}") if args.dry_run: print("DRY RUN mode enabled.") logger.info( f"Starting cleanup process for: {projects_root_path}. Dry run: {args.dry_run}. Verbose: {args.verbose}." ) - if not Path(projects_root_path).is_dir(): - print(f"Error: Projects root folder not found: {projects_root_path}") + if not projects_root_path.is_dir(): + logger.error(f"Projects root folder not found: {projects_root_path}") sys.exit(1) - # Initial scan for all items to determine directories - initial_items = list(projects_root_path.glob("*")) - all_folders = [] - if args.verbose > 0: - print(f"Scanning {len(initial_items)} items in {projects_root_path} to find directories...") + # Initial scan for all items to determine list of directories within projects_root_path + initial_items = list(projects_root_path.iterdir()) - for item in tqdm(initial_items, desc=f"Scanning {projects_root_path}", unit="item", disable=args.verbose > 0): - if item.is_dir(): - all_folders.append(item) - - max_workers = 10 + all_folders = [] + if args.verbose > 0: + print(f"Scanning {len(initial_items)} items in {projects_root_path} to find directories...") - found_total_msg = f"Found {len(all_folders)} total directories in {projects_root_path}." - logger.info(found_total_msg) - if args.verbose > 0: - print(found_total_msg) + for item in tqdm(initial_items, desc=f"Scanning {projects_root_path}", unit="item", disable=args.verbose > 1): + if item.is_dir(): + all_folders.append(item) - project_folders = [] - non_project_folders = [] + max_workers = 10 - try: - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + found_total_msg = f"Found {len(all_folders)} total directories in {projects_root_path}." + logger.info(found_total_msg) - # Submit tasks for each folder - future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} + project_folders = [] + non_project_folders = [] - # Iterate over completed tasks using tqdm, add mininterval for smoother updates - # if individual has_settings_file calls are very fast. 
- for future in tqdm( - concurrent.futures.as_completed(future_to_folder), - total=len(all_folders), - desc="Identifying project folders", - unit="folder", - disable=args.verbose > 0, - ): - folder = future_to_folder[future] - try: - is_project = future.result() - if is_project: - project_folders.append(folder) - else: + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + + # Submit tasks for each folder + future_to_folder = {executor.submit(has_settings_file, folder): folder for folder in all_folders} + + # Iterate over completed tasks using tqdm, add mininterval for smoother updates + # if individual has_settings_file calls are very fast. + for future in tqdm( + concurrent.futures.as_completed(future_to_folder), + total=len(all_folders), + desc="Identifying project folders", + unit="folder", + disable=args.verbose > 0, + ): + folder = future_to_folder[future] + try: + is_project = future.result() + if is_project: + project_folders.append(folder) + else: + non_project_folders.append(folder) + except Exception as exc: + logger.error(f"Error checking folder {folder}: {exc}") + if args.verbose > 0: + print(f"Error checking folder {folder}: {exc}") non_project_folders.append(folder) - except Exception as exc: - logger.error(f"Error checking folder {folder}: {exc}") + except KeyboardInterrupt: + print("\nInterrupted by user. Attempting to shut down workers...") + logger.warning("Interrupted by user. Attempting to shut down workers...") + executor.shutdown(wait=False, cancel_futures=True) + sys.exit(1) + + found_msg = f"Found {len(project_folders)} project folders." + logger.info(found_msg) + + if non_project_folders: + non_project_msg = f"Found {len(non_project_folders)} non-project folders (will be ignored):" + logger.info(non_project_msg) + if args.verbose > 0: + print(non_project_msg) + if args.verbose > 1: + for folder in non_project_folders: + logger.info(f" - Ignored non-project folder: {folder.name}") if args.verbose > 0: - print(f"Error checking folder {folder}: {exc}") - non_project_folders.append(folder) - except KeyboardInterrupt: - print("\nInterrupted by user. Attempting to shut down workers...") - logger.warning("Interrupted by user. Attempting to shut down workers...") - executor.shutdown(wait=False, cancel_futures=True) - sys.exit(1) - - found_msg = f"Found {len(project_folders)} project folders." - logger.info(found_msg) - if args.verbose > 0: - print(found_msg) - - if non_project_folders: - non_project_msg = f"Found {len(non_project_folders)} non-project folders (will be ignored):" - logger.info(non_project_msg) - if args.verbose > 0: - print(non_project_msg) - if args.verbose > 1: - for folder in non_project_folders: - logger.info(f" - Ignored non-project folder: {folder.name}") - if args.verbose > 0: - print(f" - {folder.name}") - - if not project_folders: - no_projects_msg = "No project folders found to clean." - logger.info(no_projects_msg) - if args.verbose > 0: - print(no_projects_msg) - return - - processed_project_data: list[tuple[str, list[str], list[str], Path]] = [] - - # Concurrently process each project folder for cleaning - # Re-use max_workers from the previous section, or define a new one if desired. 
- - try: - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - # Store future to project_path to retrieve the original Path object for robust error messages - future_to_project_path_map = { - executor.submit(process_single_project_for_cleaning, project_path, args): project_path - for project_path in project_folders - } - - for future in tqdm( - concurrent.futures.as_completed(future_to_project_path_map), - total=len(project_folders), - desc="Cleaning projects", - unit="project", - disable=args.verbose > 0, # tqdm is disabled if verbose output is on - mininterval=0.01, # More frequent updates, similar to the folder identification step - ): - processed_project_path = future_to_project_path_map[future] - try: - project_name, project_logs, project_errors = future.result() - processed_project_data.append((project_name, project_logs, project_errors, processed_project_path)) - except Exception as exc: - # Log critical errors during processing immediately, as they might prevent log collection - crit_error_msg = f"Critical error during processing of project {processed_project_path.name}: {exc}" - logger.error(crit_error_msg) + print(f" - {folder.name}") + + if not project_folders: + no_projects_msg = "No project folders found to clean." + logger.info(no_projects_msg) + if args.verbose > 0: + print(no_projects_msg) + return + + processed_project_data: [list[tuple[str, list[str], list[str], Path]]] = [] + + # Concurrently process each project folder for cleaning + # Re-use max_workers from the previous section, or define a new one if desired. + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Store future to project_path to retrieve the original Path object for robust error messages + future_to_project_path_map = { + executor.submit(process_single_project_for_cleaning, project_path, args): project_path + for project_path in project_folders + } + + for future in tqdm( + concurrent.futures.as_completed(future_to_project_path_map), + total=len(project_folders), + desc="Cleaning projects", + unit="project", + disable=args.verbose > 0, # tqdm is disabled if verbose output is on + mininterval=0.01, # More frequent updates, similar to the folder identification step + ): + processed_project_path = future_to_project_path_map[future] + try: + project_name, project_logs, project_errors = future.result() + processed_project_data.append( + (project_name, project_logs, project_errors, processed_project_path) + ) + except Exception as exc: + # Log critical errors during processing immediately, as they might prevent log collection + crit_error_msg = ( + f"Critical error during processing of project {processed_project_path.name}: {exc}" + ) + logger.error(crit_error_msg) + if args.verbose > 0: + print(crit_error_msg) + # Store a placeholder for sorted output + processed_project_data.append( + (processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path) + ) + except KeyboardInterrupt: + print("\nInterrupted by user. Attempting to shut down workers...") + logger.warning("Interrupted by user. 
Attempting to shut down workers...") + executor.shutdown(wait=False, cancel_futures=True) + sys.exit(1) + + # Sort all collected data by project name + processed_project_data.sort(key=lambda x: x[0]) + + # Log the collected and sorted data + for project_name, project_logs, project_parsing_errors, _project_path in processed_project_data: + # Log messages collected by the cleaner + for log_msg_from_buffer in project_logs: + logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner + if args.verbose > 0: # Print to console if verbose + print(log_msg_from_buffer) + + # Log parsing errors, ensuring they are associated with the project + if project_parsing_errors: + for err_str in project_parsing_errors: + error_log_message = f"[{project_name}] Config Error: {err_str}" + logger.warning(error_log_message) # Use warning for parsing/config errors if args.verbose > 0: - print(crit_error_msg) - # Store a placeholder for sorted output - processed_project_data.append( - (processed_project_path.name, [], [f"Critical error: {exc}"], processed_project_path) - ) - except KeyboardInterrupt: - print("\nInterrupted by user. Attempting to shut down workers...") - logger.warning("Interrupted by user. Attempting to shut down workers...") - executor.shutdown(wait=False, cancel_futures=True) - sys.exit(1) - - # Sort all collected data by project name - processed_project_data.sort(key=lambda x: x[0]) - - # Log the collected and sorted data - for project_name, project_logs, project_parsing_errors, _project_path in processed_project_data: - # Log messages collected by the cleaner - for log_msg_from_buffer in project_logs: - logger.info(log_msg_from_buffer) # Already formatted with [ProjectName] prefix by ProjectCleaner - if args.verbose > 0: # Print to console if verbose - print(log_msg_from_buffer) - - # Log parsing errors, ensuring they are associated with the project - if project_parsing_errors: - for err_str in project_parsing_errors: - error_log_message = f"[{project_name}] Config Error: {err_str}" - logger.warning(error_log_message) # Use warning for parsing/config errors - if args.verbose > 0: - print(error_log_message) - - logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project - - final_msg = "\nCleanup process completed." - logger.info(final_msg) - if args.verbose > 0: - print(final_msg) + print(error_log_message) + + logger.info(f"[{project_name}] Processing completed.") # Log overall completion for this project + + final_msg = "\nCleanup process completed." 
+        logger.info(final_msg)
+        if args.verbose > 0:
+            print(final_msg)
 
 
 if __name__ == "__main__":

From e03bec4ca875df949b8ad270ca11cc28ec12e8db Mon Sep 17 00:00:00 2001
From: David Baines
Date: Mon, 25 Aug 2025 12:40:07 +0100
Subject: [PATCH 09/10] Remove ENV import and default directory cleaning

---
 silnlp/common/clean_projects.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py
index 27542321..997084e7 100644
--- a/silnlp/common/clean_projects.py
+++ b/silnlp/common/clean_projects.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import argparse
+from ast import Raise
 import concurrent.futures
 import fnmatch
 import logging
@@ -9,11 +10,13 @@
 from pathlib import Path
 from typing import Optional
 
+# from silnlp.common.environment import SIL_NLP_ENV
 from machine.corpora import FileParatextProjectSettingsParser, ParatextProjectSettings
 from tqdm import tqdm
 
 # --- Global Constants ---
-PROJECTS_FOLDER_DEFAULT = "M:/Paratext/projects"
+
+# PROJECTS_FOLDER_DEFAULT = SIL_NLP_ENV.pt_projects_dir
 logger = logging.getLogger(__name__)
 SETTINGS_FILENAME = "Settings.xml"
 
@@ -332,8 +335,7 @@ def main():
     parser.add_argument(
         "folders",
         nargs="*",
-        help="One or more Paratext project root directories to clean. If not specified, \
-            uses SIL_NLP_ENV.pt_projects_dir.",
+        help="One or more Paratext project root directories to clean.",
     )
     parser.add_argument(
         "--dry-run",
@@ -351,17 +353,10 @@ def main():
     parser.add_argument("--log-file", help="Path to a file to log actions and verbose information.")
     args = parser.parse_args()
 
-    # --- Import environment if needed ---
-    if not args.folders:
-        try:
-            from silnlp.common.environment import SIL_NLP_ENV
-
-            projects_root_path = [Path(SIL_NLP_ENV.pt_projects_dir)]
-        except ImportError as e:
-            print(f"Could not import SIL_NLP_ENV from environment.py. {e}")
-            sys.exit(1)
-    else:
+    if args.folders:
         projects_root_paths = [Path(folder) for folder in args.folders]
+    else:
+        raise ValueError("At least one project folder must be specified as an argument.")
 
     # --- Configure Logging ---
     log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
@@ -469,9 +464,6 @@ def main():
 
     processed_project_data: [list[tuple[str, list[str], list[str], Path]]] = []
 
-    # Concurrently process each project folder for cleaning
-    # Re-use max_workers from the previous section, or define a new one if desired.
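By PATCH 09 the script applies the same concurrency idiom twice, once to identify project folders and once to clean them: submit one task per item to a ThreadPoolExecutor, drain results with as_completed under a tqdm progress bar, and on Ctrl+C shut the pool down with cancel_futures=True, which requires Python 3.9 or later. The sketch below condenses that idiom into a single self-contained function; the worker mirrors has_settings_file and the ten-worker default matches the script, but the function itself is illustrative rather than a copy of the final code.

    import concurrent.futures
    import sys
    from pathlib import Path

    from tqdm import tqdm

    def has_settings_file(folder: Path) -> bool:
        # Same check the script performs, accepting either filename case.
        return (folder / "Settings.xml").is_file() or (folder / "settings.xml").is_file()

    def classify_folders(folders: list[Path], max_workers: int = 10) -> tuple[list[Path], list[Path]]:
        projects: list[Path] = []
        others: list[Path] = []
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
        try:
            future_to_folder = {executor.submit(has_settings_file, f): f for f in folders}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_folder),
                total=len(folders),
                desc="Identifying project folders",
                unit="folder",
            ):
                folder = future_to_folder[future]
                try:
                    (projects if future.result() else others).append(folder)
                except Exception:
                    others.append(folder)  # a failed check counts as "not a project"
            executor.shutdown()
        except KeyboardInterrupt:
            # Mirror the script's Ctrl+C handling: cancel queued work and exit.
            executor.shutdown(wait=False, cancel_futures=True)  # Python 3.9+
            sys.exit(1)
        return projects, others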
- try: with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: # Store future to project_path to retrieve the original Path object for robust error messages From 99bd2db23d346207e5ecf48596d5dc02a1ea6113 Mon Sep 17 00:00:00 2001 From: David Baines Date: Wed, 10 Sep 2025 16:55:15 +0100 Subject: [PATCH 10/10] Remove comments --- silnlp/common/clean_projects.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/silnlp/common/clean_projects.py b/silnlp/common/clean_projects.py index 997084e7..e1cb33af 100644 --- a/silnlp/common/clean_projects.py +++ b/silnlp/common/clean_projects.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -from ast import Raise import concurrent.futures import fnmatch import logging @@ -10,13 +9,11 @@ from pathlib import Path from typing import Optional -# from silnlp.common.environment import SIL_NLP_ENV from machine.corpora import FileParatextProjectSettingsParser, ParatextProjectSettings from tqdm import tqdm # --- Global Constants --- -# PROJECTS_FOLDER_DEFAULT = SIL_NLP_ENV.pt_projects_dir logger = logging.getLogger(__name__) SETTINGS_FILENAME = "Settings.xml"