diff --git a/bci/browser/binary/vendors/chromium.py b/bci/browser/binary/vendors/chromium.py index 8511b73..f7a99a0 100644 --- a/bci/browser/binary/vendors/chromium.py +++ b/bci/browser/binary/vendors/chromium.py @@ -68,11 +68,23 @@ def download_binary(self): bin_path = self.get_potential_bin_path() os.makedirs(os.path.dirname(bin_path), exist_ok=True) unzipped_folder_path = os.path.join(os.path.dirname(zip_file_path), "chrome-linux") + self.__remove_unnecessary_files(unzipped_folder_path) util.safe_move_dir(unzipped_folder_path, os.path.dirname(bin_path)) cli.execute_and_return_status("chmod -R a+x %s" % os.path.dirname(bin_path)) # Remove temporary files in /tmp/COMMIT_POS shutil.rmtree(os.path.dirname(zip_file_path)) + def __remove_unnecessary_files(self, binary_folder_path: str) -> None: + """ + Remove binary files that are not necessary for default usage of the browser. + This is to improve performance, especially when caching binary files. + + :param binary_folder_path: Path to the folder where the binary files are stored. 
+ """ + locales_folder_path = os.path.join(binary_folder_path, 'locales') + if os.path.isdir(locales_folder_path): + util.remove_all_in_folder(locales_folder_path, except_files=['en-GB.pak', 'en-US.pak']) + def _get_version(self) -> str: command = "./chrome --version" if bin_path := self.get_bin_path(): @@ -86,11 +98,11 @@ def _get_version(self) -> str: @staticmethod def list_downloaded_binaries() -> list[dict[str, str]]: - return Binary.list_downloaded_binaries(BIN_FOLDER_PATH) + return Binary._list_downloaded_binaries(BIN_FOLDER_PATH) @staticmethod def get_artisanal_manager() -> ArtisanalBuildManager: - return Binary.get_artisanal_manager(BIN_FOLDER_PATH, EXECUTABLE_NAME) + return Binary._get_artisanal_manager(BIN_FOLDER_PATH, EXECUTABLE_NAME) browser_version_to_driver_version = { '88': "88.0.4324.96", diff --git a/bci/database/mongo/binary_cache.py b/bci/database/mongo/binary_cache.py index 7a98434..bf19086 100644 --- a/bci/database/mongo/binary_cache.py +++ b/bci/database/mongo/binary_cache.py @@ -69,7 +69,7 @@ def write_from_db(file_path: str, grid_file_id: str) -> None: return True @staticmethod - def store_binary_files(binary_executable_path: str, state: State) -> bool: + def store_binary_files(binary_executable_path: str, state: State): """ Stores the files in the folder of the given path in the database. 
@@ -80,35 +80,60 @@ def store_binary_files(binary_executable_path: str, state: State) -> bool: if MongoDB().binary_cache_limit <= 0: return False - while BinaryCache.__count_cached_binaries() >= MongoDB.binary_cache_limit: + while BinaryCache.__count_cached_binaries() >= MongoDB().binary_cache_limit: if BinaryCache.__count_cached_binaries(state_type='revision') <= 0: # There are only version binaries in the cache, which will never be removed return False BinaryCache.__remove_least_used_revision_binary_files() + logger.debug(f"Caching binary files for {state}...") fs = MongoDB().gridfs + binary_folder_path = os.path.dirname(binary_executable_path) + last_access_ts = datetime.datetime.now() + def store_file(file_path: str) -> None: + # Max chunk size is 16 MB (meta-data included) + chunk_size = 1024 * 1024 * 15 + with open(file_path, 'rb') as file: + file_id = fs.new_file( + file_type='binary', + browser_name=state.browser_name, + state_type=state.type, + state_index=state.index, + relative_file_path=os.path.relpath(file_path, binary_folder_path), + access_count=0, + last_access_ts=last_access_ts, + chunk_size=chunk_size + ) + while chunk := file.read(chunk_size): + file_id.write(chunk) + file_id.close() + start_time = time.time() - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [] for root, _, files in os.walk(binary_folder_path): for file in files: file_path = os.path.join(root, file) - with open(file_path, 'rb') as file: - executor.submit( - fs.put, - file.read(), - file_type='binary', - browser_name=state.browser_name, - state_type=state.type, - state_index=state.index, - relative_file_path=os.path.relpath(file_path, binary_folder_path), - access_count=0, - last_access_ts=datetime.datetime.now(), - ) + future = executor.submit(store_file, file_path) + futures.append(future) + logger.debug(f"Number of files to cache: {len(futures)}") 
executor.shutdown(wait=True) - elapsed_time = time.time() - start_time - logger.debug(f'Stored binary in {elapsed_time:.2f}s') - return True + + futures_with_exception = [future for future in futures if future.exception() is not None] + if futures_with_exception: + logger.error( + ( + f"Something went wrong caching binary files for {state}, " + "Removing possibly incomplete binary files from cache." + ), + exc_info=futures_with_exception[0].exception() + ) + BinaryCache.__remove_revision_binary_files(state.type, state.index) + logger.debug(f"Removed possibly incomplete cached binary files for {state}.") + else: + elapsed_time = time.time() - start_time + logger.debug(f'Stored binary in {elapsed_time:.2f}s') @staticmethod def __count_cached_binaries(state_type: Optional[str] = None) -> int: @@ -130,7 +155,6 @@ def __remove_least_used_revision_binary_files() -> None: """ Removes the least used revision binary files from the database. """ - fs = MongoDB().gridfs files_collection = MongoDB().get_collection('fs.files') grid_cursor = files_collection.find( @@ -139,6 +163,16 @@ def __remove_least_used_revision_binary_files() -> None: ) for state_doc in grid_cursor: state_index = state_doc['state_index'] - for grid_doc in files_collection.find({'state_index': state_index, 'state_type': 'revision'}): - fs.delete(grid_doc['_id']) + BinaryCache.__remove_revision_binary_files('revision', state_index) break + + @staticmethod + def __remove_revision_binary_files(state_type: str, state_index: int) -> None: + """ + Removes the binary files associated with the parameters. 
+ """ + fs = MongoDB().gridfs + files_collection = MongoDB().get_collection('fs.files') + + for grid_doc in files_collection.find({'state_index': state_index, 'state_type': state_type}): + fs.delete(grid_doc['_id']) diff --git a/bci/database/mongo/mongodb.py b/bci/database/mongo/mongodb.py index df8c567..456bad7 100644 --- a/bci/database/mongo/mongodb.py +++ b/bci/database/mongo/mongodb.py @@ -62,6 +62,7 @@ def connect(self, db_params: DatabaseParameters) -> None: retryWrites=False, serverSelectionTimeoutMS=10000, ) + self.binary_cache_limit = db_params.binary_cache_limit logger.info(f'Binary cache limit set to {db_params.binary_cache_limit}') # Force connection to check whether MongoDB server is reachable try: diff --git a/bci/util.py b/bci/util.py index 3175ca1..f91bf5d 100644 --- a/bci/util.py +++ b/bci/util.py @@ -7,6 +7,7 @@ import os import shutil import time +from typing import Optional import requests @@ -42,11 +43,13 @@ def copy_folder(src_path, dst_path): shutil.copytree(src_path, dst_path, dirs_exist_ok=True) -def remove_all_in_folder(folder_path: str) -> None: +def remove_all_in_folder(folder_path: str, except_files: Optional[list[str]]=None) -> None: + except_files = [] if except_files is None else except_files for root, dirs, files in os.walk(folder_path): for file_name in files: file_path = os.path.join(root, file_name) - os.remove(file_path) + if file_name not in except_files: + os.remove(file_path) for dir_name in dirs: dir_path = os.path.join(root, dir_name) shutil.rmtree(dir_path)