Skip to content

Commit

Permalink
Enable browser caching and improve performance.
Browse files Browse the repository at this point in the history
Caching improvements:
- Chunked uploading to database
- Ignore unnecessary locale files

Additionally, some code cleanup.
  • Loading branch information
GJFR committed Dec 4, 2024
1 parent eff772b commit b5ba2ab
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 25 deletions.
16 changes: 14 additions & 2 deletions bci/browser/binary/vendors/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,23 @@ def download_binary(self):
bin_path = self.get_potential_bin_path()
os.makedirs(os.path.dirname(bin_path), exist_ok=True)
unzipped_folder_path = os.path.join(os.path.dirname(zip_file_path), "chrome-linux")
self.__remove_unnecessary_files(unzipped_folder_path)
util.safe_move_dir(unzipped_folder_path, os.path.dirname(bin_path))
cli.execute_and_return_status("chmod -R a+x %s" % os.path.dirname(bin_path))
# Remove temporary files in /tmp/COMMIT_POS
shutil.rmtree(os.path.dirname(zip_file_path))

def __remove_unnecessary_files(self, binary_folder_path: str) -> None:
    """
    Strip files that are not needed for default usage of the browser.

    Keeping the binary folder small improves performance, especially when
    the binary files are cached.

    :param binary_folder_path: Path to the folder where the binary files are stored.
    """
    locales_dir = os.path.join(binary_folder_path, 'locales')
    if not os.path.isdir(locales_dir):
        return
    # Keep only the English locale packs; every other .pak file is dropped.
    util.remove_all_in_folder(locales_dir, except_files=['en-GB.pak', 'en-US.pak'])

def _get_version(self) -> str:
command = "./chrome --version"
if bin_path := self.get_bin_path():
Expand All @@ -86,11 +98,11 @@ def _get_version(self) -> str:

@staticmethod
def list_downloaded_binaries() -> list[dict[str, str]]:
    """Return metadata for the Chromium binaries available in the local binary folder."""
    # Delegate to the shared Binary helper, scoped to this vendor's folder.
    return Binary._list_downloaded_binaries(BIN_FOLDER_PATH)

@staticmethod
def get_artisanal_manager() -> ArtisanalBuildManager:
    """Create the artisanal-build manager for this vendor's folder and executable name."""
    # Delegate to the shared Binary helper with Chromium-specific constants.
    return Binary._get_artisanal_manager(BIN_FOLDER_PATH, EXECUTABLE_NAME)

browser_version_to_driver_version = {
'88': "88.0.4324.96",
Expand Down
76 changes: 55 additions & 21 deletions bci/database/mongo/binary_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def write_from_db(file_path: str, grid_file_id: str) -> None:
return True

@staticmethod
def store_binary_files(binary_executable_path: str, state: State) -> bool:
def store_binary_files(binary_executable_path: str, state: State):
"""
Stores the files in the folder of the given path in the database.
Expand All @@ -80,35 +80,60 @@ def store_binary_files(binary_executable_path: str, state: State) -> bool:
if MongoDB().binary_cache_limit <= 0:
return False

while BinaryCache.__count_cached_binaries() >= MongoDB.binary_cache_limit:
while BinaryCache.__count_cached_binaries() >= MongoDB().binary_cache_limit:
if BinaryCache.__count_cached_binaries(state_type='revision') <= 0:
# There are only version binaries in the cache, which will never be removed
return False
BinaryCache.__remove_least_used_revision_binary_files()

logger.debug(f"Caching binary files for {state}...")
fs = MongoDB().gridfs

binary_folder_path = os.path.dirname(binary_executable_path)
last_access_ts = datetime.datetime.now()
def store_file(file_path: str) -> None:
# Max chunk size is 16 MB (meta-data included)
chunk_size = 1024 * 1024 * 15
with open(file_path, 'rb') as file:
file_id = fs.new_file(
file_type='binary',
browser_name=state.browser_name,
state_type=state.type,
state_index=state.index,
relative_file_path=os.path.relpath(file_path, binary_folder_path),
access_count=0,
last_access_ts=last_access_ts,
chunk_size=chunk_size
)
while chunk := file.read(chunk_size):
file_id.write(chunk)
file_id.close()

start_time = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
futures = []
for root, _, files in os.walk(binary_folder_path):
for file in files:
file_path = os.path.join(root, file)
with open(file_path, 'rb') as file:
executor.submit(
fs.put,
file.read(),
file_type='binary',
browser_name=state.browser_name,
state_type=state.type,
state_index=state.index,
relative_file_path=os.path.relpath(file_path, binary_folder_path),
access_count=0,
last_access_ts=datetime.datetime.now(),
)
future = executor.submit(store_file, file_path)
futures.append(future)
logger.debug(f"Number of files to cache: {len(futures)}")
executor.shutdown(wait=True)
elapsed_time = time.time() - start_time
logger.debug(f'Stored binary in {elapsed_time:.2f}s')
return True

futures_with_exception = [future for future in futures if future.exception() is not None]
if futures_with_exception:
logger.error(
(
f"Something went wrong caching binary files for {state}, "
"Removing possibly imcomplete binary files from cache."
),
exc_info=futures_with_exception[0].exception()
)
BinaryCache.__remove_revision_binary_files(state.type, state.index)
logger.debug(f"Removed possibly incomplete cached binary files for {state}.")
else:
elapsed_time = time.time() - start_time
logger.debug(f'Stored binary in {elapsed_time:.2f}s')

@staticmethod
def __count_cached_binaries(state_type: Optional[str] = None) -> int:
Expand All @@ -130,7 +155,6 @@ def __remove_least_used_revision_binary_files() -> None:
"""
Removes the least used revision binary files from the database.
"""
fs = MongoDB().gridfs
files_collection = MongoDB().get_collection('fs.files')

grid_cursor = files_collection.find(
Expand All @@ -139,6 +163,16 @@ def __remove_least_used_revision_binary_files() -> None:
)
for state_doc in grid_cursor:
state_index = state_doc['state_index']
for grid_doc in files_collection.find({'state_index': state_index, 'state_type': 'revision'}):
fs.delete(grid_doc['_id'])
BinaryCache.__remove_revision_binary_files('revision', state_index)
break

@staticmethod
def __remove_revision_binary_files(state_type: str, state_index: int) -> None:
    """
    Remove all cached GridFS binary files matching the given state.

    :param state_type: Type of the state (e.g. 'revision') whose files are removed.
    :param state_index: Index of the state whose files are removed.
    """
    db = MongoDB()
    query = {'state_index': state_index, 'state_type': state_type}
    # Delete each matching GridFS file by its document id.
    for grid_doc in db.get_collection('fs.files').find(query):
        db.gridfs.delete(grid_doc['_id'])
1 change: 1 addition & 0 deletions bci/database/mongo/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def connect(self, db_params: DatabaseParameters) -> None:
retryWrites=False,
serverSelectionTimeoutMS=10000,
)
self.binary_cache_limit = db_params.binary_cache_limit
logger.info(f'Binary cache limit set to {db_params.binary_cache_limit}')
# Force connection to check whether MongoDB server is reachable
try:
Expand Down
7 changes: 5 additions & 2 deletions bci/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import shutil
import time
from typing import Optional

import requests

Expand Down Expand Up @@ -42,11 +43,13 @@ def copy_folder(src_path, dst_path):
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)


def remove_all_in_folder(folder_path: str, except_files: Optional[list[str]]=None) -> None:
    """
    Delete the contents of *folder_path*, keeping the folder itself.

    Sub-directories are removed wholesale with ``shutil.rmtree``, so in
    practice ``except_files`` protects files sitting directly in
    *folder_path* (directories deleted mid-walk are simply skipped by
    ``os.walk``).

    :param folder_path: Directory whose contents should be removed.
    :param except_files: File names that must be kept; defaults to keeping nothing.
    """
    keep = set(except_files) if except_files else set()
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if file_name not in keep:
                os.remove(os.path.join(root, file_name))
        for dir_name in dirs:
            shutil.rmtree(os.path.join(root, dir_name))
Expand Down

0 comments on commit b5ba2ab

Please sign in to comment.