Skip to content

Commit

Permalink
Resolve #32 by using revision cache more efficiently
Browse files Browse the repository at this point in the history
  • Loading branch information
GJFR committed Oct 22, 2024
1 parent 6335b0f commit 0a61592
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 45 deletions.
2 changes: 0 additions & 2 deletions bci/browser/binary/vendors/firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
from bci import cli, util
from bci.browser.binary.artisanal_manager import ArtisanalBuildManager
from bci.browser.binary.binary import Binary
from bci.version_control.states.revisions.firefox import (BINARY_AVAILABILITY_MAPPING,
REVISION_NUMBER_MAPPING)
from bci.version_control.states.state import State

logger = logging.getLogger('bci')
Expand Down
14 changes: 11 additions & 3 deletions bci/database/mongo/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime, timezone

from flatten_dict import flatten
from pymongo import MongoClient
from pymongo import ASCENDING, MongoClient
from pymongo.collection import Collection
from pymongo.errors import ServerSelectionTimeoutError

Expand Down Expand Up @@ -83,18 +83,26 @@ def disconnect():

@staticmethod
def __initialize_collections():
for collection_name in ['chromium_binary_availability', 'firefox_central_binary_availability']:
for collection_name in ['chromium_binary_availability']:
if collection_name not in DB.list_collection_names():
DB.create_collection(collection_name)

# Binary cache
if 'fs.files' not in DB.list_collection_names():
# Create the 'fs.files' collection with indexes
DB.create_collection('fs.files', {})
DB.create_collection('fs.files')
DB['fs.files'].create_index(['state_type', 'browser_name', 'state_index', 'relative_file_path'], unique=True)
if 'fs.chunks' not in DB.list_collection_names():
# Create the 'fs.chunks' collection with zstd compression
DB.create_collection('fs.chunks', storageEngine={'wiredTiger': {'configString': 'block_compressor=zstd'}})
DB['fs.chunks'].create_index(['files_id', 'n'], unique=True)

# Revision cache
if 'firefox_binary_availability' not in DB.list_collection_names():
DB.create_collection('firefox_binary_availability')
DB['firefox_binary_availability'].create_index([('revision_number', ASCENDING)])
DB['firefox_binary_availability'].create_index(['node'])

def get_collection(self, name: str):
if name not in DB.list_collection_names():
logger.info(f"Collection '{name}' does not exist, creating it...")
Expand Down
64 changes: 64 additions & 0 deletions bci/database/mongo/revision_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import logging
from typing import Optional

from pymongo import ASCENDING, DESCENDING

from bci.database.mongo.mongodb import MongoDB

logger = logging.getLogger(__name__)


class RevisionCache:
    """
    Accessors for the Firefox revision metadata cached in MongoDB.

    Every method works on the 'firefox_binary_availability' collection of the
    database exposed by `MongoDB.get_instance().db`.
    """

    # NOTE(review): the collection setup shown in this commit indexes 'revision_number' and 'node',
    # while the lookups below filter on 'revision_id' -- confirm the stored field name.

    @staticmethod
    def store_firefox_binary_availability(data: dict) -> None:
        """
        Replace the cached documents with the values of `data`.

        The cache is considered up to date when it already holds as many documents as `data`
        has values; nothing is written in that case. Nothing is written for empty `data` either.
        """
        values = list(data.values())
        if not values:
            # insert_many() rejects an empty list, so returning here keeps an empty input
            # from wiping the existing cache through delete_many() and then failing.
            logger.warning('Revision Cache was not updated: no documents were provided.')
            return

        collection = MongoDB.get_instance().db['firefox_binary_availability']

        if (n := len(values)) == collection.count_documents({}):
            logger.debug(f'Revision Cache was not updated ({n} documents).')
            return

        collection.delete_many({})
        collection.insert_many(values)
        logger.info(f'Revision Cache was updated ({n} documents).')

    @staticmethod
    def firefox_get_revision_number(revision_id: str) -> int:
        """
        Return the 'revision_number' of the document whose 'revision_id' equals `revision_id`.

        Raises TypeError when no document matches, because `find_one` then returns None.
        """
        collection = MongoDB.get_instance().db['firefox_binary_availability']
        result = collection.find_one({'revision_id': revision_id}, {'revision_number': 1})
        return result['revision_number']

    @staticmethod
    def firefox_has_binary_for(revision_nb: Optional[int], revision_id: Optional[str]) -> bool:
        """
        Return True if a cached document exists for the given revision.

        The revision number takes precedence; the revision id is only used when no number is given.
        Raises AttributeError when neither is provided.
        """
        # Build the query before touching the database so invalid input fails fast.
        if revision_nb is not None:
            query = {'revision_number': revision_nb}
        elif revision_id:
            query = {'revision_id': revision_id}
        else:
            raise AttributeError('No revision number or id was provided')

        collection = MongoDB.get_instance().db['firefox_binary_availability']
        return collection.find_one(query) is not None

    @staticmethod
    def firefox_get_binary_info(revision_id: str) -> Optional[dict]:
        """
        Return the 'files_url' and 'app_version' fields of the document matching `revision_id`,
        or None when no document matches.
        """
        collection = MongoDB.get_instance().db['firefox_binary_availability']
        return collection.find_one({'revision_id': revision_id}, {'files_url': 1, 'app_version': 1})

    @staticmethod
    def firefox_get_previous_and_next_revision_nb_with_binary(
        revision_nb: int,
    ) -> tuple[Optional[int], Optional[int]]:
        """
        Return the closest cached revision numbers below and above `revision_nb`.

        Either element is None when the cache holds no document on that side.
        """
        collection = MongoDB.get_instance().db['firefox_binary_availability']

        # Highest revision number that is still lower than the requested one.
        previous_document = collection.find_one(
            {'revision_number': {'$lt': revision_nb}},
            {'revision_number': 1},
            sort=[('revision_number', DESCENDING)],
        )
        # Lowest revision number that is higher than the requested one.
        next_document = collection.find_one(
            {'revision_number': {'$gt': revision_nb}},
            {'revision_number': 1},
            sort=[('revision_number', ASCENDING)],
        )

        return (
            previous_document['revision_number'] if previous_document else None,
            next_document['revision_number'] if next_document else None,
        )
2 changes: 1 addition & 1 deletion bci/distribution/worker_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def start_container_thread():
'/dev/shm:/dev/shm',
],
)
logger.debug(f"Container '{container_name}' finished experiments with parameters '{repr(params)}'")
logger.debug(f"Container '{container_name}' finished experiments for '{params.state}'")
Clients.push_results_to_all()
except docker.errors.ContainerError:
logger.error(
Expand Down
3 changes: 3 additions & 0 deletions bci/master.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import bci.database.mongo.container as mongodb_container
from bci.configuration import Global
from bci.database.mongo.mongodb import MongoDB, ServerException
from bci.database.mongo.revision_cache import RevisionCache
from bci.distribution.worker_manager import WorkerManager
from bci.evaluations.custom.custom_evaluation import CustomEvaluationFramework
from bci.evaluations.evaluation_framework import EvaluationFramework
Expand All @@ -18,6 +19,7 @@
from bci.search_strategy.composite_search import CompositeSearch
from bci.search_strategy.sequence_strategy import SequenceFinished, SequenceStrategy
from bci.version_control.factory import StateFactory
from bci.version_control.states.revisions.firefox import BINARY_AVAILABILITY_MAPPING
from bci.web.clients import Clients

logger = logging.getLogger(__name__)
Expand All @@ -41,6 +43,7 @@ def __init__(self):
Global.initialize_folders()
self.db_connection_params = Global.get_database_params()
self.connect_to_database(self.db_connection_params)
RevisionCache.store_firefox_binary_availability(BINARY_AVAILABILITY_MAPPING) # TODO: find better place
self.inititialize_available_evaluation_frameworks()
logger.info('BugHog is ready!')

Expand Down
17 changes: 15 additions & 2 deletions bci/search_strategy/bgb_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,14 @@ def next(self) -> State:

pairs = list(zip(self._completed_states, self._completed_states[1:]))
while pairs:
furthest_pair = max(pairs, key=lambda x: x[1].index - x[0].index)
filtered_pairs = [pair for pair in pairs if not self._pair_is_in_unavailability_gap(pair)]
furthest_pair = max(filtered_pairs, key=lambda x: x[1].index - x[0].index)
splitter_state = self._find_best_splitter_state(furthest_pair[0], furthest_pair[1])
if splitter_state is None:
self._unavailability_gap_pairs.add(furthest_pair)
elif splitter_state:
logger.debug(
f"Splitting [{furthest_pair[0].index}]--/{splitter_state.index}/--[{furthest_pair[1].index}]"
f'Splitting [{furthest_pair[0].index}]--/{splitter_state.index}/--[{furthest_pair[1].index}]'
)
self._add_state(splitter_state)
return splitter_state
Expand All @@ -76,3 +77,15 @@ def _state_is_in_unavailability_gap(self, state: State) -> bool:
if pair[0].index < state.index < pair[1].index:
return True
return False

def _pair_is_in_unavailability_gap(self, pair: tuple[State, State]) -> bool:
    """
    Tell whether both states of `pair` lie strictly between the two states of any pair
    recorded in `self._unavailability_gap_pairs` (compared by `index`).
    """

    def strictly_inside(state: State, gap: tuple[State, State]) -> bool:
        # Strict on both sides: a state sitting on a gap boundary does not count as inside.
        return gap[0].index < state.index < gap[1].index

    return any(
        strictly_inside(pair[0], gap) and strictly_inside(pair[1], gap)
        for gap in self._unavailability_gap_pairs
    )
68 changes: 42 additions & 26 deletions bci/search_strategy/sequence_strategy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from abc import abstractmethod
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

from bci.version_control.factory import StateFactory
Expand Down Expand Up @@ -64,6 +64,14 @@ def _find_closest_state_with_available_binary(self, target: State, boundaries: t
if target.has_available_binary():
return target

try:
if state := self.__get_closest_available_state(target, boundaries):
return state
else:
return None
except FunctionalityNotAvailable:
pass

def index_has_available_binary(index: int) -> Optional[State]:
state = self._state_factory.create_state(index)
if state.has_available_binary():
Expand All @@ -74,36 +82,44 @@ def index_has_available_binary(index: int) -> Optional[State]:
diff = 1
first_state, last_state = boundaries
best_splitter_index = target.index
while (best_splitter_index - diff - 1) > first_state.index or (best_splitter_index + diff + 1) < last_state.index:
threads = []
for offset in (-diff, diff, - 1 - diff, 1 + diff):
target_index = best_splitter_index + offset
if first_state.index < target_index < last_state.index:
thread = ThreadWithReturnValue(target=index_has_available_binary, args=(target_index,))
thread.start()
threads.append(thread)

for thread in threads:
state = thread.join()
if state:
return state
while (best_splitter_index - diff) > first_state.index or (best_splitter_index + diff) < last_state.index:
with ThreadPoolExecutor(max_workers=6) as executor:
futures = []
for offset in (-diff, diff, - 1 - diff, 1 + diff, - 2 - diff, 2 + diff):
target_index = best_splitter_index + offset
if first_state.index < target_index < last_state.index:
futures.append(executor.submit(index_has_available_binary, target_index))

for future in futures:
state = future.result()
if state:
return state

diff += 2
return None


class SequenceFinished(Exception):
pass
def __get_closest_available_state(self, target: State, boundaries: tuple[State, State]) -> State | None:
"""
Return the closest state with an available binary.
"""
try:
states = target.get_previous_and_next_state_with_binary()
states = [state for state in states if state is not None]
ordered_states = sorted(states, key=lambda x: abs(target.revision_nb - x.revision_nb))

for state in ordered_states:
if boundaries[0].revision_nb < state.revision_nb < boundaries[1].revision_nb:
return state

return None

class ThreadWithReturnValue(Thread):
def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, Verbose=None):
Thread.__init__(self, group=group, target=target, name=name, args=args, kwargs=kwargs)
self._return = None
except NotImplementedError as e:
raise FunctionalityNotAvailable() from e

def run(self):
if self._target is not None:
self._return = self._target(*self._args, **self._kwargs)

def join(self, *args):
Thread.join(self, *args)
return self._return
class SequenceFinished(Exception):
pass

class FunctionalityNotAvailable(Exception):
pass
27 changes: 17 additions & 10 deletions bci/version_control/states/revisions/firefox.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Optional

from bci.database.mongo.revision_cache import RevisionCache
from bci.util import request_json
from bci.version_control.states.revisions.base import BaseRevision
from bci.version_control.states.state import State

BINARY_AVAILABILITY_URL = 'https://distrinet.pages.gitlab.kuleuven.be/users/gertjan-franken/bughog-revision-metadata/firefox_binary_availability.json'
REVISION_NUMBER_MAPPING_URL = 'https://distrinet.pages.gitlab.kuleuven.be/users/gertjan-franken/bughog-revision-metadata/firefox_revision_nb_to_id.json'
Expand All @@ -22,22 +24,27 @@ def browser_name(self) -> str:
return 'firefox'

def has_online_binary(self) -> bool:
if self._revision_id:
return self._revision_id in BINARY_AVAILABILITY_MAPPING
if self._revision_nb:
return str(self._revision_nb) in REVISION_NUMBER_MAPPING
raise AttributeError('Cannot check binary availability without a revision id or revision number')
return RevisionCache.firefox_has_binary_for(revision_nb=self.revision_nb, revision_id=self._revision_id)

def get_online_binary_url(self) -> str:
binary_base_url = BINARY_AVAILABILITY_MAPPING[self._revision_id]['files_url']
app_version = BINARY_AVAILABILITY_MAPPING[self._revision_id]['app_version']
result = RevisionCache.firefox_get_binary_info(self._revision_id)
binary_base_url = result['files_url']
app_version = result['app_version']
binary_url = f'{binary_base_url}firefox-{app_version}.en-US.linux-x86_64.tar.bz2'
return binary_url

def get_previous_and_next_state_with_binary(self) -> tuple[State, State]:
    """
    Return the Firefox revisions directly before and after this one, as reported by
    `RevisionCache.firefox_get_previous_and_next_revision_nb_with_binary`.

    An element is None when the cache reports no (truthy) revision number for that side.
    """
    neighbouring_revision_nbs = RevisionCache.firefox_get_previous_and_next_revision_nb_with_binary(
        self.revision_nb
    )
    previous_nb, next_nb = neighbouring_revision_nbs

    previous_state = FirefoxRevision(revision_nb=previous_nb) if previous_nb else None
    next_state = FirefoxRevision(revision_nb=next_nb) if next_nb else None
    return (previous_state, next_state)

def _fetch_missing_data(self):
    """
    Fill in whichever of the revision id / revision number is still unknown.

    The id is resolved from `REVISION_NUMBER_MAPPING` (keyed by the number as a string);
    the number is resolved from the revision cache by id.
    """
    if self._revision_id is None:
        self._revision_id = REVISION_NUMBER_MAPPING.get(str(self._revision_nb), None)
    if self._revision_nb is None:
        # The looked-up number has to be stored; a bare call would leave _revision_nb unset.
        self._revision_nb = RevisionCache.firefox_get_revision_number(self._revision_id)
3 changes: 3 additions & 0 deletions bci/version_control/states/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ def has_available_binary(self) -> bool:
self.condition = StateCondition.UNAVAILABLE
return has_available_binary

def get_previous_and_next_state_with_binary(self) -> tuple[State, State]:
    """
    Return the neighbouring states that have a binary.

    This default implementation always raises NotImplementedError.
    """
    message = f'This function is not implemented for {self}'
    raise NotImplementedError(message)

def __repr__(self) -> str:
return f'State(index={self.index})'

Expand Down
2 changes: 1 addition & 1 deletion bci/version_control/states/versions/firefox.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def __init__(self, major_version: int):
def _get_rev_nb(self) -> int:
return get_release_revision_number(self.major_version)

def _get_rev_id(self):
def _get_rev_id(self) -> str:
return get_release_revision_id(self.major_version)

@property
Expand Down

0 comments on commit 0a61592

Please sign in to comment.