From 77debc410fdbecef89f4468fd328ca3ca1f4ea63 Mon Sep 17 00:00:00 2001 From: Sid Date: Fri, 28 Jun 2024 11:29:56 -0700 Subject: [PATCH 1/9] Add eval_block_delay for competitions and check in ModelUpdater. --- competitions/data.py | 3 ++ constants/__init__.py | 1 + model/model_updater.py | 22 ++++++-- neurons/validator.py | 49 ++++++++++-------- .../competitions/test_competition_tracker.py | 2 + tests/competitions/test_utils.py | 3 ++ tests/model/test_model_updater.py | 51 +++++++++++++++++-- 7 files changed, 99 insertions(+), 32 deletions(-) diff --git a/competitions/data.py b/competitions/data.py index 12f3118..bae6554 100644 --- a/competitions/data.py +++ b/competitions/data.py @@ -35,6 +35,9 @@ class ModelConstraints: # The Pretrained tokenizer to use. tokenizer: str + # Block delay before evaluating uploaded models. Based on look-back period for eval data collection. + eval_block_delay: int + # Any additional arguments to pass to from_pretrained kwargs: Any = field(default_factory=dict) diff --git a/constants/__init__.py b/constants/__init__.py index 0327b99..5284639 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -75,6 +75,7 @@ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", }, + eval_block_delay=7200, # ~1 day. ), } diff --git a/model/model_updater.py b/model/model_updater.py index 65620a7..7a3d9f0 100644 --- a/model/model_updater.py +++ b/model/model_updater.py @@ -58,12 +58,15 @@ async def _get_metadata(self, hotkey: str) -> Optional[ModelMetadata]: """Get metadata about a model by hotkey""" return await self.metadata_store.retrieve_model_metadata(hotkey) - async def sync_model(self, hotkey: str, force: bool = False) -> bool: + async def sync_model( + self, hotkey: str, curr_block: int, retry_stable_metadata: bool = False + ) -> bool: """Updates local model for a hotkey if out of sync and returns if it was updated." Args: hotkey (str): The hotkey of the model to sync. 
- force (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed. + curr_block (int): The current block. + retry_stable_metadata (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed. """ # Get the metadata for the miner. metadata = await self._get_metadata(hotkey) @@ -81,15 +84,24 @@ async def sync_model(self, hotkey: str, force: bool = False) -> bool: metadata.id.competition_id, metadata.block ) if not competition: - bt.logging.trace(f"No competition found for {metadata.id.competition_id} at block {metadata.block}") - raise ValueError(f"No competition found for {metadata.id.competition_id} at block {metadata.block}") + bt.logging.trace( + f"No competition found for {metadata.id.competition_id} at block {metadata.block}" + ) + raise ValueError( + f"No competition found for {metadata.id.competition_id} at block {metadata.block}" + ) + + # Check that the metadata is old enough to meet the eval_block_delay for the competition. + # If not we return false and will check again next time we go through the update loop. + if curr_block - metadata.block < competition.constraints.eval_block_delay: + return False # Check what model id the model tracker currently has for this hotkey. tracker_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey( hotkey ) # If we are not forcing a sync due to retrying a top model we can short-circuit if no change. 
- if not force and metadata == tracker_model_metadata: + if not retry_stable_metadata and metadata == tracker_model_metadata: return False # Get the local path based on the local store to download to (top level hotkey path) diff --git a/neurons/validator.py b/neurons/validator.py index edf7244..b43d53e 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -390,31 +390,35 @@ def update_models(self): try: uid_last_retried_evaluation[uid] = dt.datetime.now() - # Redownload this model and schedule it for eval even if it isn't updated by the sync. + # Redownload this model and schedule it for eval even if it hasn't changed. + # Still respect the eval block delay so that previously top uids can't bypass it. hotkey = metagraph.hotkeys[uid] - asyncio.run( - self.model_updater.sync_model(hotkey, force=True) + should_retry = self.model_updater.sync_model( + hotkey, + metagraph.block.item(), + retry_stable_metadata=True, ) - # Since this is a top model (as determined by other valis), - # we don't worry if self.pending_uids is already "full". - # Validators should only have ~1 winner per competition and we only check bigger valis - # so there should not be many simultaneous top models not already being evaluated. - top_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey( - hotkey - ) - if top_model_metadata is not None: - bt.logging.trace( - f"Shortcutting to top model or retrying evaluation for previously discarded top model with incentive for UID={uid}" - ) - with self.pending_uids_to_eval_lock: - self.pending_uids_to_eval[ - top_model_metadata.id.competition_id - ].add(uid) - else: - bt.logging.warning( - f"Failed to find metadata for uid {uid} with hotkey {hotkey}" + if should_retry: + # Since this is a top model (as determined by other valis), + # we don't worry if self.pending_uids is already "full". 
+ # Validators should only have ~1 winner per competition and we only check bigger valis + # so there should not be many simultaneous top models not already being evaluated. + top_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey( + hotkey ) + if top_model_metadata is not None: + bt.logging.trace( + f"Shortcutting to top model or retrying evaluation for previously discarded top model with incentive for UID={uid}" + ) + with self.pending_uids_to_eval_lock: + self.pending_uids_to_eval[ + top_model_metadata.id.competition_id + ].add(uid) + else: + bt.logging.warning( + f"Failed to find metadata for uid {uid} with hotkey {hotkey}" + ) except Exception: bt.logging.debug( f"Failure in update loop for UID={uid} during top model check. {traceback.format_exc()}" @@ -468,10 +472,11 @@ def update_models(self): # Get their hotkey from the metagraph. with self.metagraph_lock: hotkey = self.metagraph.hotkeys[next_uid] + curr_block = self.metagraph.block.item() # Compare metadata and tracker, syncing new model from remote store to local if necessary. 
updated = asyncio.run( - self.model_updater.sync_model(hotkey, force=False) + self.model_updater.sync_model(hotkey, curr_block, retry_stable_metadata=False) ) if updated: diff --git a/tests/competitions/test_competition_tracker.py b/tests/competitions/test_competition_tracker.py index 4c9bbf6..214cd2b 100644 --- a/tests/competitions/test_competition_tracker.py +++ b/tests/competitions/test_competition_tracker.py @@ -16,6 +16,7 @@ class TestCompetitionTracker(unittest.TestCase): sequence_length=4096, allowed_architectures=[LlamaForCausalLM], tokenizer="Xenova/gpt-4", + eval_block_delay=7200, kwargs={}, ), reward_percentage=0.6, @@ -27,6 +28,7 @@ class TestCompetitionTracker(unittest.TestCase): sequence_length=2048, allowed_architectures=[LlamaForCausalLM], tokenizer="Xenova/gpt-4", + eval_block_delay=7200, kwargs={}, ), reward_percentage=0.4, diff --git a/tests/competitions/test_utils.py b/tests/competitions/test_utils.py index 34ec21c..acb772c 100644 --- a/tests/competitions/test_utils.py +++ b/tests/competitions/test_utils.py @@ -36,6 +36,7 @@ def test_get_model_constraints_valid_competition(self): GemmaForCausalLM, ], tokenizer="Xenova/gpt-4", + eval_block_delay=7200, kwargs={ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", @@ -65,6 +66,7 @@ def test_get_competition_for_block_valid_competition(self): GemmaForCausalLM, ], tokenizer="Xenova/gpt-4", + eval_block_delay=7200, kwargs={ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", @@ -101,6 +103,7 @@ def test_get_competition_schedule_for_block_valid_block(self): GemmaForCausalLM, ], tokenizer="Xenova/gpt-4", + eval_block_delay=7200, kwargs={ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", diff --git a/tests/model/test_model_updater.py b/tests/model/test_model_updater.py index 986b306..03f1c17 100644 --- a/tests/model/test_model_updater.py +++ b/tests/model/test_model_updater.py @@ -74,7 +74,7 @@ def test_sync_model_bad_metadata(self): # 
FakeRemoteModelStore raises a KeyError but HuggingFace may raise other exceptions. with self.assertRaises(Exception): - asyncio.run(self.model_updater.sync_model(hotkey)) + asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) def test_sync_model_same_metadata(self): hotkey = "test_hotkey" @@ -101,7 +101,7 @@ def test_sync_model_same_metadata(self): self.model_tracker.on_miner_model_updated(hotkey, model_metadata) - asyncio.run(self.model_updater.sync_model(hotkey)) + asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) # Tracker information did not change. self.assertEqual( @@ -145,7 +145,7 @@ def test_sync_model_new_metadata(self): with self.assertRaises(Exception): self.local_store.retrieve_model(hotkey, model_id, kwargs={}) - asyncio.run(self.model_updater.sync_model(hotkey)) + asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) self.assertEqual( self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey), @@ -156,6 +156,46 @@ def test_sync_model_new_metadata(self): str(model), ) + def test_sync_model_new_metadata_under_block_delay(self): + hotkey = "test_hotkey" + model_hash = "TestHash1" + model_id = ModelId( + namespace="TestPath", + name="TestModel", + competition_id=CompetitionId.SN9_MODEL, + hash=model_hash, + secure_hash=utils.get_hash_of_two_strings(model_hash, hotkey), + commit="TestCommit", + ) + model_metadata = ModelMetadata(id=model_id, block=1) + + pt_model = self.tiny_model + + model = Model(id=model_id, pt_model=pt_model) + + # Setup the metadata and remote store but not local or the model_tracker. 
+ asyncio.run( + self.metadata_store.store_model_metadata_exact(hotkey, model_metadata) + ) + asyncio.run( + self.remote_store.upload_model( + model, + competition_utils.get_model_constraints(CompetitionId.SN9_MODEL), + ) + ) + + self.assertIsNone( + self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey) + ) + + updated = asyncio.run(self.model_updater.sync_model(hotkey, curr_block=1)) + + # Tracker information did not change. + self.assertFalse(updated) + self.assertIsNone( + self.model_tracker.get_model_metadata_for_miner_hotkey(hotkey) + ) + def test_sync_model_bad_hash(self): hotkey = "test_hotkey" model_hash = "TestHash1" @@ -191,7 +231,7 @@ def test_sync_model_bad_hash(self): # Assert we fail due to the hash mismatch between the model in remote store and the metadata on chain. with self.assertRaises(ValueError) as context: - asyncio.run(self.model_updater.sync_model(hotkey)) + asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) self.assertIn("Hash", str(context.exception)) @@ -237,7 +277,7 @@ def test_sync_model_wrong_parameters(self): # Assert we fail due to not meeting the competition parameters. with self.assertRaises(ValueError) as context: - asyncio.run(self.model_updater.sync_model(hotkey)) + asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) self.assertIn("does not satisfy parameters", str(context.exception)) @@ -249,5 +289,6 @@ def test_sync_model_wrong_parameters(self): # TODO: Create test for valid competition at too early of a block once added. + if __name__ == "__main__": unittest.main() From 9069b16ddcd6b2061ea3cd9346d59fc3ee12eaf0 Mon Sep 17 00:00:00 2001 From: Sid Date: Fri, 28 Jun 2024 11:36:17 -0700 Subject: [PATCH 2/9] Add comment about metagraph usage in update_models. 
--- neurons/validator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/neurons/validator.py b/neurons/validator.py index b43d53e..5b2b171 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -355,6 +355,8 @@ def update_models(self): > constants.chain_update_cadence ): last_checked_top_models_time = dt.datetime.now() + # Take a deep copy of the metagraph for use in the top uid retry check. + # The regular loop below will use self.metagraph which may be updated as we go. with self.metagraph_lock: metagraph = copy.deepcopy(self.metagraph) @@ -476,7 +478,9 @@ def update_models(self): # Compare metadata and tracker, syncing new model from remote store to local if necessary. updated = asyncio.run( - self.model_updater.sync_model(hotkey, curr_block, retry_stable_metadata=False) + self.model_updater.sync_model( + hotkey, curr_block, retry_stable_metadata=False + ) ) if updated: From a0cda2f03f9a3d3a910e75ba480a71da51113736 Mon Sep 17 00:00:00 2001 From: Sid Date: Fri, 28 Jun 2024 11:44:12 -0700 Subject: [PATCH 3/9] Add logging for eval block delay case in ModelUpdater. --- model/model_updater.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/model/model_updater.py b/model/model_updater.py index 7a3d9f0..00f8f6b 100644 --- a/model/model_updater.py +++ b/model/model_updater.py @@ -94,6 +94,11 @@ async def sync_model( # Check that the metadata is old enough to meet the eval_block_delay for the competition. # If not we return false and will check again next time we go through the update loop. if curr_block - metadata.block < competition.constraints.eval_block_delay: + bt.logging.debug( + f"""Sync for hotkey {hotkey} delayed as the current block: {curr_block} is not at least + {competition.constraints.eval_block_delay} blocks after the upload block: {metadata.block}. + Will automatically retry later.""" + ) return False # Check what model id the model tracker currently has for this hotkey. 
From f1c3fe5e786a1fadbbbfaea4785a1644ca10dc07 Mon Sep 17 00:00:00 2001 From: Sid Date: Mon, 1 Jul 2024 08:07:17 -0700 Subject: [PATCH 4/9] Reduce delay to 4 hours and keep 'force' parameter name. --- constants/__init__.py | 4 ++-- model/model_updater.py | 6 +++--- neurons/validator.py | 6 ++---- tests/competitions/test_competition_tracker.py | 4 ++-- tests/competitions/test_utils.py | 6 +++--- 5 files changed, 12 insertions(+), 14 deletions(-) diff --git a/constants/__init__.py b/constants/__init__.py index 5284639..b8d194f 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -43,7 +43,7 @@ CORTEX_WANDB_PROJECT = "cortex-t/multi-modality" CORTEX_WANDB_TYPE = "validator" CORTEX_MAX_UIDS = 256 -CORTEX_MAX_AGE = dt.timedelta(days=1) +CORTEX_MAX_AGE = dt.timedelta(hours=4) CORTEX_MIN_SCORE = 0.85 # Minimum stake to get data from a cortex validator. CORTEX_MIN_STAKE = 100_000 @@ -75,7 +75,7 @@ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", }, - eval_block_delay=7200, # ~1 day. + eval_block_delay=1200, # ~4 hours. ), } diff --git a/model/model_updater.py b/model/model_updater.py index 00f8f6b..ef03b85 100644 --- a/model/model_updater.py +++ b/model/model_updater.py @@ -59,14 +59,14 @@ async def _get_metadata(self, hotkey: str) -> Optional[ModelMetadata]: return await self.metadata_store.retrieve_model_metadata(hotkey) async def sync_model( - self, hotkey: str, curr_block: int, retry_stable_metadata: bool = False + self, hotkey: str, curr_block: int, force: bool = False ) -> bool: """Updates local model for a hotkey if out of sync and returns if it was updated." Args: hotkey (str): The hotkey of the model to sync. curr_block (int): The current block. - retry_stable_metadata (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed. + force (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed. """ # Get the metadata for the miner. 
metadata = await self._get_metadata(hotkey) @@ -106,7 +106,7 @@ async def sync_model( hotkey ) # If we are not forcing a sync due to retrying a top model we can short-circuit if no change. - if not retry_stable_metadata and metadata == tracker_model_metadata: + if not force and metadata == tracker_model_metadata: return False # Get the local path based on the local store to download to (top level hotkey path) diff --git a/neurons/validator.py b/neurons/validator.py index 5b2b171..1c872bc 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -398,7 +398,7 @@ def update_models(self): should_retry = self.model_updater.sync_model( hotkey, metagraph.block.item(), - retry_stable_metadata=True, + force=True, ) if should_retry: @@ -478,9 +478,7 @@ def update_models(self): # Compare metadata and tracker, syncing new model from remote store to local if necessary. updated = asyncio.run( - self.model_updater.sync_model( - hotkey, curr_block, retry_stable_metadata=False - ) + self.model_updater.sync_model(hotkey, curr_block, force=False) ) if updated: diff --git a/tests/competitions/test_competition_tracker.py b/tests/competitions/test_competition_tracker.py index 214cd2b..13855c4 100644 --- a/tests/competitions/test_competition_tracker.py +++ b/tests/competitions/test_competition_tracker.py @@ -16,7 +16,7 @@ class TestCompetitionTracker(unittest.TestCase): sequence_length=4096, allowed_architectures=[LlamaForCausalLM], tokenizer="Xenova/gpt-4", - eval_block_delay=7200, + eval_block_delay=1200, kwargs={}, ), reward_percentage=0.6, @@ -28,7 +28,7 @@ class TestCompetitionTracker(unittest.TestCase): sequence_length=2048, allowed_architectures=[LlamaForCausalLM], tokenizer="Xenova/gpt-4", - eval_block_delay=7200, + eval_block_delay=1200, kwargs={}, ), reward_percentage=0.4, diff --git a/tests/competitions/test_utils.py b/tests/competitions/test_utils.py index acb772c..c61c2d3 100644 --- a/tests/competitions/test_utils.py +++ b/tests/competitions/test_utils.py @@ -36,7 
+36,7 @@ def test_get_model_constraints_valid_competition(self): GemmaForCausalLM, ], tokenizer="Xenova/gpt-4", - eval_block_delay=7200, + eval_block_delay=1200, kwargs={ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", @@ -66,7 +66,7 @@ def test_get_competition_for_block_valid_competition(self): GemmaForCausalLM, ], tokenizer="Xenova/gpt-4", - eval_block_delay=7200, + eval_block_delay=1200, kwargs={ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", @@ -103,7 +103,7 @@ def test_get_competition_schedule_for_block_valid_block(self): GemmaForCausalLM, ], tokenizer="Xenova/gpt-4", - eval_block_delay=7200, + eval_block_delay=1200, kwargs={ "torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2", From ee789791b62eb5db84cfbea29c5c98a66b31c8e4 Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Mon, 1 Jul 2024 20:20:03 -0700 Subject: [PATCH 5/9] Address logging error noise in the update loop --- model/model_updater.py | 36 +++++++++++++++++++----------------- neurons/validator.py | 6 ++++-- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/model/model_updater.py b/model/model_updater.py index ef03b85..e9cec86 100644 --- a/model/model_updater.py +++ b/model/model_updater.py @@ -13,6 +13,14 @@ from model.utils import get_hash_of_two_strings +class MinerMisconfiguredError(Exception): + """Error raised when a miner is misconfigured.""" + + def __init__(self, hotkey: str, message: str): + self.hotkey = hotkey + super().__init__(f"[{hotkey}] {message}") + + class ModelUpdater: """Checks if the currently tracked model for a hotkey matches what the miner committed to the chain.""" @@ -72,11 +80,8 @@ async def sync_model( metadata = await self._get_metadata(hotkey) if not metadata: - bt.logging.trace( - f"No valid metadata found on the chain for hotkey {hotkey}" - ) - raise ValueError( - f"No valid metadata found on the chain for hotkey {hotkey}" + raise MinerMisconfiguredError( + hotkey, f"No valid 
metadata found on the chain" ) # Check that the metadata indicates a competition available at time of upload. @@ -84,11 +89,9 @@ async def sync_model( metadata.id.competition_id, metadata.block ) if not competition: - bt.logging.trace( - f"No competition found for {metadata.id.competition_id} at block {metadata.block}" - ) - raise ValueError( - f"No competition found for {metadata.id.competition_id} at block {metadata.block}" + raise MinerMisconfiguredError( + hotkey, + f"No competition found for {metadata.id.competition_id} at block {metadata.block}", ) # Check that the metadata is old enough to meet the eval_block_delay for the competition. @@ -123,16 +126,15 @@ async def sync_model( # Check that the hash of the downloaded content matches. secure_hash = get_hash_of_two_strings(model.id.hash, hotkey) if secure_hash != metadata.id.secure_hash: - bt.logging.trace( - f"Sync for hotkey {hotkey} failed. Hashes do not match of content: {secure_hash} != {metadata.id.secure_hash}." - ) - raise ValueError( - f"Sync for hotkey {hotkey} failed. Hash of content downloaded from hugging face does not match chain metadata. {metadata}" + raise MinerMisconfiguredError( + hotkey, + f"Hash of content downloaded from hugging face does not match chain metadata. 
{metadata}", ) if not ModelUpdater.verify_model_satisfies_parameters(model): - raise ValueError( - f"Sync for hotkey {hotkey} failed, model does not satisfy parameters for competition {competition.id}" + raise MinerMisconfiguredError( + hotkey, + f"Model does not satisfy parameters for competition {competition.id}", ) return True diff --git a/neurons/validator.py b/neurons/validator.py index 1c872bc..1c33ada 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -46,7 +46,7 @@ from competitions.data import CompetitionId from competitions import utils as competition_utils from model.model_tracker import ModelTracker -from model.model_updater import ModelUpdater +from model.model_updater import MinerMisconfiguredError, ModelUpdater from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore from model.storage.disk.disk_model_store import DiskModelStore from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore @@ -495,9 +495,11 @@ def update_models(self): ) else: bt.logging.warning( - f"Failed to find metadata for uid {uid} with hotkey {hotkey}" + f"Failed to find metadata for uid {next_uid} with hotkey {hotkey}" ) + except MinerMisconfiguredError as e: + bt.logging.trace(e) except Exception as e: bt.logging.error(f"Error in update loop: {e}") From 56e5ec75239ded23fbf258e6cb9d4ce82509f375 Mon Sep 17 00:00:00 2001 From: rusticluftig Date: Mon, 1 Jul 2024 20:29:34 -0700 Subject: [PATCH 6/9] Update tests --- tests/model/test_model_updater.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/model/test_model_updater.py b/tests/model/test_model_updater.py index 03f1c17..18c50f0 100644 --- a/tests/model/test_model_updater.py +++ b/tests/model/test_model_updater.py @@ -8,7 +8,7 @@ from model import utils from model.data import Model, ModelId, ModelMetadata from model.model_tracker import ModelTracker -from model.model_updater import ModelUpdater +from model.model_updater import 
MinerMisconfiguredError, ModelUpdater from model.storage.disk.disk_model_store import DiskModelStore from tests.model.storage.fake_model_metadata_store import FakeModelMetadataStore from tests.model.storage.fake_remote_model_store import FakeRemoteModelStore @@ -230,7 +230,7 @@ def test_sync_model_bad_hash(self): self.remote_store.inject_mismatched_model(model_id_chain, model) # Assert we fail due to the hash mismatch between the model in remote store and the metadata on chain. - with self.assertRaises(ValueError) as context: + with self.assertRaises(MinerMisconfiguredError) as context: asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) self.assertIn("Hash", str(context.exception)) @@ -276,7 +276,7 @@ def test_sync_model_wrong_parameters(self): ) # Assert we fail due to not meeting the competition parameters. - with self.assertRaises(ValueError) as context: + with self.assertRaises(MinerMisconfiguredError) as context: asyncio.run(self.model_updater.sync_model(hotkey, curr_block=100_000)) self.assertIn("does not satisfy parameters", str(context.exception)) From 3c6a5078fa85126df3e26f9908f33bc10b3528c0 Mon Sep 17 00:00:00 2001 From: Sid Date: Sat, 6 Jul 2024 12:34:59 -0700 Subject: [PATCH 7/9] Remove flash-attn. --- constants/__init__.py | 1 - docs/miner.md | 7 ------- docs/validator.md | 10 ++-------- neurons/config.py | 5 ----- neurons/miner.py | 1 - requirements.txt | 1 - tests/competitions/test_utils.py | 3 --- 7 files changed, 2 insertions(+), 26 deletions(-) diff --git a/constants/__init__.py b/constants/__init__.py index b8d194f..e0e456c 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -73,7 +73,6 @@ tokenizer="Xenova/gpt-4", kwargs={ "torch_dtype": torch.bfloat16, - "attn_implementation": "flash_attention_2", }, eval_block_delay=1200, # ~4 hours. ), diff --git a/docs/miner.md b/docs/miner.md index 6cb82d0..261cafb 100644 --- a/docs/miner.md +++ b/docs/miner.md @@ -39,13 +39,6 @@ cd finetuning python -m pip install -e . 
``` -Note: flash-attn may not have their dependencies set up correctly. If you run into issues try installing those requirements separately first: -```shell -pip install packaging -pip install wheel -pip install torch -``` - 6. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 7. (Optional) Run a Subtensor instance: diff --git a/docs/validator.md b/docs/validator.md index a03ce2e..6742fc0 100644 --- a/docs/validator.md +++ b/docs/validator.md @@ -61,9 +61,9 @@ It is important to note that this affects the game theoretics of the incentive l # System Requirements -Validators will need enough disk space to store the model of every miner in the subnet. Each model (As of Jun 15th, 2024) is limited to 15 GB and 7B parameters, and the validator has cleanup logic to remove old models. It is recommended to have at least 3 TB of disk space. +Validators will need enough disk space to store the model of every miner in the subnet. Each model (As of Jul 15th, 2024) is limited to 15 GB and 7B parameters, and the validator has cleanup logic to remove old models. It is recommended to have at least 3 TB of disk space. -Validators will need enough processing power to evaluate their model. As of Jun 15th, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with atleast 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations. +Validators will need enough processing power to evaluate their model. As of Jul 15th, 2024 it is required to have a GPU with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations. # Getting Started @@ -88,12 +88,6 @@ cd finetuning python -m pip install -e . ``` -Note: flash-attn may not have their dependencies set up correctly.
If you run into issues try installing those requirements separately first: -```shell -pip install packaging -pip install wheel -pip install torch -``` 5. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). diff --git a/neurons/config.py b/neurons/config.py index c0dd262..9a51b2d 100644 --- a/neurons/config.py +++ b/neurons/config.py @@ -182,11 +182,6 @@ def miner_config(): default=4096, help="Number of samples trained on per epoch", ) - parser.add_argument( - "--attn_implementation", - default="flash_attention_2", - help="Implementation of attention to use", - ) parser.add_argument( "--netuid", type=str, diff --git a/neurons/miner.py b/neurons/miner.py index e77d8a9..6443b0f 100644 --- a/neurons/miner.py +++ b/neurons/miner.py @@ -131,7 +131,6 @@ async def main(config: bt.config): kwargs["torch_dtype"] = ( torch.bfloat16 if config.dtype == "bfloat16" else torch.float16 ) - kwargs["attn_implementation"] = config.attn_implementation # Init model. 
tokenizer = ft.model.load_tokenizer(model_constraints, cache_dir=config.model_dir) diff --git a/requirements.txt b/requirements.txt index 671adf8..7fbbbf8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ bittensor==6.9.3 -flash-attn huggingface_hub numpy==1.26.4 python-dotenv diff --git a/tests/competitions/test_utils.py b/tests/competitions/test_utils.py index c61c2d3..143a7c2 100644 --- a/tests/competitions/test_utils.py +++ b/tests/competitions/test_utils.py @@ -39,7 +39,6 @@ def test_get_model_constraints_valid_competition(self): eval_block_delay=1200, kwargs={ "torch_dtype": torch.bfloat16, - "attn_implementation": "flash_attention_2", }, ) @@ -69,7 +68,6 @@ def test_get_competition_for_block_valid_competition(self): eval_block_delay=1200, kwargs={ "torch_dtype": torch.bfloat16, - "attn_implementation": "flash_attention_2", }, ), reward_percentage=1.0, @@ -106,7 +104,6 @@ def test_get_competition_schedule_for_block_valid_block(self): eval_block_delay=1200, kwargs={ "torch_dtype": torch.bfloat16, - "attn_implementation": "flash_attention_2", }, ), reward_percentage=1.0, From a3c9c5537126e07b866c44ed905e43e26b0867bf Mon Sep 17 00:00:00 2001 From: Sid Date: Wed, 17 Jul 2024 20:05:57 -0700 Subject: [PATCH 8/9] Bump python required version for numpy dependency. --- docs/miner.md | 2 ++ docs/validator.md | 1 + setup.py | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/miner.md b/docs/miner.md index 261cafb..915a849 100644 --- a/docs/miner.md +++ b/docs/miner.md @@ -39,6 +39,8 @@ cd finetuning python -m pip install -e . ``` +Note: We require a python version of at least 3.9. + 6. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 7. 
(Optional) Run a Subtensor instance: diff --git a/docs/validator.md b/docs/validator.md index 6742fc0..8d213e4 100644 --- a/docs/validator.md +++ b/docs/validator.md @@ -88,6 +88,7 @@ cd finetuning python -m pip install -e . ``` +Note: We require a python version of at least 3.9. 5. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). diff --git a/setup.py b/setup.py index 4de1ed6..b2cdd2b 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,7 @@ def read_requirements(path): include_package_data=True, author_email="", license="MIT", - python_requires=">=3.8", + python_requires=">=3.9", install_requires=requirements, classifiers=[ "Development Status :: 3 - Alpha", @@ -81,7 +81,6 @@ def read_requirements(path): # Pick your license as you wish "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", From c4137fcec16817a8d54f97668786edbab02a7dea Mon Sep 17 00:00:00 2001 From: Sid Date: Wed, 17 Jul 2024 20:20:23 -0700 Subject: [PATCH 9/9] Bump version for 1.0.2 release. --- constants/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/constants/__init__.py b/constants/__init__.py index e0e456c..999d90b 100644 --- a/constants/__init__.py +++ b/constants/__init__.py @@ -20,7 +20,7 @@ # Project Constants. # --------------------------------- -__version__ = "1.0.1" +__version__ = "1.0.2" version_split = __version__.split(".") __spec_version__ = ( (1000 * int(version_split[0]))