Merge pull request #56 from macrocosm-os/dev
Release 1.0.2.
Sid-Data-Universe authored Jul 18, 2024
2 parents b05af4f + 30f7266 commit 2273945
Showing 13 changed files with 132 additions and 76 deletions.
3 changes: 3 additions & 0 deletions competitions/data.py
@@ -35,6 +35,9 @@ class ModelConstraints:
# The Pretrained tokenizer to use.
tokenizer: str

# Block delay before evaluating uploaded models. Based on look-back period for eval data collection.
eval_block_delay: int

# Any additional arguments to pass to from_pretrained
kwargs: Any = field(default_factory=dict)

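The new `eval_block_delay` field is declared above `kwargs`, which matters because dataclass fields without defaults must come before fields that have them. A minimal sketch of that ordering rule, using a stand-in class rather than the repo's `ModelConstraints`:

```python
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class ExampleConstraints:
    # Fields without defaults must precede fields with defaults, which is why
    # eval_block_delay is inserted above kwargs in this hunk.
    tokenizer: str
    eval_block_delay: int
    kwargs: Dict[str, Any] = field(default_factory=dict)


# Existing call sites must now pass the delay explicitly (see the updated tests below).
constraints = ExampleConstraints(tokenizer="Xenova/gpt-4", eval_block_delay=1200)
print(constraints.eval_block_delay)  # 1200
```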
6 changes: 3 additions & 3 deletions constants/__init__.py
@@ -20,7 +20,7 @@
# Project Constants.
# ---------------------------------

__version__ = "1.0.1"
__version__ = "1.0.2"
version_split = __version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
@@ -43,7 +43,7 @@
CORTEX_WANDB_PROJECT = "cortex-t/multi-modality"
CORTEX_WANDB_TYPE = "validator"
CORTEX_MAX_UIDS = 256
CORTEX_MAX_AGE = dt.timedelta(days=1)
CORTEX_MAX_AGE = dt.timedelta(hours=4)
CORTEX_MIN_SCORE = 0.85
# Minimum stake to get data from a cortex validator.
CORTEX_MIN_STAKE = 100_000
@@ -73,8 +73,8 @@
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=1200, # ~4 hours.
),
}

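The `# ~4 hours` comment on `eval_block_delay=1200` follows from Bittensor's roughly 12-second block time, which also lines up with the new `CORTEX_MAX_AGE` of 4 hours above. A quick sketch of the conversion (the 12-second figure is an assumption implied by that comment, not something defined in this diff):

```python
import datetime as dt

SECONDS_PER_BLOCK = 12  # approximate block time implied by "1200 blocks ~ 4 hours"


def blocks_to_timedelta(blocks: int) -> dt.timedelta:
    """Convert a block count into an approximate wall-clock duration."""
    return dt.timedelta(seconds=blocks * SECONDS_PER_BLOCK)


print(blocks_to_timedelta(1200))  # 4:00:00, matching the new CORTEX_MAX_AGE
```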
7 changes: 1 addition & 6 deletions docs/miner.md
@@ -39,12 +39,7 @@ cd finetuning
python -m pip install -e .
```

Note: flash-attn may not have their dependencies set up correctly. If you run into issues try installing those requirements separately first:
```shell
pip install packaging
pip install wheel
pip install torch
```
Note: We require a python version of at least 3.9.

6. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate).

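The flash-attn workaround in both docs is replaced by a plain Python-version requirement, matching the `python_requires` bump to `>=3.9` in setup.py later in this commit. A small illustrative sketch (not part of the repo) for verifying the interpreter before running `python -m pip install -e .`:

```python
import sys

# Abort early if the interpreter is older than the documented minimum (3.9).
if sys.version_info < (3, 9):
    raise SystemExit(f"Python 3.9+ required, found {sys.version.split()[0]}")
print("Python version OK")
```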
11 changes: 3 additions & 8 deletions docs/validator.md
@@ -61,9 +61,9 @@ It is important to note that this affects the game theoretics of the incentive l

# System Requirements

Validators will need enough disk space to store the model of every miner in the subnet. Each model (As of Jun 15th, 2024) is limited to 15 GB and 7B parameters, and the validator has cleanup logic to remove old models. It is recommended to have at least 3 TB of disk space.
Validators will need enough disk space to store the model of every miner in the subnet. Each model (As of Jul 15th, 2024) is limited to 15 GB and 7B parameters, and the validator has cleanup logic to remove old models. It is recommended to have at least 3 TB of disk space.

Validators will need enough processing power to evaluate their model. As of Jun 15th, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.
Validators will need enough processing power to evaluate their model. As of Jul 15th, 2024 it is required to have a GPU with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.

# Getting Started

@@ -88,12 +88,7 @@ cd finetuning
python -m pip install -e .
```

Note: flash-attn may not have their dependencies set up correctly. If you run into issues try installing those requirements separately first:
```shell
pip install packaging
pip install wheel
pip install torch
```
Note: We require a python version of at least 3.9.

5. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate).

49 changes: 34 additions & 15 deletions model/model_updater.py
@@ -13,6 +13,14 @@
from model.utils import get_hash_of_two_strings


class MinerMisconfiguredError(Exception):
"""Error raised when a miner is misconfigured."""

def __init__(self, hotkey: str, message: str):
self.hotkey = hotkey
super().__init__(f"[{hotkey}] {message}")


class ModelUpdater:
"""Checks if the currently tracked model for a hotkey matches what the miner committed to the chain."""

@@ -58,31 +66,43 @@ async def _get_metadata(self, hotkey: str) -> Optional[ModelMetadata]:
"""Get metadata about a model by hotkey"""
return await self.metadata_store.retrieve_model_metadata(hotkey)

async def sync_model(self, hotkey: str, force: bool = False) -> bool:
async def sync_model(
self, hotkey: str, curr_block: int, force: bool = False
) -> bool:
"""Updates local model for a hotkey if out of sync and returns if it was updated."
Args:
hotkey (str): The hotkey of the model to sync.
curr_block (int): The current block.
force (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed.
"""
# Get the metadata for the miner.
metadata = await self._get_metadata(hotkey)

if not metadata:
bt.logging.trace(
f"No valid metadata found on the chain for hotkey {hotkey}"
)
raise ValueError(
f"No valid metadata found on the chain for hotkey {hotkey}"
raise MinerMisconfiguredError(
hotkey, f"No valid metadata found on the chain"
)

# Check that the metadata indicates a competition available at time of upload.
competition = competition_utils.get_competition_for_block(
metadata.id.competition_id, metadata.block
)
if not competition:
bt.logging.trace(f"No competition found for {metadata.id.competition_id} at block {metadata.block}")
raise ValueError(f"No competition found for {metadata.id.competition_id} at block {metadata.block}")
raise MinerMisconfiguredError(
hotkey,
f"No competition found for {metadata.id.competition_id} at block {metadata.block}",
)

# Check that the metadata is old enough to meet the eval_block_delay for the competition.
# If not we return false and will check again next time we go through the update loop.
if curr_block - metadata.block < competition.constraints.eval_block_delay:
bt.logging.debug(
f"""Sync for hotkey {hotkey} delayed as the current block: {curr_block} is not at least
{competition.constraints.eval_block_delay} blocks after the upload block: {metadata.block}.
Will automatically retry later."""
)
return False

# Check what model id the model tracker currently has for this hotkey.
tracker_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
@@ -106,16 +126,15 @@ async def sync_model(self, hotkey: str, force: bool = False) -> bool:
# Check that the hash of the downloaded content matches.
secure_hash = get_hash_of_two_strings(model.id.hash, hotkey)
if secure_hash != metadata.id.secure_hash:
bt.logging.trace(
f"Sync for hotkey {hotkey} failed. Hashes do not match of content: {secure_hash} != {metadata.id.secure_hash}."
)
raise ValueError(
f"Sync for hotkey {hotkey} failed. Hash of content downloaded from hugging face does not match chain metadata. {metadata}"
raise MinerMisconfiguredError(
hotkey,
f"Hash of content downloaded from hugging face does not match chain metadata. {metadata}",
)

if not ModelUpdater.verify_model_satisfies_parameters(model):
raise ValueError(
f"Sync for hotkey {hotkey} failed, model does not satisfy parameters for competition {competition.id}"
raise MinerMisconfiguredError(
hotkey,
f"Model does not satisfy parameters for competition {competition.id}",
)

return True
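Taken together, the updated `sync_model` has a three-way contract: it returns `False` while an upload is still inside the competition's `eval_block_delay`, returns `True` once the model is downloaded and validated, and raises `MinerMisconfiguredError` for per-miner problems (missing metadata, unknown competition, hash mismatch, constraint violations). A self-contained sketch of how a caller can branch on that contract; `fake_sync_model` is a stub for illustration, not the real updater:

```python
import asyncio


class MinerMisconfiguredError(Exception):
    """Mirror of the new error type: tags the message with the offending hotkey."""

    def __init__(self, hotkey: str, message: str):
        self.hotkey = hotkey
        super().__init__(f"[{hotkey}] {message}")


async def fake_sync_model(hotkey: str, curr_block: int, force: bool = False) -> bool:
    """Stub standing in for ModelUpdater.sync_model, showing its three outcomes."""
    upload_block, eval_block_delay = 5_000, 1_200  # made-up numbers for the example
    if hotkey == "bad-hotkey":
        raise MinerMisconfiguredError(hotkey, "No valid metadata found on the chain")
    if curr_block - upload_block < eval_block_delay:
        return False  # too soon: respect the eval block delay and retry later
    return True  # synced and validated: ready to schedule for evaluation


async def main() -> None:
    for hotkey, block in [("bad-hotkey", 7_000), ("fresh-upload", 5_500), ("ready", 7_000)]:
        try:
            updated = await fake_sync_model(hotkey, block)
        except MinerMisconfiguredError as err:
            print(f"trace: {err}")  # validator.py below logs these at trace level
            continue
        print(f"{hotkey}: schedule for eval = {updated}")


asyncio.run(main())
```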
5 changes: 0 additions & 5 deletions neurons/config.py
@@ -182,11 +182,6 @@ def miner_config():
default=4096,
help="Number of samples trained on per epoch",
)
parser.add_argument(
"--attn_implementation",
default="flash_attention_2",
help="Implementation of attention to use",
)
parser.add_argument(
"--netuid",
type=str,
1 change: 0 additions & 1 deletion neurons/miner.py
@@ -131,7 +131,6 @@ async def main(config: bt.config):
kwargs["torch_dtype"] = (
torch.bfloat16 if config.dtype == "bfloat16" else torch.float16
)
kwargs["attn_implementation"] = config.attn_implementation

# Init model.
tokenizer = ft.model.load_tokenizer(model_constraints, cache_dir=config.model_dir)
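With the `--attn_implementation` flag and the forced kwarg removed (and `flash-attn` dropped from requirements.txt below), the miner defaults to the standard attention path. Miners who still want flash-attention can opt in themselves via the `attn_implementation` argument to `transformers`' `from_pretrained`, the same kwarg the deleted code set. A hedged sketch, assuming flash-attn is installed and the GPU supports it; the model id is a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM

# Optional local opt-in to flash-attention 2, now that the miner no longer forces it.
model = AutoModelForCausalLM.from_pretrained(
    "my-org/my-finetuned-model",  # placeholder, not a real checkpoint
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # requires the flash-attn package
)
```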
57 changes: 33 additions & 24 deletions neurons/validator.py
@@ -46,7 +46,7 @@
from competitions.data import CompetitionId
from competitions import utils as competition_utils
from model.model_tracker import ModelTracker
from model.model_updater import ModelUpdater
from model.model_updater import MinerMisconfiguredError, ModelUpdater
from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore
from model.storage.disk.disk_model_store import DiskModelStore
from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore
@@ -355,6 +355,8 @@ def update_models(self):
> constants.chain_update_cadence
):
last_checked_top_models_time = dt.datetime.now()
# Take a deep copy of the metagraph for use in the top uid retry check.
# The regular loop below will use self.metagraph which may be updated as we go.
with self.metagraph_lock:
metagraph = copy.deepcopy(self.metagraph)

@@ -390,31 +392,35 @@
try:
uid_last_retried_evaluation[uid] = dt.datetime.now()

# Redownload this model and schedule it for eval even if it isn't updated by the sync.
# Redownload this model and schedule it for eval even if it hasn't changed.
# Still respect the eval block delay so that previously top uids can't bypass it.
hotkey = metagraph.hotkeys[uid]
asyncio.run(
self.model_updater.sync_model(hotkey, force=True)
should_retry = self.model_updater.sync_model(
hotkey,
metagraph.block.item(),
force=True,
)

# Since this is a top model (as determined by other valis),
# we don't worry if self.pending_uids is already "full".
# Validators should only have ~1 winner per competition and we only check bigger valis
# so there should not be many simultaneous top models not already being evaluated.
top_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
hotkey
)
if top_model_metadata is not None:
bt.logging.trace(
f"Shortcutting to top model or retrying evaluation for previously discarded top model with incentive for UID={uid}"
)
with self.pending_uids_to_eval_lock:
self.pending_uids_to_eval[
top_model_metadata.id.competition_id
].add(uid)
else:
bt.logging.warning(
f"Failed to find metadata for uid {uid} with hotkey {hotkey}"
if should_retry:
# Since this is a top model (as determined by other valis),
# we don't worry if self.pending_uids is already "full".
# Validators should only have ~1 winner per competition and we only check bigger valis
# so there should not be many simultaneous top models not already being evaluated.
top_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
hotkey
)
if top_model_metadata is not None:
bt.logging.trace(
f"Shortcutting to top model or retrying evaluation for previously discarded top model with incentive for UID={uid}"
)
with self.pending_uids_to_eval_lock:
self.pending_uids_to_eval[
top_model_metadata.id.competition_id
].add(uid)
else:
bt.logging.warning(
f"Failed to find metadata for uid {uid} with hotkey {hotkey}"
)
except Exception:
bt.logging.debug(
f"Failure in update loop for UID={uid} during top model check. {traceback.format_exc()}"
@@ -468,10 +474,11 @@ def update_models(self):
# Get their hotkey from the metagraph.
with self.metagraph_lock:
hotkey = self.metagraph.hotkeys[next_uid]
curr_block = self.metagraph.block.item()

# Compare metadata and tracker, syncing new model from remote store to local if necessary.
updated = asyncio.run(
self.model_updater.sync_model(hotkey, force=False)
self.model_updater.sync_model(hotkey, curr_block, force=False)
)

if updated:
Expand All @@ -488,9 +495,11 @@ def update_models(self):
)
else:
bt.logging.warning(
f"Failed to find metadata for uid {uid} with hotkey {hotkey}"
f"Failed to find metadata for uid {next_uid} with hotkey {hotkey}"
)

except MinerMisconfiguredError as e:
bt.logging.trace(e)
except Exception as e:
bt.logging.error(f"Error in update loop: {e}")

1 change: 0 additions & 1 deletion requirements.txt
@@ -1,5 +1,4 @@
bittensor==6.9.3
flash-attn
huggingface_hub
numpy==1.26.4
python-dotenv
3 changes: 1 addition & 2 deletions setup.py
@@ -72,7 +72,7 @@ def read_requirements(path):
include_package_data=True,
author_email="",
license="MIT",
python_requires=">=3.8",
python_requires=">=3.9",
install_requires=requirements,
classifiers=[
"Development Status :: 3 - Alpha",
@@ -81,7 +81,6 @@ def read_requirements(path):
# Pick your license as you wish
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering",
2 changes: 2 additions & 0 deletions tests/competitions/test_competition_tracker.py
@@ -16,6 +16,7 @@ class TestCompetitionTracker(unittest.TestCase):
sequence_length=4096,
allowed_architectures=[LlamaForCausalLM],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={},
),
reward_percentage=0.6,
@@ -27,6 +28,7 @@ class TestCompetitionTracker(unittest.TestCase):
sequence_length=2048,
allowed_architectures=[LlamaForCausalLM],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={},
),
reward_percentage=0.4,
6 changes: 3 additions & 3 deletions tests/competitions/test_utils.py
@@ -36,9 +36,9 @@ def test_get_model_constraints_valid_competition(self):
GemmaForCausalLM,
],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
)

@@ -65,9 +65,9 @@ def test_get_competition_for_block_valid_competition(self):
GemmaForCausalLM,
],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
),
reward_percentage=1.0,
@@ -101,9 +101,9 @@ def test_get_competition_schedule_for_block_valid_block(self):
GemmaForCausalLM,
],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
),
reward_percentage=1.0,