Merge pull request #56 from macrocosm-os/dev
Release 1.0.2.
Sid-Data-Universe authored Jul 18, 2024
2 parents b05af4f + 30f7266 commit 2273945
Showing 13 changed files with 132 additions and 76 deletions.
3 changes: 3 additions & 0 deletions competitions/data.py
@@ -35,6 +35,9 @@ class ModelConstraints:
# The Pretrained tokenizer to use.
tokenizer: str

# Block delay before evaluating uploaded models. Based on look-back period for eval data collection.
eval_block_delay: int

# Any additional arguments to pass to from_pretrained
kwargs: Any = field(default_factory=dict)

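The new `eval_block_delay` field is declared above `kwargs`, which matters because dataclass fields without defaults must come before fields that have them. A minimal sketch of that ordering rule, using a stand-in class rather than the repo's `ModelConstraints`:

```python
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class ExampleConstraints:
    # Fields without defaults must precede fields with defaults, which is why
    # eval_block_delay is inserted above kwargs in this hunk.
    tokenizer: str
    eval_block_delay: int
    kwargs: Dict[str, Any] = field(default_factory=dict)


# Existing call sites must now pass the delay explicitly (see the updated tests below).
constraints = ExampleConstraints(tokenizer="Xenova/gpt-4", eval_block_delay=1200)
print(constraints.eval_block_delay)  # 1200
```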
6 changes: 3 additions & 3 deletions constants/__init__.py
@@ -20,7 +20,7 @@
# Project Constants.
# ---------------------------------

__version__ = "1.0.1"
__version__ = "1.0.2"
version_split = __version__.split(".")
__spec_version__ = (
(1000 * int(version_split[0]))
@@ -43,7 +43,7 @@
CORTEX_WANDB_PROJECT = "cortex-t/multi-modality"
CORTEX_WANDB_TYPE = "validator"
CORTEX_MAX_UIDS = 256
CORTEX_MAX_AGE = dt.timedelta(days=1)
CORTEX_MAX_AGE = dt.timedelta(hours=4)
CORTEX_MIN_SCORE = 0.85
# Minimum stake to get data from a cortex validator.
CORTEX_MIN_STAKE = 100_000
@@ -73,8 +73,8 @@
tokenizer="Xenova/gpt-4",
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
eval_block_delay=1200, # ~4 hours.
),
}

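The `# ~4 hours` comment on `eval_block_delay=1200` follows from Bittensor's roughly 12-second block time, which also lines up with the new `CORTEX_MAX_AGE` of 4 hours above. A quick sketch of the conversion (the 12-second figure is an assumption implied by that comment, not something defined in this diff):

```python
import datetime as dt

SECONDS_PER_BLOCK = 12  # approximate block time implied by "1200 blocks ~ 4 hours"


def blocks_to_timedelta(blocks: int) -> dt.timedelta:
    """Convert a block count into an approximate wall-clock duration."""
    return dt.timedelta(seconds=blocks * SECONDS_PER_BLOCK)


print(blocks_to_timedelta(1200))  # 4:00:00, matching the new CORTEX_MAX_AGE
```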
7 changes: 1 addition & 6 deletions docs/miner.md
@@ -39,12 +39,7 @@ cd finetuning
python -m pip install -e .
```

Note: flash-attn may not have their dependencies set up correctly. If you run into issues try installing those requirements separately first:
```shell
pip install packaging
pip install wheel
pip install torch
```
Note: We require a python version of at least 3.9.

6. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate).

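The flash-attn workaround in both docs is replaced by a plain Python-version requirement, matching the `python_requires` bump to `>=3.9` in setup.py later in this commit. A small illustrative sketch (not part of the repo) for verifying the interpreter before running `python -m pip install -e .`:

```python
import sys

# Abort early if the interpreter is older than the documented minimum (3.9).
if sys.version_info < (3, 9):
    raise SystemExit(f"Python 3.9+ required, found {sys.version.split()[0]}")
print("Python version OK")
```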
11 changes: 3 additions & 8 deletions docs/validator.md
@@ -61,9 +61,9 @@ It is important to note that this affects the game theoretics of the incentive l

# System Requirements

Validators will need enough disk space to store the model of every miner in the subnet. Each model (As of Jun 15th, 2024) is limited to 15 GB and 7B parameters, and the validator has cleanup logic to remove old models. It is recommended to have at least 3 TB of disk space.
Validators will need enough disk space to store the model of every miner in the subnet. Each model (As of Jul 15th, 2024) is limited to 15 GB and 7B parameters, and the validator has cleanup logic to remove old models. It is recommended to have at least 3 TB of disk space.

Validators will need enough processing power to evaluate their model. As of Jun 15th, 2024 it is required to have a GPU that supports [flash attention 2](https://github.com/Dao-AILab/flash-attention) with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.
Validators will need enough processing power to evaluate their model. As of Jul 15th, 2024 it is required to have a GPU with at least 48 GB of VRAM and at least 38 TFLOPs for half precision (bfloat 16) operations.

# Getting Started

@@ -88,12 +88,7 @@ cd finetuning
python -m pip install -e .
```

Note: flash-attn may not have their dependencies set up correctly. If you run into issues try installing those requirements separately first:
```shell
pip install packaging
pip install wheel
pip install torch
```
Note: We require a python version of at least 3.9.

5. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate).

49 changes: 34 additions & 15 deletions model/model_updater.py
@@ -13,6 +13,14 @@
from model.utils import get_hash_of_two_strings


class MinerMisconfiguredError(Exception):
"""Error raised when a miner is misconfigured."""

def __init__(self, hotkey: str, message: str):
self.hotkey = hotkey
super().__init__(f"[{hotkey}] {message}")


class ModelUpdater:
"""Checks if the currently tracked model for a hotkey matches what the miner committed to the chain."""

@@ -58,31 +66,43 @@ async def _get_metadata(self, hotkey: str) -> Optional[ModelMetadata]:
"""Get metadata about a model by hotkey"""
return await self.metadata_store.retrieve_model_metadata(hotkey)

async def sync_model(self, hotkey: str, force: bool = False) -> bool:
async def sync_model(
self, hotkey: str, curr_block: int, force: bool = False
) -> bool:
"""Updates local model for a hotkey if out of sync and returns if it was updated."
Args:
hotkey (str): The hotkey of the model to sync.
curr_block (int): The current block.
force (bool): Whether to force a sync for this model, even if it's chain metadata hasn't changed.
"""
# Get the metadata for the miner.
metadata = await self._get_metadata(hotkey)

if not metadata:
bt.logging.trace(
f"No valid metadata found on the chain for hotkey {hotkey}"
)
raise ValueError(
f"No valid metadata found on the chain for hotkey {hotkey}"
raise MinerMisconfiguredError(
hotkey, f"No valid metadata found on the chain"
)

# Check that the metadata indicates a competition available at time of upload.
competition = competition_utils.get_competition_for_block(
metadata.id.competition_id, metadata.block
)
if not competition:
bt.logging.trace(f"No competition found for {metadata.id.competition_id} at block {metadata.block}")
raise ValueError(f"No competition found for {metadata.id.competition_id} at block {metadata.block}")
raise MinerMisconfiguredError(
hotkey,
f"No competition found for {metadata.id.competition_id} at block {metadata.block}",
)

# Check that the metadata is old enough to meet the eval_block_delay for the competition.
# If not we return false and will check again next time we go through the update loop.
if curr_block - metadata.block < competition.constraints.eval_block_delay:
bt.logging.debug(
f"""Sync for hotkey {hotkey} delayed as the current block: {curr_block} is not at least
{competition.constraints.eval_block_delay} blocks after the upload block: {metadata.block}.
Will automatically retry later."""
)
return False

# Check what model id the model tracker currently has for this hotkey.
tracker_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
@@ -106,16 +126,15 @@ async def sync_model(self, hotkey: str, force: bool = False) -> bool:
# Check that the hash of the downloaded content matches.
secure_hash = get_hash_of_two_strings(model.id.hash, hotkey)
if secure_hash != metadata.id.secure_hash:
bt.logging.trace(
f"Sync for hotkey {hotkey} failed. Hashes do not match of content: {secure_hash} != {metadata.id.secure_hash}."
)
raise ValueError(
f"Sync for hotkey {hotkey} failed. Hash of content downloaded from hugging face does not match chain metadata. {metadata}"
raise MinerMisconfiguredError(
hotkey,
f"Hash of content downloaded from hugging face does not match chain metadata. {metadata}",
)

if not ModelUpdater.verify_model_satisfies_parameters(model):
raise ValueError(
f"Sync for hotkey {hotkey} failed, model does not satisfy parameters for competition {competition.id}"
raise MinerMisconfiguredError(
hotkey,
f"Model does not satisfy parameters for competition {competition.id}",
)

return True
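Taken together, the updated `sync_model` has a three-way contract: it returns `False` while an upload is still inside the competition's `eval_block_delay`, returns `True` once the model is downloaded and validated, and raises `MinerMisconfiguredError` for per-miner problems (missing metadata, unknown competition, hash mismatch, constraint violations). A self-contained sketch of how a caller can branch on that contract; `fake_sync_model` is a stub for illustration, not the real updater:

```python
import asyncio


class MinerMisconfiguredError(Exception):
    """Mirror of the new error type: tags the message with the offending hotkey."""

    def __init__(self, hotkey: str, message: str):
        self.hotkey = hotkey
        super().__init__(f"[{hotkey}] {message}")


async def fake_sync_model(hotkey: str, curr_block: int, force: bool = False) -> bool:
    """Stub standing in for ModelUpdater.sync_model, showing its three outcomes."""
    upload_block, eval_block_delay = 5_000, 1_200  # made-up numbers for the example
    if hotkey == "bad-hotkey":
        raise MinerMisconfiguredError(hotkey, "No valid metadata found on the chain")
    if curr_block - upload_block < eval_block_delay:
        return False  # too soon: respect the eval block delay and retry later
    return True  # synced and validated: ready to schedule for evaluation


async def main() -> None:
    for hotkey, block in [("bad-hotkey", 7_000), ("fresh-upload", 5_500), ("ready", 7_000)]:
        try:
            updated = await fake_sync_model(hotkey, block)
        except MinerMisconfiguredError as err:
            print(f"trace: {err}")  # validator.py below logs these at trace level
            continue
        print(f"{hotkey}: schedule for eval = {updated}")


asyncio.run(main())
```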
5 changes: 0 additions & 5 deletions neurons/config.py
@@ -182,11 +182,6 @@ def miner_config():
default=4096,
help="Number of samples trained on per epoch",
)
parser.add_argument(
"--attn_implementation",
default="flash_attention_2",
help="Implementation of attention to use",
)
parser.add_argument(
"--netuid",
type=str,
1 change: 0 additions & 1 deletion neurons/miner.py
@@ -131,7 +131,6 @@ async def main(config: bt.config):
kwargs["torch_dtype"] = (
torch.bfloat16 if config.dtype == "bfloat16" else torch.float16
)
kwargs["attn_implementation"] = config.attn_implementation

# Init model.
tokenizer = ft.model.load_tokenizer(model_constraints, cache_dir=config.model_dir)
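With the `--attn_implementation` flag and the forced kwarg removed (and `flash-attn` dropped from requirements.txt below), the miner defaults to the standard attention path. Miners who still want flash-attention can opt in themselves via the `attn_implementation` argument to `transformers`' `from_pretrained`, the same kwarg the deleted code set. A hedged sketch, assuming flash-attn is installed and the GPU supports it; the model id is a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM

# Optional local opt-in to flash-attention 2, now that the miner no longer forces it.
model = AutoModelForCausalLM.from_pretrained(
    "my-org/my-finetuned-model",  # placeholder, not a real checkpoint
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # requires the flash-attn package
)
```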
57 changes: 33 additions & 24 deletions neurons/validator.py
@@ -46,7 +46,7 @@
from competitions.data import CompetitionId
from competitions import utils as competition_utils
from model.model_tracker import ModelTracker
from model.model_updater import ModelUpdater
from model.model_updater import MinerMisconfiguredError, ModelUpdater
from model.storage.chain.chain_model_metadata_store import ChainModelMetadataStore
from model.storage.disk.disk_model_store import DiskModelStore
from model.storage.hugging_face.hugging_face_model_store import HuggingFaceModelStore
@@ -355,6 +355,8 @@ def update_models(self):
> constants.chain_update_cadence
):
last_checked_top_models_time = dt.datetime.now()
# Take a deep copy of the metagraph for use in the top uid retry check.
# The regular loop below will use self.metagraph which may be updated as we go.
with self.metagraph_lock:
metagraph = copy.deepcopy(self.metagraph)

@@ -390,31 +392,35 @@
try:
uid_last_retried_evaluation[uid] = dt.datetime.now()

# Redownload this model and schedule it for eval even if it isn't updated by the sync.
# Redownload this model and schedule it for eval even if it hasn't changed.
# Still respect the eval block delay so that previously top uids can't bypass it.
hotkey = metagraph.hotkeys[uid]
asyncio.run(
self.model_updater.sync_model(hotkey, force=True)
should_retry = self.model_updater.sync_model(
hotkey,
metagraph.block.item(),
force=True,
)

# Since this is a top model (as determined by other valis),
# we don't worry if self.pending_uids is already "full".
# Validators should only have ~1 winner per competition and we only check bigger valis
# so there should not be many simultaneous top models not already being evaluated.
top_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
hotkey
)
if top_model_metadata is not None:
bt.logging.trace(
f"Shortcutting to top model or retrying evaluation for previously discarded top model with incentive for UID={uid}"
)
with self.pending_uids_to_eval_lock:
self.pending_uids_to_eval[
top_model_metadata.id.competition_id
].add(uid)
else:
bt.logging.warning(
f"Failed to find metadata for uid {uid} with hotkey {hotkey}"
if should_retry:
# Since this is a top model (as determined by other valis),
# we don't worry if self.pending_uids is already "full".
# Validators should only have ~1 winner per competition and we only check bigger valis
# so there should not be many simultaneous top models not already being evaluated.
top_model_metadata = self.model_tracker.get_model_metadata_for_miner_hotkey(
hotkey
)
if top_model_metadata is not None:
bt.logging.trace(
f"Shortcutting to top model or retrying evaluation for previously discarded top model with incentive for UID={uid}"
)
with self.pending_uids_to_eval_lock:
self.pending_uids_to_eval[
top_model_metadata.id.competition_id
].add(uid)
else:
bt.logging.warning(
f"Failed to find metadata for uid {uid} with hotkey {hotkey}"
)
except Exception:
bt.logging.debug(
f"Failure in update loop for UID={uid} during top model check. {traceback.format_exc()}"
@@ -468,10 +474,11 @@ def update_models(self):
# Get their hotkey from the metagraph.
with self.metagraph_lock:
hotkey = self.metagraph.hotkeys[next_uid]
curr_block = self.metagraph.block.item()

# Compare metadata and tracker, syncing new model from remote store to local if necessary.
updated = asyncio.run(
self.model_updater.sync_model(hotkey, force=False)
self.model_updater.sync_model(hotkey, curr_block, force=False)
)

if updated:
Expand All @@ -488,9 +495,11 @@ def update_models(self):
)
else:
bt.logging.warning(
f"Failed to find metadata for uid {uid} with hotkey {hotkey}"
f"Failed to find metadata for uid {next_uid} with hotkey {hotkey}"
)

except MinerMisconfiguredError as e:
bt.logging.trace(e)
except Exception as e:
bt.logging.error(f"Error in update loop: {e}")

1 change: 0 additions & 1 deletion requirements.txt
@@ -1,5 +1,4 @@
bittensor==6.9.3
flash-attn
huggingface_hub
numpy==1.26.4
python-dotenv
3 changes: 1 addition & 2 deletions setup.py
@@ -72,7 +72,7 @@ def read_requirements(path):
include_package_data=True,
author_email="",
license="MIT",
python_requires=">=3.8",
python_requires=">=3.9",
install_requires=requirements,
classifiers=[
"Development Status :: 3 - Alpha",
@@ -81,7 +81,6 @@ def read_requirements(path):
# Pick your license as you wish
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering",
2 changes: 2 additions & 0 deletions tests/competitions/test_competition_tracker.py
@@ -16,6 +16,7 @@ class TestCompetitionTracker(unittest.TestCase):
sequence_length=4096,
allowed_architectures=[LlamaForCausalLM],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={},
),
reward_percentage=0.6,
@@ -27,6 +28,7 @@ class TestCompetitionTracker(unittest.TestCase):
sequence_length=2048,
allowed_architectures=[LlamaForCausalLM],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={},
),
reward_percentage=0.4,
6 changes: 3 additions & 3 deletions tests/competitions/test_utils.py
@@ -36,9 +36,9 @@ def test_get_model_constraints_valid_competition(self):
GemmaForCausalLM,
],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
)

@@ -65,9 +65,9 @@ def test_get_competition_for_block_valid_competition(self):
GemmaForCausalLM,
],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
),
reward_percentage=1.0,
@@ -101,9 +101,9 @@ def test_get_competition_schedule_for_block_valid_block(self):
GemmaForCausalLM,
],
tokenizer="Xenova/gpt-4",
eval_block_delay=1200,
kwargs={
"torch_dtype": torch.bfloat16,
"attn_implementation": "flash_attention_2",
},
),
reward_percentage=1.0,