
Commit 2f1d2a4 (1 parent: aff47e0)

Fixed module loop segments and forward pass for partially distributed models. Added relevant layer and module info for worker job requests.

File tree: 7 files changed (+608, -241 lines)

tensorlink/ml/graphing.py

Lines changed: 148 additions & 1 deletion
@@ -9,6 +9,7 @@
 import hashlib
 import inspect
 import random
+import re
 
 
 class AssignmentError(Exception):
@@ -17,6 +18,118 @@ class AssignmentError(Exception):
     pass
 
 
+def _create_grouped_entry(parent_path: str, group: list) -> dict:
+    """
+    Create a single config entry for a group of consecutive layers.
+    """
+    if len(group) == 1:
+        # Single layer, return as-is
+        _, path, cfg = group[0]
+        return {path: cfg}
+
+    # Multiple layers - create grouped entry
+    layer_indices = [idx for idx, _, _ in group]
+    paths = [path for _, path, _ in group]
+    configs = [cfg for _, _, cfg in group]
+
+    start_idx = min(layer_indices)
+    end_idx = max(layer_indices)
+
+    # Use range notation in the key
+    grouped_path = f"{parent_path}{start_idx}-{end_idx}"
+
+    # Merge configurations
+    total_memory = sum(cfg.get("memory", 0) for cfg in configs)
+    worker = configs[0]["assigned_workers"][0]
+
+    grouped_config = {
+        "type": "offloaded_group",
+        "name": configs[0].get("name", ""),
+        "assigned_workers": [worker],
+        "layer_range": (start_idx, end_idx),
+        "layer_paths": paths,
+        "memory": total_memory,
+        "module": configs[0].get("module", ""),
+        "training": configs[0].get("training", False),
+        "optimizer_type": configs[0].get("optimizer_type", "adam"),
+        "num_layers": len(group),
+    }
+
+    # Preserve parent_forward_code if present
+    if "parent_forward_code" in configs[0]:
+        grouped_config["parent_forward_code"] = configs[0]["parent_forward_code"]
+        grouped_config["parent_module_path"] = configs[0]["parent_module_path"]
+
+    return {grouped_path: grouped_config}
+
+
+def _group_sequential_layers(config: dict) -> dict:
+    """
+    Group consecutive layers assigned to the same worker into single entries.
+
+    For example:
+        model.layers.0 -> worker1
+        model.layers.1 -> worker1
+        model.layers.2 -> worker1
+
+    Becomes:
+        model.layers.0-2 -> worker1
+    """
+    # Group paths by their parent and extract layer patterns
+    layer_groups = defaultdict(list)
+
+    for path, cfg in config.items():
+        if cfg.get("type") != "offloaded":
+            continue
+
+        # Match patterns like "model.layers.0", "model.encoder.layer.5", etc.
+        match = re.match(r'^(.+\.)(\d+)$', path)
+        if match:
+            parent_path = match.group(1)  # e.g., "model.layers."
+            layer_idx = int(match.group(2))
+            layer_groups[parent_path].append((layer_idx, path, cfg))
+
+    # Create new grouped config
+    new_config = {}
+    processed_paths = set()
+
+    for parent_path, layers in layer_groups.items():
+        # Sort by layer index
+        layers.sort(key=lambda x: x[0])
+
+        # Group consecutive layers with same worker
+        current_group = []
+        current_worker = None
+
+        for layer_idx, path, cfg in layers:
+            worker = cfg["assigned_workers"][0] if cfg["assigned_workers"] else None
+
+            if worker == current_worker and current_group:
+                # Extend current group
+                current_group.append((layer_idx, path, cfg))
+            else:
+                # Save previous group if it exists
+                if current_group:
+                    new_config.update(_create_grouped_entry(parent_path, current_group))
+                    processed_paths.update(p for _, p, _ in current_group)
+
+                # Start new group
+                current_group = [(layer_idx, path, cfg)]
+                current_worker = worker
+
+        # Don't forget the last group
+        if current_group:
+            new_config.update(_create_grouped_entry(parent_path, current_group))
+            processed_paths.update(p for _, p, _ in current_group)
+
+    # Add all non-layer modules that weren't grouped
+    for path, cfg in config.items():
+        if path not in processed_paths:
+            new_config[path] = cfg
+
+    return new_config
+
+
 class ModelParser:
     def __init__(self, user_memory: int = 0):
         self.user_memory = user_memory
@@ -76,6 +189,8 @@ def create_distributed_config(
                 optimizer_type=optimizer_type,
             )
 
+            config = _group_sequential_layers(config)
+
         except AssignmentError:
             success = False
 
@@ -101,7 +216,7 @@ def _recurse_module(
         ids = []
 
         memory, breakdown = estimate_memory(
-            module, training, batch_size=1024, optimizer_type=optimizer_type
+            module, training, seq_length=1024, optimizer_type=optimizer_type
         )
 
         assigned_worker = self._try_assign_worker(
@@ -241,3 +356,35 @@ def _extract_forward_code(self, module: nn.Module):
                 f"Could not extract forward code for {module_class.__name__}: {e}"
             )
             return None
+
+
+class ModelSegmentAnalyzer:
+    """
+    Analyzes the forward method of a model to identify three key segments:
+    1. Pre-offload: Model chunk executed on
+    """
+
+    """
+    Example workflow
+
+
+    def forward(self, x):
+        x = self.layer1(x)
+
+        for i in range(len(self.layerlist)):
+            x = self.layerlist[i](x)  # if i > 2, worker 2 is used instead
+
+
+    worker1:
+        x = self.layer1(x)
+        for i in range(3):
+            x = self.layerlist[i](x)
+
+
+    worker2:
+
+        for i in range(3, 5):
+            x = self.layerlist[i](x)
+
+    """

tensorlink/ml/module.py

Lines changed: 9 additions & 12 deletions
@@ -543,7 +543,8 @@ def distribute_model(self, config=None):
             self._load_model_skeleton()
             self._wrap_hf_model(config)
         else:
-            self.wrap_module(config)
+            raise NotImplementedError("Custom models are currently not supported.")
+            # self.wrap_module(config)
 
         if len(config) == 1:
             module, module_name = access_module(self.model, [-1])
@@ -672,16 +673,16 @@ def wrap_module(self, module_id: list, worker_id):
 
     def _wrap_hf_model(self, config: dict):
         # Iterate through each worker and their assigned modules
-        for module_id, worker_modules in config.items():
-            worker_id = next(iter(worker_modules.values())).get("assigned_workers")[0]
+        for module_id, module_info in config.items():
+            worker_id = module_info["assigned_workers"][0]
             file_name = f"{module_id}_{worker_id}.pt"
-            module_info = str(PreTrainedModel)
-            offloaded_module = OffloadedModule(self, module_info, worker_id, module_id)
+            module_name = module_info["module"]
+            offloaded_module = OffloadedModule(self, module_name, worker_id, module_id)
             with open(file_name, "wb") as f:
                 f.close()
 
             # Spawn a worker thread for the offloaded module
-            offloaded_module.spawn_worker(file_name)
+            offloaded_module.spawn_worker(file_name, module_info)
             setattr(self, "model", offloaded_module)
 
     def send_request(self, request_type, args):
@@ -793,10 +794,6 @@ def __init__(
 
         self.entire_model = False
         self.module_name = module_name.split("(")[0]
-        try:
-            self.module_info = module_name.split("(")[1][:-1]
-        except:
-            self.module_info = self.module_name
 
         self.parent_model = parent_model
         self.worker_id = worker_id
@@ -809,7 +806,7 @@ def children(self):
         # Return an empty iterator to hide deeper children
         return iter([])
 
-    def spawn_worker(self, name):
+    def spawn_worker(self, name: str, module_info: dict):
         # # Initialize a threading Timer to monitor the loading process
         # timer = threading.Timer(MAX_WAIT_TIME, self.handle_timeout)
         # timer.start()
@@ -818,7 +815,7 @@ def spawn_worker(self, name):
         # Send the module to the worker roles
 
         self.parent_model.send_request(
-            "send_model", (name, self.worker_id, self.module_id)
+            "send_model", (name, self.worker_id, self.module_id, module_info)
         )
 
         # Wait for the module to be loaded on worker
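
The upshot of these changes: _wrap_hf_model now receives a flat {module_id: module_info} config, and the "send_model" request carries the module's config entry alongside the serialized file. A rough sketch of the new request payload; all values below are placeholders, not real IDs or layer names:

# Hypothetical stand-ins for a real module/worker pairing.
module_id = "9f2c"  # placeholder module hash
worker_id = "worker1"
file_name = f"{module_id}_{worker_id}.pt"

module_info = {                     # one entry of the distributed config
    "module": "LlamaDecoderLayer",  # hypothetical module name
    "assigned_workers": [worker_id],
    "type": "offloaded_group",
    "layer_range": (0, 2),
}

# spawn_worker() now forwards module_info, turning the old 3-tuple into a 4-tuple:
request = ("send_model", (file_name, worker_id, module_id, module_info))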

tensorlink/ml/validator.py

Lines changed: 11 additions & 11 deletions
@@ -284,7 +284,6 @@ def _manage_auto_loaded_models(self):
                     "debug_print",
                     (f"Auto-loading model: {model_name}", "green", logging.INFO),
                 )
-                self.models_initializing.add(model_name)
                 self._initialize_hosted_job(model_name)
 
         # Try to finalize models that are initializing
@@ -313,7 +312,7 @@ def _manage_auto_loaded_models(self):
                 )
                 self._remove_hosted_job(model_name)
 
-    def inspect_model(self, model_name: str, job_data: dict):
+    def inspect_model(self, model_name: str, job_data: dict) -> dict:
         """Inspect a model to determine network requirements and store distribution in JSON cache"""
         parser = ModelParser()
         model_name: str = job_data.get("model_name", model_name)
@@ -350,7 +349,9 @@ def inspect_model(self, model_name: str, job_data: dict):
 
         # Send out job request
         try:
-            self.send_request("send_job_request", job_data)
+            new_job_data = self.send_request("send_job_request", job_data)
+            return new_job_data
+
         except Exception as e:
             print(str(e))
 
@@ -541,12 +542,12 @@ def _handle_generate_request(self, request: GenerationRequest):
 
     def _try_finalize_initializing_models(self):
         """Attempt to finalize all models that are currently initializing."""
-        for model_name in list(self.models_initializing):
-            if self._finalize_hosted_job(model_name):
+        for job_id in list(self.models_initializing):
+            if self._finalize_hosted_job(job_id):
                 self.send_request(
                     "debug_print",
                     (
-                        f"Successfully finalized model: {model_name}",
+                        f"Successfully finalized model: {job_id}",
                         "green",
                         logging.INFO,
                     ),
@@ -596,8 +597,8 @@ def _initialize_hosted_job(
         }
 
         # Inspect model to determine network requirements
-        self.inspect_model(model_name, job_data)
-
+        job_data = self.inspect_model(model_name, job_data)
+        self.models_initializing.add(job_data.get("id"))
         self.send_request(
             "debug_print",
             (f"Initialized hosted job for {model_name}", "green", logging.INFO),
@@ -611,11 +612,11 @@ def _initialize_hosted_job(
         if model_name in self.model_state:
             del self.model_state[model_name]
 
-    def _finalize_hosted_job(self, model_name: str):
+    def _finalize_hosted_job(self, job_id: str):
         """Finalize a hosted job by setting up the distributed model with workers."""
         try:
             # Check if we have module info ready
-            args = self.send_request("check_module", None)
+            args = self.send_request("check_module", job_id)
 
             if not args or not isinstance(args, dict):
                 # Module not ready yet
@@ -638,7 +639,6 @@ def _finalize_hosted_job(self, model_name: str):
 
         # Register the distributed model's modules
         for module_id, module_info in distribution.items():
-            module_id = hashlib.sha256(json.dumps(module_info).encode()).hexdigest()
             self.modules[module_id] = module_info
 
         # Distribute the model across workers
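
Taken together, the validator changes re-key the hosted-job pipeline from model names to the job id returned by send_job_request. A condensed sketch of the resulting flow (method bodies elided; this summarizes the diff rather than documenting a public API):

# Pseudocode-level summary of the re-keyed lifecycle; names mirror the diff above.
def _initialize_hosted_job(self, model_name, job_data):
    job_data = self.inspect_model(model_name, job_data)  # now returns the updated job data
    self.models_initializing.add(job_data.get("id"))     # track by job id, not model name

def _try_finalize_initializing_models(self):
    for job_id in list(self.models_initializing):
        # check_module is now queried with the specific job id instead of None
        self._finalize_hosted_job(job_id)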
