
Commit 88e57ac

more README
1 parent e036ebc commit 88e57ac

5 files changed: +59 -37 lines changed


benchmarking/pyg_serial.py

Lines changed: 3 additions & 1 deletion
@@ -31,7 +31,9 @@ def create_parser():
 
 
 def get_dataset(download_path=None):
-    dataset = PygNodePropPredDataset(name="ogbn-products", root=input_dir, transform=T.NormalizeFeatures())
+    dataset = PygNodePropPredDataset(
+        name="ogbn-products", root=input_dir, transform=T.NormalizeFeatures()
+    )
     gcn_norm = T.GCNNorm()
     return (gcn_norm.forward(dataset[0]), dataset.num_classes)
 
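For context, the call reformatted above loads the `ogbn-products` benchmark with per-node feature normalization and then applies GCN normalization to the graph. A self-contained sketch of the same pattern, with a placeholder `root` path standing in for the module's `input_dir`:

```python
import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset

# Load ogbn-products with row-normalized node features; "./data" is a
# placeholder for the module's input_dir.
dataset = PygNodePropPredDataset(
    name="ogbn-products", root="./data", transform=T.NormalizeFeatures()
)

# Apply symmetric GCN normalization to the first (and only) graph, as
# get_dataset() does, and keep the label count alongside it.
data = T.GCNNorm().forward(dataset[0])
num_classes = dataset.num_classes
```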

benchmarking/spmm.py

Lines changed: 10 additions & 15 deletions
@@ -5,7 +5,7 @@
 
 
 def multiply_sharded_matrices_padded(
-    pt_file
+    pt_file,
     shard_row,
     shard_col,
     shard_x_col,
@@ -36,14 +36,8 @@ def multiply_sharded_matrices_padded(
     original_x_cols = x.shape[1]
 
     # Calculate padded dimensions for edge_index (implied adjacency matrix)
-    padded_rows = (
-        (original_num_nodes + shard_row - 1) // shard_row * shard_row
-    )
-    padded_cols_x = (
-        (original_num_nodes + shard_col - 1)
-        // shard_col
-        * shard_col
-    )
+    padded_rows = (original_num_nodes + shard_row - 1) // shard_row * shard_row
+    padded_cols_x = (original_num_nodes + shard_col - 1) // shard_col * shard_col
 
     # Calculate padded dimensions for x
     padded_x_rows = (
@@ -54,9 +48,7 @@ def multiply_sharded_matrices_padded(
         padded_x_cols = original_x_cols
     else:
         padded_x_cols = (
-            (original_x_cols + shard_x_col - 1)
-            // shard_x_col
-            * shard_x_col
+            (original_x_cols + shard_x_col - 1) // shard_x_col * shard_x_col
         )
 
     # Calculate shard sizes for padded dimensions
@@ -99,7 +91,10 @@ def multiply_sharded_matrices_padded(
     x_end_col = x_col_shard_size
     sharded_x = padded_x[x_start_row:x_end_row, x_start_col:x_end_col]
 
-    print("Theoretical # of FLOPs (2 * NNZ * D): " + str(2 * sharded_adj_t._nnz() * sharded_x.shape[1]))
+    print(
+        "Theoretical # of FLOPs (2 * NNZ * D): "
+        + str(2 * sharded_adj_t._nnz() * sharded_x.shape[1])
+    )
 
     # Move tensors to CUDA if available
     if torch.cuda.is_available():
@@ -151,7 +146,7 @@ def multiply_sharded_matrices_padded(
     parser.add_argument(
         "pt_file",
         type=int,
-        help="Path to plexus processed .pt file containing the data"
+        help="Path to plexus processed .pt file containing the data",
     )
     parser.add_argument(
         "shard_row",
@@ -189,5 +184,5 @@ def multiply_sharded_matrices_padded(
         args.shard_col,
         args.shard_x_col,
         args.iterations,
-        args.warmup
+        args.warmup,
     )
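The padding arithmetic consolidated above is the usual round-up-to-a-multiple idiom, and the printed figure is the standard SpMM FLOP estimate: one multiply and one add per nonzero per feature column, hence 2 * NNZ * D. A small sketch with illustrative numbers (the names here are not from the repo):

```python
def round_up(n: int, k: int) -> int:
    # Smallest multiple of k that is >= n; same idiom as the diff above.
    return (n + k - 1) // k * k

# Example: pad 1,000,003 rows so they split evenly across 4 shards.
padded_rows = round_up(1_000_003, 4)  # -> 1_000_004

# SpMM FLOP estimate: each of the NNZ nonzeros contributes one multiply
# and one add for each of the D feature columns.
nnz, d = 123_456, 128
flops = 2 * nnz * d  # -> 31_604_736
```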

plexus/README.md

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+## Files
+
+- **gcn_conv.py**: This file implements a 3D tensor-parallel version of the `GCNConv` layer, a fundamental component in Graph Convolutional Networks.
+
+- **cross_entropy.py**: This file provides a parallel implementation of the cross-entropy loss function, a standard loss function used for node-level classification.
+
+- **utils/**: This subdirectory contains several utility modules that provide essential functionality for the Plexus framework:
+
+  - **general.py**: This module includes generic utility functions used throughout the framework, including the following:
+    - setting a random seed for reproducible experiments
+    - padding a number to make it divisible by another number, which is helpful when sharding
+    - functions for retrieving process group information
+    - functions for printing timing information
+
+  - **dataset.py**: This module provides utilities for preprocessing graph datasets. Key functions include:
+    - `preprocess_graph()`: Preprocesses a graph dataset. This includes normalizing the features and the adjacency matrix, and applying the double permutation scheme specific to Plexus. It is recommended to call `set_seed` from `general.py` before this function, since features are randomly initialized for datasets that do not originally contain them.
+    - `partition_graph_2d()`: Statically partitions a preprocessed graph in 2D, creating an individual file for each 2D matrix partition. This allows the data to be distributed across multiple devices, so no single GPU has to load the entire dataset.
+    - Other utility functions for data conversion and manipulation.
+
+  - **dataloader.py**: This module contains the `DataLoader` class, which is responsible for efficiently loading preprocessed graph data. The `DataLoader` supports two modes:
+    - Loading unpartitioned (original) preprocessed data.
+    - Loading partitioned data generated by `partition_graph_2d()`. In this case, the `DataLoader` automatically determines which files to load for each GPU and extracts the relevant data shards.
+
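The `dataset.py` bullets above correspond to the driver code removed from `plexus/utils/dataset.py` later in this commit. A minimal sketch of that workflow, with assumed import paths and placeholder directories:

```python
from plexus.utils.general import set_seed  # assumed import path
from plexus.utils.dataset import preprocess_graph, partition_graph_2d

# Seed first: preprocess_graph() randomly initializes features for
# datasets that do not ship with any.
set_seed(0)

# Normalize features and the adjacency matrix and apply Plexus's double
# permutation scheme, producing a processed .pt file.
preprocess_graph("papers", "/path/to/original", "/path/to/papers")

# Statically 2D-partition the processed graph into per-shard files; the
# 16 mirrors the removed __main__ example in dataset.py.
partition_graph_2d(
    "/path/to/papers/processed_papers.pt", 16, "/path/to/partitioned_papers"
)

# NOTE: the removed driver also registered safe globals via
# torch.serialization.add_safe_globals() before loading; mirror that if
# torch.load rejects the pickled PyG types.
```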

plexus/utils/README.md

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+## Files
+
+- **gcn_conv.py**: This file implements a 3D tensor-parallel version of the `GCNConv` layer, a fundamental component in Graph Convolutional Networks.
+
+- **cross_entropy.py**: This file provides a parallel implementation of the cross-entropy loss function, a standard loss function used for node-level classification.
+
+- **utils/**: This subdirectory contains several utility modules that provide essential functionality for the Plexus framework:
+
+  - **general.py**: This module includes generic utility functions used throughout the framework, including the following:
+    - setting a random seed for reproducible experiments
+    - padding a number to make it divisible by another number, which is helpful when sharding
+    - functions for retrieving process group information
+    - functions for printing timing information
+
+  - **dataset.py**: This module provides utilities for preprocessing graph datasets. Key functions include:
+    - `preprocess_graph()`: Preprocesses a graph dataset. This includes normalizing the features and the adjacency matrix, and applying the double permutation scheme specific to Plexus. It is recommended to call `set_seed` from `general.py` before this function, since features are randomly initialized for datasets that do not originally contain them.
+    - `partition_graph_2d()`: Statically partitions a preprocessed graph in 2D, creating an individual file for each 2D matrix partition. This allows the data to be distributed across multiple devices, so no single GPU has to load the entire dataset.
+    - Other utility functions for data conversion and manipulation.
+
+  - **dataloader.py**: This module contains the `DataLoader` class, which is responsible for efficiently loading preprocessed graph data. The `DataLoader` supports two modes:
+    - Loading unpartitioned (original) preprocessed data.
+    - Loading partitioned data generated by `partition_graph_2d()`. In this case, the `DataLoader` automatically determines which files to load for each GPU and extracts the relevant data shards.
+
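A hedged sketch of the two `DataLoader` modes described above; the constructor arguments shown are illustrative guesses, since this commit does not show the actual signature:

```python
from plexus.utils.dataloader import DataLoader  # assumed import path

# Mode 1: load the unpartitioned, preprocessed dataset directly
# (hypothetical argument).
loader = DataLoader("/path/to/papers/processed_papers.pt")

# Mode 2: point at the output directory of partition_graph_2d(); the
# loader determines which shard files belong to this GPU and extracts
# the relevant shards (hypothetical keyword).
loader = DataLoader("/path/to/partitioned_papers", partitioned=True)
```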

plexus/utils/dataset.py

Lines changed: 0 additions & 21 deletions
@@ -507,24 +507,3 @@ def process_partition(chunk_idx_dim1, chunk_idx_dim2):
     for future in futures:
         future.result()  # Ensure completion
 
-
-if __name__ == "__main__":
-    # don't delete these two lines
-    set_seed(0)
-    torch.serialization.add_safe_globals(
-        [dtype, scalar, GlobalStorage, DataEdgeAttr, DataTensorAttr]
-    )
-
-    """
-    preprocess_graph(
-        "papers",
-        "/pscratch/sd/a/aranjan/gnn-env/gnn-datasets/original",
-        "/pscratch/sd/a/aranjan/gnn-env/gnn-datasets/papers",
-    )
-    """
-
-    partition_graph_2d(
-        "/pscratch/sd/a/aranjan/gnn-env/gnn-datasets/papers/processed_papers.pt",
-        16,
-        "/pscratch/sd/a/aranjan/gnn-env/gnn-datasets/partitioned_papers",
-    )
