
Commit

Merge branch 'develop' of https://github.com/dice-group/dice-embeddings into kfold-crossvalidation-verbose
sshivam95 committed Nov 29, 2024
2 parents 27d9dd6 + fb436e6 commit 85f2383
Showing 10 changed files with 362 additions and 197 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -1,3 +1,4 @@
[![Downloads](https://static.pepy.tech/badge/dicee)](https://pepy.tech/project/dicee)
[![Downloads](https://img.shields.io/pypi/dm/dicee)](https://pypi.org/project/dicee/)
[![Coverage](https://img.shields.io/badge/coverage-54%25-green)](https://dice-group.github.io/dice-embeddings/usage/main.html#coverage-report)
[![Pypi](https://img.shields.io/badge/pypi-0.1.4-blue)](https://pypi.org/project/dicee/0.1.4/)
4 changes: 3 additions & 1 deletion dicee/config.py
@@ -85,7 +85,6 @@ def __init__(self, **kwargs):

self.label_smoothing_rate: float = 0.0


self.num_core: int = 0
"""Number of CPUs to be used in the mini-batch loading process"""

@@ -139,6 +138,9 @@ def __init__(self, **kwargs):
self.continual_learning=None
"Path of a pretrained model size of LLM"

self.auto_batch_finding=False
"A flag for using auto batch finding"

def __iter__(self):
# Iterate
for k, v in self.__dict__.items():
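The new auto_batch_finding flag defaults to False. As a rough illustration of what automatic batch-size finding typically does, the sketch below doubles the batch size until the GPU runs out of memory and keeps the last size that fit; the helper name and probing strategy are assumptions for illustration, not code from this commit.

import torch

def find_max_batch_size(model, make_batch, start=2, limit=2 ** 16):
    # Hypothetical helper (not part of this commit): double the batch size
    # until a CUDA out-of-memory error occurs, then return the last size that fit.
    best = start
    batch_size = start
    while batch_size <= limit:
        try:
            with torch.no_grad():
                model(make_batch(batch_size))  # forward pass only, to probe memory
            best = batch_size
            batch_size *= 2
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            break
    return best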
31 changes: 22 additions & 9 deletions dicee/models/ensemble.py
@@ -1,25 +1,20 @@
import torch
import copy

import torch._dynamo

torch._dynamo.config.suppress_errors = True


class EnsembleKGE:
def __init__(self, seed_model):
self.models = []
self.optimizers = []
self.loss_history = []
for i in range(torch.cuda.device_count()):
i_model=copy.deepcopy(seed_model)
i_model.to(torch.device(f"cuda:{i}"))
# TODO: Why we cant send the compile model to cpu ?
# i_model = torch.compile(i_model)
#i_model = torch.compile(i_model)
i_model.to(torch.device(f"cuda:{i}"))
self.optimizers.append(i_model.configure_optimizers())
self.models.append(i_model)
# Maybe use the original model's name ?
self.name="TP_"+self.models[0].name
self.name=self.models[0].name
self.train_mode=True

def named_children(self):
@@ -87,7 +82,25 @@ def __call__(self,x_batch):
def step(self):
for opt in self.optimizers:
opt.step()


def get_embeddings(self):
entity_embeddings=[]
relation_embeddings=[]
# () Iterate
for trained_model in self.models:
entity_emb, relation_ebm = trained_model.get_embeddings()
entity_embeddings.append(entity_emb)
if relation_ebm is not None:
relation_embeddings.append(relation_ebm)
# () Concat the embedding vectors horizontally.
entity_embeddings=torch.cat(entity_embeddings,dim=1)
if relation_embeddings:
relation_embeddings=torch.cat(relation_embeddings,dim=1)
else:
relation_embeddings=None

return entity_embeddings, relation_embeddings

"""
def __getattr__(self, name):
# Create a function that will call the same attribute/method on each model
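The new get_embeddings method gathers the entity and relation matrices from every replica and concatenates them along the embedding dimension, so N replicas with embedding size d yield vectors of size N*d. A self-contained sketch of that concatenation with toy tensors (not actual model output):

import torch

# Two replicas, each embedding 5 entities and 3 relations in 4 dimensions.
entity_shards = [torch.randn(5, 4), torch.randn(5, 4)]
relation_shards = [torch.randn(3, 4), torch.randn(3, 4)]

# dim=1 keeps one row per entity/relation and widens each vector to 2 * 4 = 8 columns.
entity_embeddings = torch.cat(entity_shards, dim=1)
relation_embeddings = torch.cat(relation_shards, dim=1)
assert entity_embeddings.shape == (5, 8)
assert relation_embeddings.shape == (3, 8)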
20 changes: 10 additions & 10 deletions dicee/sanity_checkers.py
@@ -32,11 +32,11 @@ def validate_knowledge_graph(args):

elif args.path_single_kg is not None:
if args.sparql_endpoint is not None or args.path_single_kg is not None:
print(f'The dataset_dir and sparql_endpoint arguments '
f'must be None if path_single_kg is given.'
f'***{args.dataset_dir}***\n'
f'***{args.sparql_endpoint}***\n'
f'These two parameters are set to None.')
#print(f'The dataset_dir and sparql_endpoint arguments '
# f'must be None if path_single_kg is given.'
# f'***{args.dataset_dir}***\n'
# f'***{args.sparql_endpoint}***\n'
# f'These two parameters are set to None.')
args.dataset_dir = None
args.sparql_endpoint = None

@@ -61,11 +61,11 @@ def validate_knowledge_graph(args):
f"Use --path_single_kg **folder/dataset.format**, if you have a single file.")

if args.sparql_endpoint is not None or args.path_single_kg is not None:
print(f'The sparql_endpoint and path_single_kg arguments '
f'must be None if dataset_dir is given.'
f'***{args.sparql_endpoint}***\n'
f'***{args.path_single_kg}***\n'
f'These two parameters are set to None.')
#print(f'The sparql_endpoint and path_single_kg arguments '
# f'must be None if dataset_dir is given.'
# f'***{args.sparql_endpoint}***\n'
# f'***{args.path_single_kg}***\n'
# f'These two parameters are set to None.')
args.sparql_endpoint = None
args.path_single_kg = None

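The commented-out prints silence the warnings but keep the behaviour: when path_single_kg is given, dataset_dir and sparql_endpoint are reset to None, and when dataset_dir is given, the other two are reset instead. A condensed sketch of that precedence rule, using a plain namespace and placeholder paths in place of the parsed arguments:

from types import SimpleNamespace

def resolve_input_source(args):
    # Condensed restatement of the precedence applied in validate_knowledge_graph.
    if args.path_single_kg is not None:
        args.dataset_dir = None
        args.sparql_endpoint = None
    elif args.dataset_dir is not None:
        args.sparql_endpoint = None
        args.path_single_kg = None
    return args

args = resolve_input_source(SimpleNamespace(
    dataset_dir="KGs/UMLS", path_single_kg="KGs/UMLS/train.txt", sparql_endpoint=None))
assert args.dataset_dir is None and args.path_single_kg == "KGs/UMLS/train.txt"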
3 changes: 3 additions & 0 deletions dicee/scripts/run.py
@@ -123,6 +123,9 @@ def get_default_arguments(description=None):
parser.add_argument("--swa",
action="store_true",
help="Stochastic weight averaging")
parser.add_argument("--auto_batch_finding",
action="store_true",
help="Find a batch size fitting in GPUs. Only available for TP trainer")
parser.add_argument('--degree', type=int, default=0,
help='degree for polynomial embeddings')
parser.add_argument('--disable_checkpointing', action='store_true', help='Disable creation of checkpoints during training')
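Since the new argument uses action="store_true", auto_batch_finding is False unless the flag is passed on the command line. A minimal, standalone argparse sketch (not the project's full parser) showing that behaviour:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--auto_batch_finding", action="store_true",
                    help="Find a batch size fitting in GPUs. Only available for TP trainer")

assert parser.parse_args([]).auto_batch_finding is False
assert parser.parse_args(["--auto_batch_finding"]).auto_batch_finding is True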
86 changes: 35 additions & 51 deletions dicee/static_funcs.py
@@ -684,16 +684,15 @@ def download_pretrained_model(url: str) -> str:
download_files_from_url(url_to_download_from, destination_folder=dir_name)
return dir_name

def write_csv_from_model_parallel(path: str) -> None:
def write_csv_from_model_parallel(path: str) :
"""Create"""
assert os.path.exists(path), "Path does not exist"

# Detect files that start with model_ and end with .pt
model_files = [f for f in os.listdir(path) if f.startswith("model_") and f.endswith(".pt")]
model_files.sort() # Sort to maintain order if necessary (e.g., model_0.pt, model_1.pt)

entity_csv_path = os.path.join(path, "entity_embeddings.csv")
relation_csv_path = os.path.join(path, "relation_embeddings.csv")
entity_embeddings=[]
relation_embeddings=[]

# Process each model file
for model_file in model_files:
@@ -702,65 +701,50 @@ def write_csv_from_model_parallel(path: str) -> None:
model = torch.load(model_path)
# Assuming model has a get_embeddings method
entity_emb, relation_emb = model["_orig_mod.entity_embeddings.weight"], model["_orig_mod.relation_embeddings.weight"]
# Convert to numpy
entity_emb = entity_emb.numpy()
relation_emb = relation_emb.numpy()

# Write or append to CSV
if not os.path.exists(entity_csv_path) or not os.path.exists(relation_csv_path):
# If CSV files do not exist, create them
pd.DataFrame(entity_emb).to_csv(entity_csv_path, index=True, header=False)
pd.DataFrame(relation_emb).to_csv(relation_csv_path, index=True, header=False)
else:
# If CSV files exist, concatenate to the existing rows
existing_entity_df = pd.read_csv(entity_csv_path, header=None)
existing_relation_df = pd.read_csv(relation_csv_path, header=None)
entity_embeddings.append(entity_emb)
relation_embeddings.append(relation_emb)

# Concatenate along the columns (axis=1)
new_entity_df = pd.concat([existing_entity_df, pd.DataFrame(entity_emb)], axis=1)
new_relation_df = pd.concat([existing_relation_df, pd.DataFrame(relation_emb)], axis=1)
return torch.cat(entity_embeddings, dim=1), torch.cat(relation_embeddings, dim=1)

# Write the updated data back to the CSV files
new_entity_df.to_csv(entity_csv_path, index=False, header=False)
new_relation_df.to_csv(relation_csv_path, index=False, header=False)

def from_pretrained_model_write_embeddings_into_csv(path: str) -> None:
""" """
assert os.path.exists(path), "Path does not exist"
config = load_json(path + '/configuration.json')
if config["trainer"]=="MP":
write_csv_from_model_parallel(path)
entity_csv_path = os.path.join(path, f"{config['model']}_entity_embeddings.csv")
relation_csv_path = os.path.join(path, f"{config['model']}_relation_embeddings.csv")

if config["trainer"]=="TP":
entity_emb, relation_emb = write_csv_from_model_parallel(path)
else:
entity_csv_path = os.path.join(path, f"{config['model']}_entity_embeddings.csv")
relation_csv_path = os.path.join(path, f"{config['model']}_relation_embeddings.csv")
# Load model
model = torch.load(os.path.join(path, "model.pt"))
# Assuming model has a get_embeddings method
entity_emb, relation_emb = model["entity_embeddings.weight"], model["relation_embeddings.weight"]
str_entity = pd.read_csv(f"{path}/entity_to_idx.csv", index_col=0)["entity"]
assert str_entity.index.is_monotonic_increasing
str_entity=str_entity.to_list()
# Write entity embeddings with headers and indices
with open(entity_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(entity_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name,row) in enumerate(zip(str_entity,entity_emb)):
writer.writerow([name] + row.tolist())
str_relations = pd.read_csv(f"{path}/relation_to_idx.csv", index_col=0)["relation"]
assert str_relations.index.is_monotonic_increasing

# Write relation embeddings with headers and indices
with open(relation_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(relation_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name, row) in enumerate(zip(str_relations,relation_emb)):
writer.writerow([name]+ row.tolist())
str_entity = pd.read_csv(f"{path}/entity_to_idx.csv", index_col=0)["entity"]
assert str_entity.index.is_monotonic_increasing
str_entity=str_entity.to_list()
# Write entity embeddings with headers and indices
with open(entity_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(entity_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name,row) in enumerate(zip(str_entity,entity_emb)):
writer.writerow([name] + row.tolist())
str_relations = pd.read_csv(f"{path}/relation_to_idx.csv", index_col=0)["relation"]
assert str_relations.index.is_monotonic_increasing

# Write relation embeddings with headers and indices
with open(relation_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(relation_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name, row) in enumerate(zip(str_relations,relation_emb)):
writer.writerow([name]+ row.tolist())

"""
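After this refactor, write_csv_from_model_parallel returns the concatenated tensors and the CSV writing happens in from_pretrained_model_write_embeddings_into_csv, with the entity or relation label in the first column and one numbered column per embedding dimension. A small sketch reproducing that row format with the csv module (toy names and random tensors, not output from a trained model):

import csv
import torch

entity_names = ["Berlin", "Germany"]   # placeholder labels
entity_emb = torch.randn(2, 4)         # two entities, embedding dimension 4

with open("entity_embeddings_example.csv", "w", newline="") as f:
    writer = csv.writer(f)
    # Header row: empty cell for the label column, then "0", "1", ..., "d-1".
    writer.writerow([""] + [str(i) for i in range(entity_emb.size(1))])
    for name, row in zip(entity_names, entity_emb):
        writer.writerow([name] + row.tolist())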