Merge pull request #166 from dice-group/develop

Demirrr · web-flow · commit 7566936efcb7 · 2023-10-16T16:41:44.000+02:00
Develop
diff --git a/README.md b/README.md
@@ -74,14 +74,15 @@ from dicee.executer import Execute
 from dicee.config import Namespace
 args = Namespace()
 args.model = 'Keci'
-args.scoring_technique = "AllvsAll"
-args.path_dataset_folder = "KGs/UMLS/"
-args.path_to_store_single_run="Keci_UMLS"
+args.scoring_technique = "KvsAll"  # 1vsAll, or AllvsAll, or NegSample
+args.dataset_dir = "KGs/UMLS/"
+args.path_to_store_single_run = "Keci_UMLS"
 args.num_epochs = 100
 args.embedding_dim = 32
+args.batch_size = 1024
 reports = Execute(args).start()
-# reports["Train"]["MRR"] =>0.97089
-# reports["Test"]["MRR"] => 0.8197
+print(reports["Train"]["MRR"]) # => 0.9912
+print(reports["Test"]["MRR"]) # => 0.8155
 # See the Keci_UMLS folder embeddings and all other files
 ```
 where the data is in the following form
@@ -93,16 +94,22 @@ alga    isa     entity
 ```
 A KGE model can also be trained from the command line
 ```bash
-python -m dicee.run --path_dataset_folder "KGs/UMLS" --model Keci --eval_model "train_val_test"
+dicee --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test"
 ```
 Models can be easily trained in a single node multi-gpu setting
 ```bash
-python -m dicee.run --accelerator "gpu" --strategy "ddp" --path_dataset_folder "KGs/UMLS" --model Keci --eval_model "train_val_test" 
+dicee --accelerator "gpu" --strategy "ddp" --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" 
 ```
+Similarly, models can be easily trained in a multi-node multi-gpu setting
+```bash
+torchrun --nnodes 2 --nproc_per_node=gpu  --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
+torchrun --nnodes 2 --nproc_per_node=gpu  --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
+```
+
 Train a KGE model by providing the path of a single file and store all parameters under a newly created directory
 called `KeciFamilyRun`.
 ```bash
-python -m dicee.run --path_single_kg "KGs/Family/train.txt" --model Keci --path_to_store_single_run KeciFamilyRun
+dicee --path_single_kg "KGs/Family/family-benchmark_rich_background.owl" --model Keci --path_to_store_single_run KeciFamilyRun --backend rdflib
 ```
 where the data is in the following form
 ```bash
@@ -114,7 +121,7 @@ _:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07
 **Apart from n-triples or standard link prediction dataset formats, we support ["owl", "nt", "turtle", "rdf/xml", "n3"].**
 Moreover, a KGE model can also be trained by providing **an endpoint of a triple store**.
 ```bash
-python -m dicee.run --sparql_endpoint "http://localhost:3030/mutagenesis/" --model Keci
+dicee --sparql_endpoint "http://localhost:3030/mutagenesis/" --model Keci
 ```
 For more, please refer to `examples`.
 </details>
diff --git a/dicee/config.py b/dicee/config.py
@@ -2,13 +2,13 @@
 class Namespace(argparse.Namespace):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.dataset_dir: str = 'KGs/UMLS'
+        self.dataset_dir: str = None
         "The path of a folder containing train.txt, and/or valid.txt and/or test.txt"
 
         self.save_embeddings_as_csv: bool = False
         "A flag for saving embeddings in csv file."
 
-        self.storage_path: str = 'Experiments'
+        self.storage_path: str = "Experiments"
         "A directory named with time of execution under --storage_path that contains related data about embeddings."
 
         self.path_to_store_single_run: str = None
diff --git a/dicee/knowledge_graph_embeddings.py b/dicee/knowledge_graph_embeddings.py
@@ -46,19 +46,19 @@ def eval_lp_performance(self, dataset=List[Tuple[str, str, str]], filtered=True)
             return evaluate_lp(model=self.model, triple_idx=idx_dataset, num_entities=len(self.entity_to_idx),
                                er_vocab=None, re_vocab=None)
 
-    def predict_missing_head_entity(self, relation: List[str], tail_entity: List[str]) -> Tuple:
+    def predict_missing_head_entity(self, relation: Union[List[str], str], tail_entity: Union[List[str], str]) -> Tuple:
         """
         Given a relation and a tail entity, return top k ranked head entity.
 
         argmax_{e \in E } f(e,r,t), where r \in R, t \in E.
 
         Parameter
         ---------
-        relation: List[str]
+        relation:  Union[List[str], str]
 
         String representation of selected relations.
 
-        tail_entity: List[str]
+        tail_entity: Union[List[str], str]
 
         String representation of selected entities.
 
@@ -74,14 +74,22 @@ def predict_missing_head_entity(self, relation: List[str], tail_entity: List[str
         """
 
         head_entity = torch.arange(0, len(self.entity_to_idx))
-        relation = torch.LongTensor([self.relation_to_idx[i] for i in relation])
-        tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
+        if isinstance(relation, list):
+            relation = torch.LongTensor([self.relation_to_idx[i] for i in relation])
+        else:
+            relation = torch.LongTensor([self.relation_to_idx[relation]])
+        if isinstance(tail_entity, list):
+            tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
+        else:
+            tail_entity = torch.LongTensor([self.entity_to_idx[tail_entity]])
+
         x = torch.stack((head_entity,
                          relation.repeat(self.num_entities, ),
                          tail_entity.repeat(self.num_entities, )), dim=1)
         return self.model.forward(x)
 
-    def predict_missing_relations(self, head_entity: List[str], tail_entity: List[str]) -> Tuple:
+    def predict_missing_relations(self, head_entity: Union[List[str], str],
+                                  tail_entity: Union[List[str], str]) -> Tuple:
         """
         Given a head entity and a tail entity, return top k ranked relations.
 
@@ -109,19 +117,23 @@ def predict_missing_relations(self, head_entity: List[str], tail_entity: List[st
         Highest K scores and entities
         """
 
-        head_entity = torch.LongTensor([self.entity_to_idx[i] for i in head_entity])
         relation = torch.arange(0, len(self.relation_to_idx))
-        tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
 
+        if isinstance(head_entity, list):
+            head_entity = torch.LongTensor([self.entity_to_idx[i] for i in head_entity])
+        else:
+            head_entity = torch.LongTensor([self.entity_to_idx[head_entity]])
+        if isinstance(tail_entity, list):
+            tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
+        else:
+            tail_entity = torch.LongTensor([self.entity_to_idx[tail_entity]])
         x = torch.stack((head_entity.repeat(self.num_relations, ),
                          relation,
                          tail_entity.repeat(self.num_relations, )), dim=1)
         return self.model(x)
-        # scores = self.model(x)
-        # sort_scores, sort_idxs = torch.topk(scores, topk)
-        # return sort_scores, [self.idx_to_relations[i] for i in sort_idxs.tolist()]
 
-    def predict_missing_tail_entity(self, head_entity: List[str], relation: List[str]) -> torch.FloatTensor:
+    def predict_missing_tail_entity(self, head_entity: Union[List[str], str],
+                                    relation: Union[List[str], str]) -> torch.FloatTensor:
         """
         Given a head entity and a relation, return top k ranked entities
 
@@ -143,21 +155,38 @@ def predict_missing_tail_entity(self, head_entity: List[str], relation: List[str
 
         scores
         """
-        x = torch.cat((torch.LongTensor([self.entity_to_idx[i] for i in head_entity]).unsqueeze(-1),
-                       torch.LongTensor([self.relation_to_idx[i] for i in relation]).unsqueeze(-1)), dim=1)
+        tail_entity = torch.arange(0, len(self.entity_to_idx))
+
+        if isinstance(head_entity, list):
+            head_entity = torch.LongTensor([self.entity_to_idx[i] for i in head_entity])
+        else:
+            head_entity = torch.LongTensor([self.entity_to_idx[head_entity]])
+        if isinstance(relation, list):
+            relation = torch.LongTensor([self.relation_to_idx[i] for i in relation])
+        else:
+            relation = torch.LongTensor([self.relation_to_idx[relation]])
+
+        x = torch.stack((head_entity.repeat(self.num_entities, ),
+                         relation.repeat(self.num_entities, ),
+                         tail_entity), dim=1)
         return self.model.forward(x)
 
-    def predict(self, *, h: List[str] = None, r: List[str] = None, t: List[str] = None):
+    def predict(self, *, h: Union[List[str], str] = None, r: Union[List[str], str] = None,
+                t: Union[List[str], str] = None) -> torch.FloatTensor:
+        """
+        Predict sigmoid scores for a complete triple (h, r, t) or for all candidates of a missing h, r, or t.
+        """
         # (1) Sanity checking.
         if h is not None:
-            assert isinstance(h, list)
+            assert isinstance(h, list) or isinstance(h, str)
             assert isinstance(h[0], str)
         if r is not None:
-            assert isinstance(r, list)
+            assert isinstance(r, list) or isinstance(r, str)
             assert isinstance(r[0], str)
         if t is not None:
-            assert isinstance(t, list)
+            assert isinstance(t, list) or isinstance(t, str)
             assert isinstance(t[0], str)
+
         # (2) Predict missing head entity given a relation and a tail entity.
         if h is None:
             assert r is not None
@@ -177,7 +206,6 @@ def predict(self, *, h: List[str] = None, r: List[str] = None, t: List[str] = No
             # h r ?
             scores = self.predict_missing_tail_entity(h, r)
         else:
-            assert len(h) == len(r) == len(t)
             scores = self.triple_score(h, r, t)
         return torch.sigmoid(scores)
 
@@ -261,8 +289,8 @@ def predict_topk(self, *, h: List[str] = None, r: List[str] = None, t: List[str]
         else:
             raise AttributeError('Use triple_score method')
 
-    def triple_score(self, h: List[str] = None, r: List[str] = None,
-                     t: List[str] = None, logits=False) -> torch.FloatTensor:
+    def triple_score(self, h: Union[List[str], str] = None, r: Union[List[str], str] = None,
+                     t: Union[List[str], str] = None, logits=False) -> torch.FloatTensor:
         """
         Predict triple score
 
@@ -289,9 +317,14 @@ def triple_score(self, h: List[str] = None, r: List[str] = None,
 
         pytorch tensor of triple score
         """
-        h = torch.LongTensor([self.entity_to_idx[i] for i in h]).reshape(len(h), 1)
-        r = torch.LongTensor([self.relation_to_idx[i] for i in r]).reshape(len(r), 1)
-        t = torch.LongTensor([self.entity_to_idx[i] for i in t]).reshape(len(t), 1)
+        if isinstance(h, list) and isinstance(r, list) and isinstance(t, list):
+            h = torch.LongTensor([self.entity_to_idx[i] for i in h]).reshape(len(h), 1)
+            r = torch.LongTensor([self.relation_to_idx[i] for i in r]).reshape(len(r), 1)
+            t = torch.LongTensor([self.entity_to_idx[i] for i in t]).reshape(len(t), 1)
+        else:
+            h = torch.LongTensor([self.entity_to_idx[h]]).reshape(1, 1)
+            r = torch.LongTensor([self.relation_to_idx[r]]).reshape(1, 1)
+            t = torch.LongTensor([self.entity_to_idx[t]]).reshape(1, 1)
 
         x = torch.hstack((h, r, t))
         if self.apply_semantic_constraint:
@@ -343,7 +376,8 @@ def negnorm(self, tens_1: torch.Tensor, lambda_: float, neg_norm: str = 'standar
     def __single_hop_query_answering(self, query: Tuple[str, Tuple[str, ...]]):
         head, relation = query
         assert len(relation) == 1
-        return self.predict(h=[head], r=[relation[0]])
+        # scores for all entities
+        return self.predict(h=head, r=relation[0])
 
     def __return_answers_and_scores(self, query_score_of_all_entities, k: int):
         query_score_of_all_entities = [(ei, s) for ei, s in zip(self.entity_to_idx.keys(), query_score_of_all_entities)]
@@ -443,10 +477,10 @@ def answer_multi_hop_query(self, query_type: str = None, query: Tuple[Union[str,
                                                                              tnorm=tnorm,
                                                                              k=k):
                 top_k_scores1.append(score_of_e_r1_a)
-                # () Scores for all entities E
-                atom2_scores.append(self.predict(h=[top_k_entity], r=[relation2]))
+                # (.) Scores for all entities E
+                atom2_scores.append(self.predict(h=top_k_entity, r=relation2))
             # k by E tensor
-            atom2_scores = torch.cat(atom2_scores, dim=0)
+            atom2_scores = torch.vstack(atom2_scores)
             topk_scores1_expanded = torch.FloatTensor(top_k_scores1).view(-1, 1).repeat(1, atom2_scores.shape[1])
             query_scores, _ = torch.max(self.t_norm(topk_scores1_expanded, atom2_scores, tnorm), dim=0)
             if only_scores:
@@ -468,7 +502,7 @@ def answer_multi_hop_query(self, query_type: str = None, query: Tuple[Union[str,
                 # () Scores for all entities E
                 atom2_scores.append(self.predict(h=[top_k_entity], r=[relation3]))
             # k by E tensor
-            atom2_scores = torch.cat(atom2_scores, dim=0)
+            atom2_scores = torch.vstack(atom2_scores)
             topk_scores1_expanded = torch.FloatTensor(top_k_scores1).view(-1, 1).repeat(1, atom2_scores.shape[1])
             query_scores, _ = torch.max(self.t_norm(topk_scores1_expanded, atom2_scores, tnorm), dim=0)
             if only_scores:
diff --git a/dicee/query_generator.py b/dicee/query_generator.py
@@ -151,7 +151,7 @@ def achieve_answer(self, query: List[Union[str, List]],
         all_relation_flag = True
         for ele in query[-1]:
             # @TODO: unclear
-            if isinstance(ele,int) or (ele == -1):
+            if not isinstance(ele, int) or (ele == -1):
                 all_relation_flag = False
                 break
         if all_relation_flag:
diff --git a/dicee/run.py b/dicee/run.py
@@ -9,7 +9,7 @@ def get_default_arguments(description=None):
     parser = pl.Trainer.add_argparse_args(argparse.ArgumentParser(add_help=False))
     # Default Trainer param https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#methods
     # Data related arguments
-    parser.add_argument("--path_dataset_folder", type=str, default=None,
+    parser.add_argument("--dataset_dir", type=str, default=None,
                         help="The path of a folder containing train.txt, and/or valid.txt and/or test.txt"
                              ",e.g., KGs/UMLS")
     parser.add_argument("--sparql_endpoint", type=str, default=None,
@@ -40,15 +40,15 @@ def get_default_arguments(description=None):
                         choices=['Adam', 'SGD'])
     parser.add_argument('--embedding_dim', type=int, default=32,
                         help='Number of dimensions for an embedding vector. ')
-    parser.add_argument("--num_epochs", type=int, default=50, help='Number of epochs for training. ')
+    parser.add_argument("--num_epochs", type=int, default=1, help='Number of epochs for training. ')
     parser.add_argument('--batch_size', type=int, default=1024,
                         help='Mini batch size. If None, automatic batch finder is applied')
     parser.add_argument("--lr", type=float, default=0.1)
     parser.add_argument('--callbacks', type=json.loads,
                         default={},
                         help='{"PPE":{ "last_percent_to_consider": 10}}'
                              '"Perturb": {"level": "out", "ratio": 0.2, "method": "RN", "scaler": 0.3}')
-    parser.add_argument("--backend", type=str, default='pandas',
+    parser.add_argument("--backend", type=str, default="pandas",
                         choices=["pandas", "polars", "rdflib"],
                         help='Backend for loading, preprocessing, indexing input knowledge graph.')
     parser.add_argument("--trainer", type=str, default='PL',
@@ -102,6 +102,8 @@ def get_default_arguments(description=None):
         return parser.parse_args()
     return parser.parse_args(description)
 
+def main():
+    Execute(get_default_arguments()).start()
 
 if __name__ == '__main__':
-    Execute(get_default_arguments()).start()
+    main()
diff --git a/dicee/static_funcs.py b/dicee/static_funcs.py
@@ -75,7 +75,6 @@ def load_model(path_of_experiment_folder: str, model_name='model.pt') -> Tuple[o
     configs = load_json(path_of_experiment_folder + '/configuration.json')
     configs["num_entities"] = num_ent
     configs["num_relations"] = num_rel
-    #configs["embedding_dim"] = ent_dim
 
     print(f'Done! It took {time.time() - start_time:.3f}')
     # (4) Select the model
diff --git a/docs/index.rst b/docs/index.rst
@@ -19,16 +19,16 @@ Welcome to DICE Embeddings!
    .. code-block:: bash
 
       // 1 CPU
-      (dicee) $ python -m dicee.run --path_dataset_folder KGs/UMLS
+      (dicee) $ dicee --dataset_dir KGs/UMLS
       // 10 CPU
-      (dicee) $ python -m dicee.run --path_dataset_folder KGs/UMLS --num_core 10
+      (dicee) $ dicee --dataset_dir KGs/UMLS --num_core 10
       // Distributed Data Parallel (DDP) with all GPUs
-      (dicee) $ python -m dicee.run --trainer PL --accelerator gpu --strategy ddp --path_dataset_folder KGs/UMLS
+      (dicee) $ dicee --trainer PL --accelerator gpu --strategy ddp --dataset_dir KGs/UMLS
       // Model Parallel with all GPUs and low precision
-      (dicee) $ python -m dicee.run --trainer PL --accelerator gpu --strategy deepspeed_stage_3 --path_dataset_folder KGs/UMLS --precision 16
+      (dicee) $ dicee --trainer PL --accelerator gpu --strategy deepspeed_stage_3 --dataset_dir KGs/UMLS --precision 16
       // DDP with all GPUs on two nodes (felis and nebula):
-      (dicee) cdemir@felis  $ torchrun --nnodes 2 --nproc_per_node=gpu  --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.main --trainer torchDDP --path_dataset_folder KGs/UMLS
-      (dicee) cdemir@nebula $ torchrun --nnodes 2 --nproc_per_node=gpu  --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.main --trainer torchDDP --path_dataset_folder KGs/UMLS
+      (dicee) cdemir@felis  $ torchrun --nnodes 2 --nproc_per_node=gpu  --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
+      (dicee) cdemir@nebula $ torchrun --nnodes 2 --nproc_per_node=gpu  --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
 
 .. toctree::
    :maxdepth: 2
diff --git a/setup.py b/setup.py
@@ -27,6 +27,7 @@
         "Programming Language :: Python :: 3.9",
         "License :: OSI Approved :: MIT License"],
     python_requires='>=3.9',
+    entry_points={"console_scripts": ["dicee = dicee.run:main"]},
     long_description=long_description,
     long_description_content_type="text/markdown",
 )

Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,7 @@`
`27`	`27`	`"Programming Language :: Python :: 3.9",`
`28`	`28`	`"License :: OSI Approved :: MIT License"],`
`29`	`29`	`python_requires='>=3.9',`
	`30`	`+ entry_points={"console_scripts": ["dicee = dicee.run:main"]},`
`30`	`31`	`long_description=long_description,`
`31`	`32`	`long_description_content_type="text/markdown",`
`32`	`33`	`)`