Skip to content

Commit 7566936

Browse files
authored
Merge pull request #166 from dice-group/develop
Develop
2 parents 7d6d09d + fa3976c commit 7566936

File tree

8 files changed

+95
-52
lines changed

8 files changed

+95
-52
lines changed

README.md

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,15 @@ from dicee.executer import Execute
7474
from dicee.config import Namespace
7575
args = Namespace()
7676
args.model = 'Keci'
77-
args.scoring_technique = "AllvsAll"
78-
args.path_dataset_folder = "KGs/UMLS/"
79-
args.path_to_store_single_run="Keci_UMLS"
77+
args.scoring_technique = "KvsAll" # 1vsAll, or AllvsAll, or NegSample
78+
args.dataset_dir = "KGs/UMLS/"
79+
args.path_to_store_single_run = "Keci_UMLS"
8080
args.num_epochs = 100
8181
args.embedding_dim = 32
82+
args.batch_size = 1024
8283
reports = Execute(args).start()
83-
# reports["Train"]["MRR"] =>0.97089
84-
# reports["Test"]["MRR"] => 0.8197
84+
print(reports["Train"]["MRR"]) # => 0.9912
85+
print(reports["Test"]["MRR"]) # => 0.8155
8586
# See the Keci_UMLS folder embeddings and all other files
8687
```
8788
where the data is in the following form
@@ -93,16 +94,22 @@ alga isa entity
9394
```
9495
A KGE model can also be trained from the command line
9596
```bash
96-
python -m dicee.run --path_dataset_folder "KGs/UMLS" --model Keci --eval_model "train_val_test"
97+
dicee --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test"
9798
```
9899
Models can be easily trained in a single node multi-gpu setting
99100
```bash
100-
python -m dicee.run --accelerator "gpu" --strategy "ddp" --path_dataset_folder "KGs/UMLS" --model Keci --eval_model "train_val_test"
101+
dicee --accelerator "gpu" --strategy "ddp" --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test"
101102
```
103+
Similarly, models can be easily trained in a multi-node multi-gpu setting
104+
```bash
105+
torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
106+
torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
107+
```
108+
102109
Train a KGE model by providing the path of a single file and store all parameters under a newly created directory
103110
called `KeciFamilyRun`.
104111
```bash
105-
python -m dicee.run --path_single_kg "KGs/Family/train.txt" --model Keci --path_to_store_single_run KeciFamilyRun
112+
dicee --path_single_kg "KGs/Family/family-benchmark_rich_background.owl" --model Keci --path_to_store_single_run KeciFamilyRun --backend rdflib
106113
```
107114
where the data is in the following form
108115
```bash
@@ -114,7 +121,7 @@ _:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07
114121
**Apart from n-triples or standard link prediction dataset formats, we support ["owl", "nt", "turtle", "rdf/xml", "n3"]**.
115122
Moreover, a KGE model can also be trained by providing **an endpoint of a triple store**.
116123
```bash
117-
python -m dicee.run --sparql_endpoint "http://localhost:3030/mutagenesis/" --model Keci
124+
dicee --sparql_endpoint "http://localhost:3030/mutagenesis/" --model Keci
118125
```
119126
For more, please refer to `examples`.
120127
</details>

dicee/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
class Namespace(argparse.Namespace):
33
def __init__(self, **kwargs):
44
super().__init__(**kwargs)
5-
self.dataset_dir: str = 'KGs/UMLS'
5+
self.dataset_dir: str = None
66
"The path of a folder containing train.txt, and/or valid.txt and/or test.txt"
77

88
self.save_embeddings_as_csv: bool = False
99
"A flag for saving embeddings in csv file."
1010

11-
self.storage_path: str = 'Experiments'
11+
self.storage_path: str = "Experiments"
1212
"A directory named with time of execution under --storage_path that contains related data about embeddings."
1313

1414
self.path_to_store_single_run: str = None

dicee/knowledge_graph_embeddings.py

Lines changed: 63 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,19 @@ def eval_lp_performance(self, dataset=List[Tuple[str, str, str]], filtered=True)
4646
return evaluate_lp(model=self.model, triple_idx=idx_dataset, num_entities=len(self.entity_to_idx),
4747
er_vocab=None, re_vocab=None)
4848

49-
def predict_missing_head_entity(self, relation: List[str], tail_entity: List[str]) -> Tuple:
49+
def predict_missing_head_entity(self, relation: Union[List[str], str], tail_entity: Union[List[str], str]) -> Tuple:
5050
"""
5151
Given a relation and a tail entity, return top k ranked head entity.
5252
5353
argmax_{e \in E } f(e,r,t), where r \in R, t \in E.
5454
5555
Parameter
5656
---------
57-
relation: List[str]
57+
relation: Union[List[str], str]
5858
5959
String representation of selected relations.
6060
61-
tail_entity: List[str]
61+
tail_entity: Union[List[str], str]
6262
6363
String representation of selected entities.
6464
@@ -74,14 +74,22 @@ def predict_missing_head_entity(self, relation: List[str], tail_entity: List[str
7474
"""
7575

7676
head_entity = torch.arange(0, len(self.entity_to_idx))
77-
relation = torch.LongTensor([self.relation_to_idx[i] for i in relation])
78-
tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
77+
if isinstance(relation, list):
78+
relation = torch.LongTensor([self.relation_to_idx[i] for i in relation])
79+
else:
80+
relation = torch.LongTensor([self.relation_to_idx[relation]])
81+
if isinstance(tail_entity, list):
82+
tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
83+
else:
84+
tail_entity = torch.LongTensor([self.entity_to_idx[tail_entity]])
85+
7986
x = torch.stack((head_entity,
8087
relation.repeat(self.num_entities, ),
8188
tail_entity.repeat(self.num_entities, )), dim=1)
8289
return self.model.forward(x)
8390

84-
def predict_missing_relations(self, head_entity: List[str], tail_entity: List[str]) -> Tuple:
91+
def predict_missing_relations(self, head_entity: Union[List[str], str],
92+
tail_entity: Union[List[str], str]) -> Tuple:
8593
"""
8694
Given a head entity and a tail entity, return top k ranked relations.
8795
@@ -109,19 +117,23 @@ def predict_missing_relations(self, head_entity: List[str], tail_entity: List[st
109117
Highest K scores and entities
110118
"""
111119

112-
head_entity = torch.LongTensor([self.entity_to_idx[i] for i in head_entity])
113120
relation = torch.arange(0, len(self.relation_to_idx))
114-
tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
115121

122+
if isinstance(head_entity, list):
123+
head_entity = torch.LongTensor([self.entity_to_idx[i] for i in head_entity])
124+
else:
125+
head_entity = torch.LongTensor([self.entity_to_idx[head_entity]])
126+
if isinstance(tail_entity, list):
127+
tail_entity = torch.LongTensor([self.entity_to_idx[i] for i in tail_entity])
128+
else:
129+
tail_entity = torch.LongTensor([self.entity_to_idx[tail_entity]])
116130
x = torch.stack((head_entity.repeat(self.num_relations, ),
117131
relation,
118132
tail_entity.repeat(self.num_relations, )), dim=1)
119133
return self.model(x)
120-
# scores = self.model(x)
121-
# sort_scores, sort_idxs = torch.topk(scores, topk)
122-
# return sort_scores, [self.idx_to_relations[i] for i in sort_idxs.tolist()]
123134

124-
def predict_missing_tail_entity(self, head_entity: List[str], relation: List[str]) -> torch.FloatTensor:
135+
def predict_missing_tail_entity(self, head_entity: Union[List[str], str],
136+
relation: Union[List[str], str]) -> torch.FloatTensor:
125137
"""
126138
Given a head entity and a relation, return top k ranked entities
127139
@@ -143,21 +155,38 @@ def predict_missing_tail_entity(self, head_entity: List[str], relation: List[str
143155
144156
scores
145157
"""
146-
x = torch.cat((torch.LongTensor([self.entity_to_idx[i] for i in head_entity]).unsqueeze(-1),
147-
torch.LongTensor([self.relation_to_idx[i] for i in relation]).unsqueeze(-1)), dim=1)
158+
tail_entity = torch.arange(0, len(self.entity_to_idx))
159+
160+
if isinstance(head_entity, list):
161+
head_entity = torch.LongTensor([self.entity_to_idx[i] for i in head_entity])
162+
else:
163+
head_entity = torch.LongTensor([self.entity_to_idx[head_entity]])
164+
if isinstance(relation, list):
165+
relation = torch.LongTensor([self.relation_to_idx[i] for i in relation])
166+
else:
167+
relation = torch.LongTensor([self.relation_to_idx[relation]])
168+
169+
x = torch.stack((head_entity.repeat(self.num_entities, ),
170+
relation.repeat(self.num_entities, ),
171+
tail_entity), dim=1)
148172
return self.model.forward(x)
149173

150-
def predict(self, *, h: List[str] = None, r: List[str] = None, t: List[str] = None):
174+
def predict(self, *, h: Union[List[str], str] = None, r: Union[List[str], str] = None,
175+
t: Union[List[str], str] = None) -> torch.FloatTensor:
176+
"""
177+
Predict missing triples by means of
178+
"""
151179
# (1) Sanity checking.
152180
if h is not None:
153-
assert isinstance(h, list)
181+
assert isinstance(h, list) or isinstance(h, str)
154182
assert isinstance(h[0], str)
155183
if r is not None:
156-
assert isinstance(r, list)
184+
assert isinstance(r, list) or isinstance(r, str)
157185
assert isinstance(r[0], str)
158186
if t is not None:
159-
assert isinstance(t, list)
187+
assert isinstance(t, list) or isinstance(t, str)
160188
assert isinstance(t[0], str)
189+
161190
# (2) Predict missing head entity given a relation and a tail entity.
162191
if h is None:
163192
assert r is not None
@@ -177,7 +206,6 @@ def predict(self, *, h: List[str] = None, r: List[str] = None, t: List[str] = No
177206
# h r ?
178207
scores = self.predict_missing_tail_entity(h, r)
179208
else:
180-
assert len(h) == len(r) == len(t)
181209
scores = self.triple_score(h, r, t)
182210
return torch.sigmoid(scores)
183211

@@ -261,8 +289,8 @@ def predict_topk(self, *, h: List[str] = None, r: List[str] = None, t: List[str]
261289
else:
262290
raise AttributeError('Use triple_score method')
263291

264-
def triple_score(self, h: List[str] = None, r: List[str] = None,
265-
t: List[str] = None, logits=False) -> torch.FloatTensor:
292+
def triple_score(self, h: Union[List[str], str] = None, r: Union[List[str], str] = None,
293+
t: Union[List[str], str] = None, logits=False) -> torch.FloatTensor:
266294
"""
267295
Predict triple score
268296
@@ -289,9 +317,14 @@ def triple_score(self, h: List[str] = None, r: List[str] = None,
289317
290318
pytorch tensor of triple score
291319
"""
292-
h = torch.LongTensor([self.entity_to_idx[i] for i in h]).reshape(len(h), 1)
293-
r = torch.LongTensor([self.relation_to_idx[i] for i in r]).reshape(len(r), 1)
294-
t = torch.LongTensor([self.entity_to_idx[i] for i in t]).reshape(len(t), 1)
320+
if isinstance(h, list) and isinstance(r, list) and isinstance(t, list):
321+
h = torch.LongTensor([self.entity_to_idx[i] for i in h]).reshape(len(h), 1)
322+
r = torch.LongTensor([self.relation_to_idx[i] for i in r]).reshape(len(r), 1)
323+
t = torch.LongTensor([self.entity_to_idx[i] for i in t]).reshape(len(t), 1)
324+
else:
325+
h = torch.LongTensor([self.entity_to_idx[h]]).reshape(1, 1)
326+
r = torch.LongTensor([self.relation_to_idx[r]]).reshape(1, 1)
327+
t = torch.LongTensor([self.entity_to_idx[t]]).reshape(1, 1)
295328

296329
x = torch.hstack((h, r, t))
297330
if self.apply_semantic_constraint:
@@ -343,7 +376,8 @@ def negnorm(self, tens_1: torch.Tensor, lambda_: float, neg_norm: str = 'standar
343376
def __single_hop_query_answering(self, query: Tuple[str, Tuple[str, ...]]):
344377
head, relation = query
345378
assert len(relation) == 1
346-
return self.predict(h=[head], r=[relation[0]])
379+
# scores for all entities
380+
return self.predict(h=head, r=relation[0])
347381

348382
def __return_answers_and_scores(self, query_score_of_all_entities, k: int):
349383
query_score_of_all_entities = [(ei, s) for ei, s in zip(self.entity_to_idx.keys(), query_score_of_all_entities)]
@@ -443,10 +477,10 @@ def answer_multi_hop_query(self, query_type: str = None, query: Tuple[Union[str,
443477
tnorm=tnorm,
444478
k=k):
445479
top_k_scores1.append(score_of_e_r1_a)
446-
# () Scores for all entities E
447-
atom2_scores.append(self.predict(h=[top_k_entity], r=[relation2]))
480+
# (.) Scores for all entities E
481+
atom2_scores.append(self.predict(h=top_k_entity, r=relation2))
448482
# k by E tensor
449-
atom2_scores = torch.cat(atom2_scores, dim=0)
483+
atom2_scores = torch.vstack(atom2_scores)
450484
topk_scores1_expanded = torch.FloatTensor(top_k_scores1).view(-1, 1).repeat(1, atom2_scores.shape[1])
451485
query_scores, _ = torch.max(self.t_norm(topk_scores1_expanded, atom2_scores, tnorm), dim=0)
452486
if only_scores:
@@ -468,7 +502,7 @@ def answer_multi_hop_query(self, query_type: str = None, query: Tuple[Union[str,
468502
# () Scores for all entities E
469503
atom2_scores.append(self.predict(h=[top_k_entity], r=[relation3]))
470504
# k by E tensor
471-
atom2_scores = torch.cat(atom2_scores, dim=0)
505+
atom2_scores = torch.vstack(atom2_scores)
472506
topk_scores1_expanded = torch.FloatTensor(top_k_scores1).view(-1, 1).repeat(1, atom2_scores.shape[1])
473507
query_scores, _ = torch.max(self.t_norm(topk_scores1_expanded, atom2_scores, tnorm), dim=0)
474508
if only_scores:

dicee/query_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def achieve_answer(self, query: List[Union[str, List]],
151151
all_relation_flag = True
152152
for ele in query[-1]:
153153
# @TODO: unclear
154-
if isinstance(ele,int) or (ele == -1):
154+
if not isinstance(ele, int) or (ele == -1):
155155
all_relation_flag = False
156156
break
157157
if all_relation_flag:

dicee/run.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def get_default_arguments(description=None):
99
parser = pl.Trainer.add_argparse_args(argparse.ArgumentParser(add_help=False))
1010
# Default Trainer param https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#methods
1111
# Data related arguments
12-
parser.add_argument("--path_dataset_folder", type=str, default=None,
12+
parser.add_argument("--dataset_dir", type=str, default=None,
1313
help="The path of a folder containing train.txt, and/or valid.txt and/or test.txt"
1414
",e.g., KGs/UMLS")
1515
parser.add_argument("--sparql_endpoint", type=str, default=None,
@@ -40,15 +40,15 @@ def get_default_arguments(description=None):
4040
choices=['Adam', 'SGD'])
4141
parser.add_argument('--embedding_dim', type=int, default=32,
4242
help='Number of dimensions for an embedding vector. ')
43-
parser.add_argument("--num_epochs", type=int, default=50, help='Number of epochs for training. ')
43+
parser.add_argument("--num_epochs", type=int, default=1, help='Number of epochs for training. ')
4444
parser.add_argument('--batch_size', type=int, default=1024,
4545
help='Mini batch size. If None, automatic batch finder is applied')
4646
parser.add_argument("--lr", type=float, default=0.1)
4747
parser.add_argument('--callbacks', type=json.loads,
4848
default={},
4949
help='{"PPE":{ "last_percent_to_consider": 10}}'
5050
'"Perturb": {"level": "out", "ratio": 0.2, "method": "RN", "scaler": 0.3}')
51-
parser.add_argument("--backend", type=str, default='pandas',
51+
parser.add_argument("--backend", type=str, default="pandas",
5252
choices=["pandas", "polars", "rdflib"],
5353
help='Backend for loading, preprocessing, indexing input knowledge graph.')
5454
parser.add_argument("--trainer", type=str, default='PL',
@@ -102,6 +102,8 @@ def get_default_arguments(description=None):
102102
return parser.parse_args()
103103
return parser.parse_args(description)
104104

105+
def main():
106+
Execute(get_default_arguments()).start()
105107

106108
if __name__ == '__main__':
107-
Execute(get_default_arguments()).start()
109+
main()

dicee/static_funcs.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ def load_model(path_of_experiment_folder: str, model_name='model.pt') -> Tuple[o
7575
configs = load_json(path_of_experiment_folder + '/configuration.json')
7676
configs["num_entities"] = num_ent
7777
configs["num_relations"] = num_rel
78-
#configs["embedding_dim"] = ent_dim
7978

8079
print(f'Done! It took {time.time() - start_time:.3f}')
8180
# (4) Select the model

docs/index.rst

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,16 @@ Welcome to DICE Embeddings!
1919
.. code-block:: bash
2020
2121
// 1 CPU
22-
(dicee) $ python -m dicee.run --path_dataset_folder KGs/UMLS
22+
(dicee) $ dicee --dataset_dir KGs/UMLS
2323
// 10 CPU
24-
(dicee) $ python -m dicee.run --path_dataset_folder KGs/UMLS --num_core 10
24+
(dicee) $ dicee --dataset_dir KGs/UMLS --num_core 10
2525
// Distributed Data Parallel (DDP) with all GPUs
26-
(dicee) $ python -m dicee.run --trainer PL --accelerator gpu --strategy ddp --path_dataset_folder KGs/UMLS
26+
(dicee) $ dicee --trainer PL --accelerator gpu --strategy ddp --dataset_dir KGs/UMLS
2727
// Model Parallel with all GPUs and low precision
28-
(dicee) $ python -m dicee.run --trainer PL --accelerator gpu --strategy deepspeed_stage_3 --path_dataset_folder KGs/UMLS --precision 16
28+
(dicee) $ dicee --trainer PL --accelerator gpu --strategy deepspeed_stage_3 --dataset_dir KGs/UMLS --precision 16
2929
// DDP with all GPUs on two nodes (felis and nebula):
30-
(dicee) cdemir@felis $ torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.main --trainer torchDDP --path_dataset_folder KGs/UMLS
31-
(dicee) cdemir@nebula $ torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.main --trainer torchDDP --path_dataset_folder KGs/UMLS
30+
(dicee) cdemir@felis $ torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
31+
(dicee) cdemir@nebula $ torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS
3232
3333
.. toctree::
3434
:maxdepth: 2

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"Programming Language :: Python :: 3.9",
2828
"License :: OSI Approved :: MIT License"],
2929
python_requires='>=3.9',
30+
entry_points={"console_scripts": ["dicee = dicee.run:main"]},
3031
long_description=long_description,
3132
long_description_content_type="text/markdown",
3233
)

0 commit comments

Comments
 (0)