
Commit

Merge branch 'develop' of https://github.com/dice-group/dice-embeddings into kfold-crossvalidation-verbose
sshivam95 committed Nov 29, 2024
2 parents 27d9dd6 + fb436e6 commit 85f2383
Showing 10 changed files with 362 additions and 197 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -1,3 +1,4 @@
[![Downloads](https://static.pepy.tech/badge/dicee)](https://pepy.tech/project/dicee)
[![Downloads](https://img.shields.io/pypi/dm/dicee)](https://pypi.org/project/dicee/)
[![Coverage](https://img.shields.io/badge/coverage-54%25-green)](https://dice-group.github.io/dice-embeddings/usage/main.html#coverage-report)
[![Pypi](https://img.shields.io/badge/pypi-0.1.4-blue)](https://pypi.org/project/dicee/0.1.4/)
4 changes: 3 additions & 1 deletion dicee/config.py
@@ -85,7 +85,6 @@ def __init__(self, **kwargs):

self.label_smoothing_rate: float = 0.0


self.num_core: int = 0
"""Number of CPUs to be used in the mini-batch loading process"""

@@ -139,6 +138,9 @@ def __init__(self, **kwargs):
self.continual_learning=None
"Path of a pretrained model size of LLM"

self.auto_batch_finding=False
"A flag for using auto batch finding"

def __iter__(self):
# Iterate
for k, v in self.__dict__.items():
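The new auto_batch_finding flag defaults to False. As a rough illustration of what automatic batch-size finding typically does, the sketch below doubles the batch size until the GPU runs out of memory and keeps the last size that fit; the helper name and probing strategy are assumptions for illustration, not code from this commit.

import torch

def find_max_batch_size(model, make_batch, start=2, limit=2 ** 16):
    # Hypothetical helper (not part of this commit): double the batch size
    # until a CUDA out-of-memory error occurs, then return the last size that fit.
    best = start
    batch_size = start
    while batch_size <= limit:
        try:
            with torch.no_grad():
                model(make_batch(batch_size))  # forward pass only, to probe memory
            best = batch_size
            batch_size *= 2
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            break
    return best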
31 changes: 22 additions & 9 deletions dicee/models/ensemble.py
@@ -1,25 +1,20 @@
import torch
import copy

import torch._dynamo

torch._dynamo.config.suppress_errors = True


class EnsembleKGE:
def __init__(self, seed_model):
self.models = []
self.optimizers = []
self.loss_history = []
for i in range(torch.cuda.device_count()):
i_model=copy.deepcopy(seed_model)
i_model.to(torch.device(f"cuda:{i}"))
# TODO: Why we cant send the compile model to cpu ?
# i_model = torch.compile(i_model)
#i_model = torch.compile(i_model)
i_model.to(torch.device(f"cuda:{i}"))
self.optimizers.append(i_model.configure_optimizers())
self.models.append(i_model)
# Maybe use the original model's name ?
self.name="TP_"+self.models[0].name
self.name=self.models[0].name
self.train_mode=True

def named_children(self):
@@ -87,7 +82,25 @@ def __call__(self,x_batch):
def step(self):
for opt in self.optimizers:
opt.step()


def get_embeddings(self):
entity_embeddings=[]
relation_embeddings=[]
# () Iterate
for trained_model in self.models:
entity_emb, relation_ebm = trained_model.get_embeddings()
entity_embeddings.append(entity_emb)
if relation_ebm is not None:
relation_embeddings.append(relation_ebm)
# () Concat the embedding vectors horizontally.
entity_embeddings=torch.cat(entity_embeddings,dim=1)
if relation_embeddings:
relation_embeddings=torch.cat(relation_embeddings,dim=1)
else:
relation_embeddings=None

return entity_embeddings, relation_embeddings

"""
def __getattr__(self, name):
# Create a function that will call the same attribute/method on each model
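The new get_embeddings method gathers the entity and relation matrices from every replica and concatenates them along the embedding dimension, so N replicas with embedding size d yield vectors of size N*d. A self-contained sketch of that concatenation with toy tensors (not actual model output):

import torch

# Two replicas, each embedding 5 entities and 3 relations in 4 dimensions.
entity_shards = [torch.randn(5, 4), torch.randn(5, 4)]
relation_shards = [torch.randn(3, 4), torch.randn(3, 4)]

# dim=1 keeps one row per entity/relation and widens each vector to 2 * 4 = 8 columns.
entity_embeddings = torch.cat(entity_shards, dim=1)
relation_embeddings = torch.cat(relation_shards, dim=1)
assert entity_embeddings.shape == (5, 8)
assert relation_embeddings.shape == (3, 8)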
20 changes: 10 additions & 10 deletions dicee/sanity_checkers.py
@@ -32,11 +32,11 @@ def validate_knowledge_graph(args):

elif args.path_single_kg is not None:
if args.sparql_endpoint is not None or args.path_single_kg is not None:
print(f'The dataset_dir and sparql_endpoint arguments '
f'must be None if path_single_kg is given.'
f'***{args.dataset_dir}***\n'
f'***{args.sparql_endpoint}***\n'
f'These two parameters are set to None.')
#print(f'The dataset_dir and sparql_endpoint arguments '
# f'must be None if path_single_kg is given.'
# f'***{args.dataset_dir}***\n'
# f'***{args.sparql_endpoint}***\n'
# f'These two parameters are set to None.')
args.dataset_dir = None
args.sparql_endpoint = None

@@ -61,11 +61,11 @@ def validate_knowledge_graph(args):
f"Use --path_single_kg **folder/dataset.format**, if you have a single file.")

if args.sparql_endpoint is not None or args.path_single_kg is not None:
print(f'The sparql_endpoint and path_single_kg arguments '
f'must be None if dataset_dir is given.'
f'***{args.sparql_endpoint}***\n'
f'***{args.path_single_kg}***\n'
f'These two parameters are set to None.')
#print(f'The sparql_endpoint and path_single_kg arguments '
# f'must be None if dataset_dir is given.'
# f'***{args.sparql_endpoint}***\n'
# f'***{args.path_single_kg}***\n'
# f'These two parameters are set to None.')
args.sparql_endpoint = None
args.path_single_kg = None

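The commented-out prints silence the warnings but keep the behaviour: when path_single_kg is given, dataset_dir and sparql_endpoint are reset to None, and when dataset_dir is given, the other two are reset instead. A condensed sketch of that precedence rule, using a plain namespace and placeholder paths in place of the parsed arguments:

from types import SimpleNamespace

def resolve_input_source(args):
    # Condensed restatement of the precedence applied in validate_knowledge_graph.
    if args.path_single_kg is not None:
        args.dataset_dir = None
        args.sparql_endpoint = None
    elif args.dataset_dir is not None:
        args.sparql_endpoint = None
        args.path_single_kg = None
    return args

args = resolve_input_source(SimpleNamespace(
    dataset_dir="KGs/UMLS", path_single_kg="KGs/UMLS/train.txt", sparql_endpoint=None))
assert args.dataset_dir is None and args.path_single_kg == "KGs/UMLS/train.txt"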
3 changes: 3 additions & 0 deletions dicee/scripts/run.py
@@ -123,6 +123,9 @@ def get_default_arguments(description=None):
parser.add_argument("--swa",
action="store_true",
help="Stochastic weight averaging")
parser.add_argument("--auto_batch_finding",
action="store_true",
help="Find a batch size fitting in GPUs. Only available for TP trainer")
parser.add_argument('--degree', type=int, default=0,
help='degree for polynomial embeddings')
parser.add_argument('--disable_checkpointing', action='store_true', help='Disable creation of checkpoints during training')
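Since the new argument uses action="store_true", auto_batch_finding is False unless the flag is passed on the command line. A minimal, standalone argparse sketch (not the project's full parser) showing that behaviour:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--auto_batch_finding", action="store_true",
                    help="Find a batch size fitting in GPUs. Only available for TP trainer")

assert parser.parse_args([]).auto_batch_finding is False
assert parser.parse_args(["--auto_batch_finding"]).auto_batch_finding is True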
86 changes: 35 additions & 51 deletions dicee/static_funcs.py
@@ -684,16 +684,15 @@ def download_pretrained_model(url: str) -> str:
download_files_from_url(url_to_download_from, destination_folder=dir_name)
return dir_name

def write_csv_from_model_parallel(path: str) -> None:
def write_csv_from_model_parallel(path: str) :
"""Create"""
assert os.path.exists(path), "Path does not exist"

# Detect files that start with model_ and end with .pt
model_files = [f for f in os.listdir(path) if f.startswith("model_") and f.endswith(".pt")]
model_files.sort() # Sort to maintain order if necessary (e.g., model_0.pt, model_1.pt)

entity_csv_path = os.path.join(path, "entity_embeddings.csv")
relation_csv_path = os.path.join(path, "relation_embeddings.csv")
entity_embeddings=[]
relation_embeddings=[]

# Process each model file
for model_file in model_files:
@@ -702,65 +701,50 @@ def write_csv_from_model_parallel(path: str) -> None:
model = torch.load(model_path)
# Assuming model has a get_embeddings method
entity_emb, relation_emb = model["_orig_mod.entity_embeddings.weight"], model["_orig_mod.relation_embeddings.weight"]
# Convert to numpy
entity_emb = entity_emb.numpy()
relation_emb = relation_emb.numpy()

# Write or append to CSV
if not os.path.exists(entity_csv_path) or not os.path.exists(relation_csv_path):
# If CSV files do not exist, create them
pd.DataFrame(entity_emb).to_csv(entity_csv_path, index=True, header=False)
pd.DataFrame(relation_emb).to_csv(relation_csv_path, index=True, header=False)
else:
# If CSV files exist, concatenate to the existing rows
existing_entity_df = pd.read_csv(entity_csv_path, header=None)
existing_relation_df = pd.read_csv(relation_csv_path, header=None)
entity_embeddings.append(entity_emb)
relation_embeddings.append(relation_emb)

# Concatenate along the columns (axis=1)
new_entity_df = pd.concat([existing_entity_df, pd.DataFrame(entity_emb)], axis=1)
new_relation_df = pd.concat([existing_relation_df, pd.DataFrame(relation_emb)], axis=1)
return torch.cat(entity_embeddings, dim=1), torch.cat(relation_embeddings, dim=1)

# Write the updated data back to the CSV files
new_entity_df.to_csv(entity_csv_path, index=False, header=False)
new_relation_df.to_csv(relation_csv_path, index=False, header=False)

def from_pretrained_model_write_embeddings_into_csv(path: str) -> None:
""" """
assert os.path.exists(path), "Path does not exist"
config = load_json(path + '/configuration.json')
if config["trainer"]=="MP":
write_csv_from_model_parallel(path)
entity_csv_path = os.path.join(path, f"{config['model']}_entity_embeddings.csv")
relation_csv_path = os.path.join(path, f"{config['model']}_relation_embeddings.csv")

if config["trainer"]=="TP":
entity_emb, relation_emb = write_csv_from_model_parallel(path)
else:
entity_csv_path = os.path.join(path, f"{config['model']}_entity_embeddings.csv")
relation_csv_path = os.path.join(path, f"{config['model']}_relation_embeddings.csv")
# Load model
model = torch.load(os.path.join(path, "model.pt"))
# Assuming model has a get_embeddings method
entity_emb, relation_emb = model["entity_embeddings.weight"], model["relation_embeddings.weight"]
str_entity = pd.read_csv(f"{path}/entity_to_idx.csv", index_col=0)["entity"]
assert str_entity.index.is_monotonic_increasing
str_entity=str_entity.to_list()
# Write entity embeddings with headers and indices
with open(entity_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(entity_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name,row) in enumerate(zip(str_entity,entity_emb)):
writer.writerow([name] + row.tolist())
str_relations = pd.read_csv(f"{path}/relation_to_idx.csv", index_col=0)["relation"]
assert str_relations.index.is_monotonic_increasing

# Write relation embeddings with headers and indices
with open(relation_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(relation_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name, row) in enumerate(zip(str_relations,relation_emb)):
writer.writerow([name]+ row.tolist())
str_entity = pd.read_csv(f"{path}/entity_to_idx.csv", index_col=0)["entity"]
assert str_entity.index.is_monotonic_increasing
str_entity=str_entity.to_list()
# Write entity embeddings with headers and indices
with open(entity_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(entity_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name,row) in enumerate(zip(str_entity,entity_emb)):
writer.writerow([name] + row.tolist())
str_relations = pd.read_csv(f"{path}/relation_to_idx.csv", index_col=0)["relation"]
assert str_relations.index.is_monotonic_increasing

# Write relation embeddings with headers and indices
with open(relation_csv_path, "w", newline="") as f:
writer = csv.writer(f)
# Add header (e.g., "", "0", "1", ..., "N")
headers = [""] + [f"{i}" for i in range(relation_emb.size(1))]
writer.writerow(headers)
# Add rows with index
for i_row, (name, row) in enumerate(zip(str_relations,relation_emb)):
writer.writerow([name]+ row.tolist())

"""
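After this refactor, write_csv_from_model_parallel returns the concatenated tensors and the CSV writing happens in from_pretrained_model_write_embeddings_into_csv, with the entity or relation label in the first column and one numbered column per embedding dimension. A small sketch reproducing that row format with the csv module (toy names and random tensors, not output from a trained model):

import csv
import torch

entity_names = ["Berlin", "Germany"]   # placeholder labels
entity_emb = torch.randn(2, 4)         # two entities, embedding dimension 4

with open("entity_embeddings_example.csv", "w", newline="") as f:
    writer = csv.writer(f)
    # Header row: empty cell for the label column, then "0", "1", ..., "d-1".
    writer.writerow([""] + [str(i) for i in range(entity_emb.size(1))])
    for name, row in zip(entity_names, entity_emb):
        writer.writerow([name] + row.tolist())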