add the facebook dataloader and additions in overlapping graph problem solving
aniket-agarwal1999 committed Dec 2, 2019
1 parent d2706d2 commit e60441c
Showing 4 changed files with 159 additions and 40 deletions.
24 changes: 23 additions & 1 deletion data.py
@@ -1,8 +1,10 @@
import torch
import torch_geometric as pyg
from torch_geometric.data import Dataset
from torch_geometric.data import Data
import torch_geometric.datasets as datasets
import pandas as pd
import re

def get_cora():
dataset = datasets.Planetoid(root='./dataset/Cora', name='Cora')
@@ -21,4 +23,24 @@ def get_facebook(code):
edge_file = './dataset/Facebook/'+str(code)+'.edges'
label_file = './dataset/Facebook/'+str(code)+'.circles' ### The .circles file contains the ground-truth community assignments

starting_node = []
ending_node = []

with open(edge_file, 'r') as f:
for line in f:
li = line.strip().split()
starting_node.append(int(li[0]))
ending_node.append(int(li[1]))

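## Build the [2, num_edges] long tensor in the COO layout that torch_geometric expects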
edge_index = torch.zeros([2, len(starting_node)])
edge_index[0, :] = torch.tensor(starting_node)
edge_index[1, :] = torch.tensor(ending_node)
edge_index = edge_index.long()

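## Each line of the .circles file lists a circle name followed by its member node ids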
communities = []
with open(label_file, 'r') as f:
for line in f:
nodes = re.split(' |\t', line.strip())[1:]
communities.append([x for x in nodes])

return edge_index, communities
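
For reference, a minimal way to exercise the new loader (assuming the Facebook ego-network files, e.g. 0.edges and 0.circles, have been placed under ./dataset/Facebook/):

from data import get_facebook

## Ego-network 0: edge_index is a [2, num_edges] LongTensor; communities is a
## list of circles, each a list of node-id strings read from 0.circles.
edge_index, communities = get_facebook(0)
print(edge_index.shape, len(communities))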
8 changes: 3 additions & 5 deletions train_nonoverlapping.py
@@ -21,8 +21,6 @@
parser.add_argument('--tensorboard_dir', type=str, default='./tensorboard_curves')




if __name__ == '__main__':
args = parser.parse_args()

@@ -100,9 +98,9 @@

if epoch % 100 == 0:
lr_scheduler.step()
modularity, macro_F1, micro_F1 = utils.calculate_nonoverlap_losses(model, dataset, edge_index)
f = open(args.dataset + '_results.txt', 'a+')
f.write('Epoch :', epoch, ' modularity: ', modularity, ' macro_F1: ', macro_F1, ' micro_F1: ', micro_F1, ' \n')
# modularity, macro_F1, micro_F1 = utils.calculate_nonoverlap_losses(model, dataset, edge_index)
# f = open(args.dataset + '_results.txt', 'a+')
# f.write('Epoch :', epoch, ' modularity: ', modularity, ' macro_F1: ', macro_F1, ' micro_F1: ', micro_F1, ' \n')


writer_tensorboard.add_scalars('Total Loss', {'vgraph_loss':vgraph_loss, 'regularization_loss':regularization_loss}, epoch)
110 changes: 110 additions & 0 deletions train_overlapping.py
@@ -0,0 +1,110 @@
import torch
import torch_geometric as pyg
import torch.nn as nn
from model import Model
import argparse
from data import *
import os
import utils
import re
from tensorboardX import SummaryWriter

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=5000)
parser.add_argument('--lr', type=float, default=0.05)
parser.add_argument('--negative_sample', type=bool, default=False) ## If we want to use negative sampling or simply use softmax
parser.add_argument('--decay_epoch', type=int, default=100)
parser.add_argument('--lamda', type=float, default=100.0) ## For the smoothness trick
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints')
parser.add_argument('--embedding_dim', type=int, default=128)
parser.add_argument('--dataset', type=str, default='facebook0') ### Choices are the various ego-network subgraphs of the Facebook dataset (e.g. facebook0)
parser.add_argument('--gpu_id', type=str, default='0')
parser.add_argument('--tensorboard_dir', type=str, default='./tensorboard_curves')
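## Example invocation (illustrative; assumes the ego-network files are under ./dataset/Facebook/):
##   python train_overlapping.py --dataset facebook0 --epochs 5000 --lamda 100.0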


if __name__ == '__main__':
args = parser.parse_args()

embedding_dim = args.embedding_dim

facebook_code = int(re.split('facebook', args.dataset)[1])
edge_index, communities = get_facebook(facebook_code)

## For defining the model
size = edge_index.shape[1]
categories = len(communities)

edge_index = utils.cuda(edge_index, args.gpu_id)

## For visualization of loss curves
if not os.path.isdir(args.tensorboard_dir):
os.makedirs(args.tensorboard_dir)
writer_tensorboard = SummaryWriter(args.tensorboard_dir + '/latest_model_'+args.dataset)

## Model producing the node and community embeddings
model = Model(size=size, categories=categories, embedding_dim=embedding_dim, negative_sample=args.negative_sample)
model = utils.cuda(model, args.gpu_id)

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

### For annealing the learning rate
lambda1 = lambda step: 0.99**step   ## LambdaLR passes the scheduler step count (not the current lr) and multiplies the base lr by the returned factor
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

if not os.path.isdir(args.checkpoint_dir):
os.makedirs(args.checkpoint_dir)

try:
ckpt = utils.load_checkpoint(args.checkpoint_dir + '/latest_model_' + args.dataset)
start_epoch = ckpt['epoch']
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
except:
print(' [*] No checkpoint!')
start_epoch = 0

for epoch in range(start_epoch, args.epochs):

optimizer.zero_grad()
model.train()

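## w and c list every edge in both directions: w holds the source node and c the linked node,
## so each (w_i, c_i) pair below is one orientation of an undirected edge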
w = torch.cat((edge_index[0, :], edge_index[1, :]))
c = torch.cat((edge_index[1, :], edge_index[0, :]))

prior, recon_c, q = model(w, c, edge_index)

### vGraph loss
vgraph_loss = utils.vGraph_loss(c, recon_c, prior, q)

### Now we will enforce community-smoothness regularization
### So we need d(p(z|c), p(z|w)), where p(z|w)=prior and p(z|c) can be easily calculated from this
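### Since w and c are the same edge list with its two halves swapped, swapping the two halves of
### prior (= p(z|w)) row-wise gives p(z|c) aligned edge-by-edge with p(z|w)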
prior_c = torch.cat((prior[prior.shape[0]//2:, :], prior[0:prior.shape[0]//2, :]))

d = (prior_c - prior)**2
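### alpha weights each edge term; in the vGraph paper this is the Jaccard coefficient of the two
### endpoints' neighbour sets (assumed to be what utils.similarity_measure computes)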
alpha = utils.similarity_measure(edge_index, w, c, args.gpu_id)

regularization_loss = alpha*d
regularization_loss = regularization_loss.mean()

total_loss = vgraph_loss + args.lamda*regularization_loss

total_loss.backward()
optimizer.step()

print('Epoch: ', epoch+1, ' done!!')
print('Total error: ', total_loss)

if epoch % 100 == 0:
lr_scheduler.step()

writer_tensorboard.add_scalars('Total Loss', {'vgraph_loss':vgraph_loss, 'regularization_loss':regularization_loss}, epoch)

### Saving the checkpoint
utils.save_checkpoint({'epoch':epoch+1,
'model':model.state_dict(),
'optimizer':optimizer.state_dict()},
args.checkpoint_dir + '/latest_model_'+args.dataset+'.ckpt')


writer_tensorboard.close()
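
For context, utils.vGraph_loss (not shown in this diff) corresponds to the vGraph ELBO. A minimal sketch, assuming recon_c holds logits over nodes for p(c|z) and prior/q are per-edge distributions over communities (the repository's actual implementation may differ):

import torch
import torch.nn.functional as F

def vgraph_loss_sketch(c, recon_c, prior, q):
    ## Reconstruction term: negative log-likelihood of the true linked node c
    ## under the decoded distribution p(c|z), with recon_c taken as logits.
    recon_loss = F.cross_entropy(recon_c, c)
    ## KL(q(z|w,c) || p(z|w)) between the edge posterior and the node prior over communities.
    kl = (q * (torch.log(q + 1e-20) - torch.log(prior + 1e-20))).sum(dim=1).mean()
    return recon_loss + kl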

57 changes: 23 additions & 34 deletions utils.py
@@ -64,41 +64,30 @@ def cuda(xs, gpu_id):
return xs


def calculate_nonoverlap_losses(model, dataset, edge_index):
'''
For calculating losses pertaining to the non-overlapping dataset, namely, Macro F1, Micro F1, Modularity, NMI
'''
model.eval()
labels = dataset.y
w = edge_index[0, :]
c = edge_index[1, :]
_, _, q = model(w, c, edge_index)

new_labels = torch.zeros(w.shape[0], 1)
for i in range(w.shape[0]):
new_labels[i] = labels[w[i]]
### THIS SECTION IS STILL BEING EXTENDED AND CONTAINS THE ACCURACY MEASURES FOR THE OVERLAPPING AND NON-OVERLAPPING SUBPROBLEMS
# def calculate_nonoverlap_losses(model, dataset, edge_index):
# '''
# For calculating losses pertaining to the non-overlapping dataset, namely, Macro F1, Micro F1, Modularity, NMI
# '''
# model.eval()
# labels = dataset.y
# w = edge_index[0, :]
# c = edge_index[1, :]
# _, _, q = model(w, c, edge_index)

# new_labels = torch.zeros(w.shape[0], 1)
# for i in range(w.shape[0]):
# new_labels[i] = labels[w[i]]

kmeans = KMeans(n_clusters=torch.unique(labels).shape[0], random_state=0).fit(q)

###For calculating modularity
assignment = {i: int(kmeans.labels_[i]) for i in range(q.shape[0])}
networkx_graph = pyg.utils.to_networkx(dataset)
modularity = community.modularity(assignment, dataset)
# kmeans = KMeans(n_clusters=torch.unique(labels).shape[0], random_state=0).fit(q.detach().cpu().numpy())

###For calculating macro and micro F1 score
macro_F1 = metrics.f1_score(new_labels.numpy(), kmeans.labels_, average='macro')
micro_F1 = metrics.f1_score(new_labels.numpy(), kmeans.labels_, average='micro')
# ###For calculating modularity
# assignment = {i: int(kmeans.labels_[i]) for i in range(q.shape[0])}
# networkx_graph = pyg.utils.to_networkx(dataset)
# modularity = community.modularity(assignment, networkx_graph)

return modularity, macro_F1, micro_F1
# ###For calculating macro and micro F1 score
# macro_F1 = metrics.f1_score(new_labels.numpy(), kmeans.labels_, average='macro')
# micro_F1 = metrics.f1_score(new_labels.numpy(), kmeans.labels_, average='micro')

def calculate_jaccard():
'''
## This is for the overlapping case
'''



def calculate_f1():
'''
## This is for the overlapping case
'''
# return modularity, macro_F1, micro_F1
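
The calculate_jaccard / calculate_f1 stubs above anticipate the standard overlapping-community metrics. A minimal sketch, assuming ground-truth and detected communities are given as lists of node-id collections (the function and variable names here are illustrative, not part of the repository):

def jaccard(a, b):
    ## Jaccard similarity between two node sets
    return len(a & b) / len(a | b) if (a | b) else 0.0

def f1(a, b):
    ## F1 score treating one set as prediction and the other as ground truth
    inter = len(a & b)
    if inter == 0:
        return 0.0
    precision, recall = inter / len(a), inter / len(b)
    return 2 * precision * recall / (precision + recall)

def best_match_score(pred_comms, true_comms, score):
    ## Average, over ground-truth communities, of the best score achieved by any
    ## detected community (one common way to report overlapping Jaccard/F1)
    total = 0.0
    for t in true_comms:
        total += max(score(set(p), set(t)) for p in pred_comms)
    return total / len(true_comms)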
