Skip to content

Commit

Permalink
Changes for experimenting with simulated data sets.
Browse files Browse the repository at this point in the history
  • Loading branch information
asrivast28 committed Jul 30, 2020
1 parent 4492667 commit b70b1e3
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 2 deletions.
13 changes: 12 additions & 1 deletion scripts/discretize.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,20 @@ def write_dataset(dataset, name, sep, colobs, varnames, indices):
'''
Write the dataset as a CSV file.
'''
header = False
index = False
if colobs:
dataset = dataset.T
dataset.to_csv(name, sep=sep, header=varnames, index=indices)
if indices:
header = True
if varnames:
index = True
else:
if varnames:
header = True
if indices:
index = True
dataset.to_csv(name, sep=sep, header=header, index=index)


def main():
Expand Down
10 changes: 9 additions & 1 deletion scripts/run_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,15 @@
('complete' , ('data/athaliana/athaliana_complete_discretized.tsv', 18380, 16838, ' ', True, True, True)),
])

all_datasets = OrderedDict(list(small_datasets.items()) + list(big_datasets.items()))
simulated_datasets = OrderedDict([
#(name, (-f, -n, -m, -s, -c, -v, -i)),
('s1' , ('data/simulated/n30000_p0.00005_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
('s2' , ('data/simulated/n30000_p0.0001_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
('s3' , ('data/simulated/n30000_p0.0005_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
('s4' , ('data/simulated/n30000_p0.001_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
])

all_datasets = OrderedDict(list(small_datasets.items()) + list(big_datasets.items()) + list(simulated_datasets.items()))

all_algorithms = [
'gs',
Expand Down
89 changes: 89 additions & 0 deletions scripts/simulate_bn.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env Rscript

##
# @file simulate_bn.R
# @brief Script for simulating a Bayesian network using pcalg
# @author Ankit Srivastava <[email protected]>
#
# Copyright 2020 Georgia Institute of Technology
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

library('pcalg')
library('optparse')


# Command-line arguments. If a character vector named `argv` already exists
# (e.g. because a caller defined it before source()-ing this script), it is
# used as-is; otherwise the arguments are taken from the real command line.
if (!exists('argv')) {
  # Spell out `trailingOnly` instead of relying on partial argument matching.
  argv <- commandArgs(trailingOnly=TRUE)
}

# Options understood by the script. --nvars is needed to generate the network;
# --bn, --nobs and --datafile have no defaults and enable the optional output
# steps further down when supplied.
parser <- OptionParser()
parser <- add_option(parser, c('--seed'), type='integer', help='PRNG seed.')
parser <- add_option(parser, c('--nvars', '-n'), type='integer', help='Number of variables in the simulated network.')
# The value is forwarded unchanged as the `prob` argument of pcalg::randomDAG.
parser <- add_option(parser, c('--prob', '-p'), type='double', default=0.05, help='Probability of connecting a node to another node with higher topological ordering.')
parser <- add_option(parser, c('--mbsize'), action='store_true', default=FALSE, help='Print out the average MB size.')
parser <- add_option(parser, c('--bn', '-b'), type='character', help='Name of the dot file to which the network is written.')
parser <- add_option(parser, c('--nobs', '-m'), type='integer', help='Number of observations in the simulated dataset.')
parser <- add_option(parser, c('--datafile', '-d'), type='character', help='Name of the file to which dataset is written.')
parser <- add_option(parser, c('--colobs', '-c'), action='store_true', default=FALSE, help='The file contains observations in columns.')
parser <- add_option(parser, c('--sep', '-s'), type='character', default=' ', help='Delimiting character in the dataset file.')
args <- parse_args(parser, args=argv)


# Seed the PRNG only when --seed was supplied, so that runs without the option
# keep R's default (non-fixed) seeding.
if (!is.null(args[['seed']])) {
  set.seed(args[['seed']])
}

# --nvars has no default; fail early with a clear message instead of an
# obscure error from the sequence/graph generation below.
if (is.null(args$nvars) || is.na(args$nvars) || args$nvars < 1) {
  stop('The number of variables (--nvars) must be a positive integer.', call.=FALSE)
}

tGenerate <- proc.time()
# Node labels V1, V2, ..., Vn built in a single vectorized call; growing the
# vector with c() inside a loop is quadratic in the number of variables.
nodes <- paste0('V', seq_len(args$nvars))
# Graph of type graphNEL
dag <- randomDAG(args$nvars, prob=args$prob, V=nodes)
show(dag)
tGenerate <- proc.time() - tGenerate
cat('Time taken in generating the network:', tGenerate['elapsed'], 'sec\n')

# Both the Markov-blanket report (--mbsize) and the dot output (--bn) operate
# on a bnlearn network, so bnlearn is loaded when either one is requested.
# Previously `bn` was only created inside the --mbsize branch, which made
# --bn fail with an undefined object whenever --mbsize was not also given.
if (args$mbsize || !is.null(args$bn)) {
  cat('Using bnlearn from', find.package('bnlearn'), '\n')
  library('bnlearn')
}

if (args$mbsize) {
  tMBSize <- proc.time()
  # Convert to BN
  bn <- as.bn(dag)
  # vapply pins the per-node result to a single integer (the MB length).
  avgmb <- mean(vapply(nodes, function(n) { length(bn$nodes[[n]]$mb) }, integer(1)))
  cat('Average MB size is', avgmb, '\n')
  tMBSize <- proc.time() - tMBSize
  cat('Time taken in getting the MB sizes:', tMBSize['elapsed'], 'sec\n')
}

if (!is.null(args$bn)) {
  if (!args$mbsize) {
    # The conversion has not happened yet; do it here, outside the write timer.
    bn <- as.bn(dag)
  }
  tWrite <- proc.time()
  write.dot(args$bn, bn)
  tWrite <- proc.time() - tWrite
  cat('Time taken in writing the network:', tWrite['elapsed'], 'sec\n')
}

# Simulate a dataset from the generated DAG and write it out; runs only when
# --nobs is supplied.
if (!is.null(args$nobs)) {
  # --datafile has no default. Check before the (potentially long) simulation
  # rather than failing inside write.table afterwards.
  if (is.null(args$datafile)) {
    stop('A dataset file (--datafile) is required when --nobs is given.', call.=FALSE)
  }
  tData <- proc.time()
  # Per the pcalg documentation, rmvDAG returns an n x p matrix (one row per
  # observation) whose columns are named after the nodes when
  # use.node.names=TRUE.
  data <- rmvDAG(args$nobs, dag, use.node.names=TRUE)
  if (!args$colobs) {
    # Without --colobs each variable is written as a row instead.
    data <- t(data)
  }
  tData <- proc.time() - tData
  cat('Time taken in getting the data:', tData['elapsed'], 'sec\n')
  tWrite <- proc.time()
  # The variable names end up as the header row with --colobs, and as the
  # leading row labels without it; the other dimension is written unlabelled.
  write.table(data, file=args$datafile, sep=args$sep, row.names=!args$colobs, col.names=args$colobs)
  tWrite <- proc.time() - tWrite
  cat('Time taken in writing the dataset:', tWrite['elapsed'], 'sec\n')
}

0 comments on commit b70b1e3

Please sign in to comment.