-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_preproc_descr.R
308 lines (253 loc) · 13.1 KB
/
1_preproc_descr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# METADATA
########################################################################################################
# Project: DISSINET / BolIncr / whole network approach / ERGM
# Related manuscript : Incriminations in the inquisition register of Bologna (1291–1310)
# Authors of the related manuscript : David Zbíral; Katia Riccardo; Tomáš Hampejs; Zoltán Brys
#
# OUTLIER REMOVAL AND DESCRIPTIVES
#
# Authors of the R-Code: Zoltán Brys and David Zbíral
#
# Description: this R code
# 1 prepares the environment
# 2 reads and checks input tables
# 3 defines descriptive table function and Jaccard calculation function
# 4 remove outliers
# 5 generate Table 2.
# 6 generate Supporting Information Table 1.
# 7 saves cleaned TSVs
########################################################################################################
# CODING CONVENTION
########################################################################################################
#only use the absolutely neccessary packages
#variable naming:
# at_ denotes attribute/feature/term description vectors
# fn_ denotes file names (fn_inp_ is input file names, fn_out_ is output file names)
# df_ denotes data frames
# cx_ denotes unique cycle var (x = x + 1)
########################################################################################################
# 1 ENVIRONMENT PACKAGES INPUT/OUTPUT FILENAMES
########################################################################################################
#environment
rm(list = ls()) #deleting the memory
if (as.numeric(gsub(".*:(\\s*)(\\d+)(\\s+)\\d+.*", "\\2", (system("free -m", intern = TRUE)[2])))<4096)
stop("Memory is likely not will be enough for ERGM!") #checking free memory
if (!("stats" %in% (.packages()) )) stop("R Environment is not fully loaded!")
#libraries
#none
#input filenames (fn_)
fn_inp_incr_nodes <- paste0(getwd(), "/data/df_nodes.tsv")
fn_inp_incr_edges <- paste0(getwd(), "/data/df_edges.tsv")
#out filenames (fn_)
fn_out_incr_nodes <- paste0(getwd(), "/data/df_cleaned_nodes.tsv")
fn_out_incr_edges <- paste0(getwd(), "/data/df_cleaned_edges.tsv")
#environment prepared, filenames (varaibles starting with fn_) set.
########################################################################################################
# 2 LOAD AND CHECK INPUT DATA
########################################################################################################
#reading
df_incr_nodes <- read.delim(fn_inp_incr_nodes, sep="\t", header=TRUE, fileEncoding = "UTF-8")
df_incr_edges <- read.delim(fn_inp_incr_edges, sep="\t", header=TRUE, fileEncoding = "UTF-8")
#check if the reading was OK
if (!exists("df_incr_nodes")) stop("Input data table of nodes is not loaded!")
if (!exists("df_incr_edges")) stop("Input data table of edges is not loaded!")
#check if both are a data frame
if (!class(df_incr_nodes) == "data.frame") stop("Input data table of nodes is not a data frame!")
if (!class(df_incr_edges) == "data.frame") stop("Input data table of edges is not a data frame!")
#check if there are loops
if (dim(subset(df_incr_edges, from == to))[1] != 0) stop("There are self-loops!")
#check if there are multiple edges
if (!identical(unique(df_incr_edges), df_incr_edges)) stop("There are multiple edges!")
#check if there are nodes defined multiple times
if (!identical(unique(df_incr_nodes$name), df_incr_nodes$name)) stop("Attributes of one node defined multiple times")
#brief check of NAs
if (sum(is.na.data.frame(df_incr_edges))>0) stop ("Edge list contains NA(s)")
if (sum(is.na.data.frame(df_incr_nodes))>0) stop ("Node list contains NA(s)")
#input data loaded and checked
########################################################################################################
# 3 FUNCTIONS
########################################################################################################
#two functions are defined: descriptive tables and jaccard matrix
# this function create a descriptive table of categorical variables
# and calculates binary variance
# and based on a treshold calculates to include/exclude that variable
# inputs:
# data_frame = the data frame
# include = the column names to analyze
# binaryvarmax = decision threshold based on binary variables (max)
# pv = passing descrp string variable
# if binaryvarmax < 0 , than function does not calculate it
descriptives_df <- function(data_frame, include = names(data_frame) , binaryvarmax = 12, pv = "")
{
#check the input
if (!class(data_frame) == "data.frame") stop("Input data table is not a data frame!")
if (is.null(include)) stop("No column names defined")
if (sum(is.na.data.frame(data_frame))>0) stop("NAs in the data frame")
if (sum(!(include %in% names(data_frame)))>0) stop ("One include variable name is not a column name")
#subset int columns and factorize them
data_frame <- subset(data_frame, select = include)
data_frame <- as.data.frame(lapply(data_frame, factor))
#create an empty output table
descriptive_table <- data.frame(
variable = character(0),
category = character(0),
frequency = numeric(0),
percentage = numeric(0),
descr = character(0)
)
#cycle for each column
for (col in names(data_frame))
{
freq_table <- table(data_frame[[col]])
percentages <- prop.table(freq_table) * 100
variable_table <- data.frame(
variable = col,
category = names(freq_table),
frequency = as.vector(freq_table),
percentage = as.vector(percentages)
)
descriptive_table <- rbind(descriptive_table, variable_table)
} #col cycle
#adding variance and exluded
#if binarymax is -1 or lower, then do not calculate
if (binaryvarmax > -1)
{
n1 <- dim(data_frame)[1]
descriptive_table$binvar <- n1 * (descriptive_table$percentage/100) * (1 - (descriptive_table$percentage/100))
descriptive_table$binvar_include <- (descriptive_table$binvar > binaryvarmax)
}
#adding pv descr.
descriptive_table$desc <- rep(pv, dim(descriptive_table)[1] )
return(descriptive_table)
}
#end of decriptive table function
#subfunction Jaccard calulactions
jaccardc <- function (x, y)
{
if (anyNA(x)) stop("NAs in x")
if (anyNA(y)) stop("NAs in y")
if (length(x) != length(x)) stop("Lenghts are non-equal!")
c11 = sum(x == 1 & y == 1)
c10 = sum(x == 1 & y == 0)
c01 = sum(x == 0 & y == 1)
return (c11 / (c11 + c10 + c01))
}
#jaccard similarity calculation for all pairs
jaccard_matrixc <- function(data_frame, include = names(data_frame))
{
#check the input
if (!class(data_frame) == "data.frame") stop("Input data table is not a data frame!")
if (is.null(include)) stop("No column names defined")
if (sum(is.na.data.frame(data_frame))>0) stop("NAs in the data frame")
if (sum(!(include %in% names(data_frame)))>0) stop ("One include variable name is not a column name")
#subset int columns
data_frame <- subset(data_frame, select = include)
#basic params
no_vars <- dim(data_frame)[2] #number of variables
at_names <- colnames(data_frame)
#define result df
df_res = as.data.frame(matrix(NA, nrow = length(at_names), ncol = length(at_names)))
rownames(df_res) <- at_names
colnames(df_res) <- at_names
#calculate jaccard for every row/column
for (rows1 in 1:no_vars) for (cols1 in 1:no_vars)
{
if (rows1 == cols1) {df_res[rows1,cols1] = -1} else
{df_res[rows1,cols1] = jaccardc(data_frame[,rows1], data_frame[,cols1])}
}
return(df_res)
}
########################################################################################################
# 4 DATA PREPROCESSING: REMOVE OUTLIERS
########################################################################################################
#generating outdegree df
df_nodes_deponent_ids <- as.data.frame(df_incr_nodes$name[df_incr_nodes$deponent==1]) #deponent ids
colnames(df_nodes_deponent_ids) <- "dep_id"
df_nodes_outd <- as.data.frame(table(df_incr_edges$from)) #outdegree
df_nodes_deponents_outdeg <- merge(df_nodes_deponent_ids, df_nodes_outd, by.x = "dep_id", by.y = "Var1", all.x = TRUE ) #merge
df_nodes_deponents_outdeg$Freq[is.na(df_nodes_deponents_outdeg$Freq)] <- 0 #adding 0s for those deponents, who did not incriminate
df_nodes_outd_gen <- merge(df_nodes_deponents_outdeg, df_incr_nodes[,c(1,3)], by.x = "dep_id", by.y = "name", all.x = TRUE, all.y = FALSE)
rm(df_nodes_deponent_ids, df_nodes_outd, df_nodes_deponents_outdeg) #del tmp dfs
#identify outliers
summary(df_nodes_outd_gen$Freq)
at_outdeg_th <- quantile(ecdf(df_nodes_outd_gen$Freq), probs = c(0.975) )
df_outliers_id <- df_nodes_outd_gen[df_nodes_outd_gen$Freq > at_outdeg_th,]$dep_id
#remove outliers
df_cln_nodes <- df_incr_nodes[!(df_incr_nodes$name %in% df_outliers_id),]
df_cln_edges <- subset(df_incr_edges, subset = (!from %in% df_outliers_id) & (!to %in% df_outliers_id) )
#remove those nodes, which are non-deponents and have 0 outdegree (left isolated by the removal of these outliers)
df_non_dep_nout <- (df_cln_nodes$name %in% unique(df_cln_edges$to)) | (df_cln_nodes$deponent==1) # has outdegree or deponents
df_cln_nodes <- df_cln_nodes[df_non_dep_nout, ]
#to harmonize variable coding gender male==1
df_cln_nodes$sex <- ifelse(df_cln_nodes$sex=="m", 1, 0)
########################################################################################################
# 5 DESCRIPTIVE TABLE2
########################################################################################################
#defining binary, categorical and chr variable name vectors
at_node_var_chr <- c("name" , "label")
at_node_var_cat <- c("family_id")
at_node_var_bin <- c( "sex" ,
"churchperson",
"middling" ,
"cathar_aff",
"apostle_aff",
"other_heterodoxy_aff",
"deponent" ,
"redeponent" ,
"ever_summoned" ,
"ever_pledged" ,
"ever_incarcerated",
"ever_tortured",
"inq_FV",
"inq_GV",
"inq_GP",
"inq_BdF")
#outlier removed
table2B_binary <- descriptives_df(data_frame = df_cln_nodes,
include = at_node_var_bin ,
binaryvarmax = 30,
pv = "cleanded_sample")
table2B_family_id <- descriptives_df(data_frame = df_cln_nodes,
include = "family_id" ,
binaryvarmax = -1,
pv = "cleanded_sample")
#Table 2 raw
table2_binary <- table2B_binary
table2_family <- table2B_family_id
########################################################################################################
# 6 DESCRIPTIVE AND JACCARD SIMILARITY MEASURE FOR BINARY VARIABLES
########################################################################################################
#based on the low binary variance, the following variables were excluded from further analysis:
# ever incarcerated and ever tortured.
# calcualte Jaccards for all pairs after binary variance based selection
at_node_var_bin <- c( "sex" ,
"churchperson",
"middling" ,
"cathar_aff",
"apostle_aff",
"other_heterodoxy_aff",
"deponent" ,
"redeponent" ,
"ever_summoned" ,
"inq_FV",
"inq_GV",
"inq_GP",
"inq_BdF")
SP1_table_binary_jaccard <- jaccard_matrixc(data_frame = df_cln_nodes,
include = at_node_var_bin)
########################################################################################################
# 7 SAVE OUTLIER REMOVED TABLES
########################################################################################################
#writing results the two graphs, data.frame, df_
write.table(df_cln_nodes,
file = fn_out_incr_nodes,
sep="\t",
row.names = FALSE,
fileEncoding = "UTF-8")
write.table(df_cln_edges,
file = fn_out_incr_edges,
sep="\t",
row.names = FALSE,
fileEncoding = "UTF-8")
# prepared TSVs saved
########################################################################################################