-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthesize_non_dl.R
67 lines (54 loc) · 2.03 KB
/
synthesize_non_dl.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
suppressPackageStartupMessages({
library(data.table)
library(foreach)
library(doParallel)
library(arf)
library(synthpop)
})
#' Generate and save synthetic replicates of a dataset.
#'
#' Reads `data/<dataset>/real/<dataset>.csv`, applies the generator stored
#' under `synthesizer` in the global `synthesizers` list, and writes each
#' replicate to
#' `data/<dataset>/syn/<synthesizer>/syn_<dataset>_<synthesizer>_<rep>.csv`.
#' Replicates whose output file already exists are skipped, so an interrupted
#' run can be resumed.
#'
#' @param dataset Dataset name; used both as folder and as file stem.
#' @param synthesizer Name of an entry in the global `synthesizers` list.
#' @param reps Number of replicates to produce (non-negative, length one).
#' @return `NULL`, invisibly. Called for its side effects (files, console
#'   output, RNG state).
synthesize <- function(dataset, synthesizer, reps = 1) {
  stopifnot(length(reps) == 1L, !is.na(reps), reps >= 0)

  # Resolve the generator first so that an unknown name fails with a clear
  # message before any directory is created.
  synthesizer_fn <- synthesizers[[synthesizer]]
  if (!is.function(synthesizer_fn)) {
    stop(
      "Unknown synthesizer '", synthesizer, "'. Available: ",
      paste(names(synthesizers), collapse = ", "),
      call. = FALSE
    )
  }

  data <- fread(file.path("data", dataset, "real", paste0(dataset, ".csv")))

  # Create the synthesizer folder (and any missing parents) if needed.
  syn_dir <- file.path("data", dataset, "syn", synthesizer)
  dir.create(syn_dir, showWarnings = FALSE, recursive = TRUE)

  # Generators receive column names with "-" replaced by "_"; the original
  # names are restored before writing.
  colnames_orig <- colnames(data)
  colnames(data) <- gsub("-", "_", colnames_orig, fixed = TRUE)

  for (rep_idx in seq_len(reps)) {
    print(paste0("Synthesizing dataset: ", dataset, " with synthesizer: ", synthesizer, " (rep: ", rep_idx, "/", reps, ")"))
    syn_file <- file.path(
      syn_dir,
      paste0("syn_", dataset, "_", synthesizer, "_", rep_idx, ".csv")
    )

    if (file.exists(syn_file)) {
      print(paste0("Synthetic data already exists at: ", syn_file))
      next
    }

    # Seed per replicate instead of once per call: with a single seed, a
    # resumed run that skips existing files would start the first missing
    # replicate from the same RNG state as replicate 1. Replicate 1 keeps
    # seed 2024.
    set.seed(2024L + rep_idx - 1L)
    syn <- synthesizer_fn(data)
    colnames(syn) <- colnames_orig # rename back
    fwrite(syn, file = syn_file)
    print(paste0("Synthetic data saved at: ", syn_file))
  }

  invisible(NULL)
}
# Command-line entry point.
# Usage: Rscript synthesize_non_dl.R <dataset> <synthesizer> [reps] [n_cores]
# Nothing is run unless at least <dataset> and <synthesizer> are supplied.
args_cmd <- commandArgs(trailingOnly = TRUE)
args <- list(
  dataset = args_cmd[1],
  synthesizer = args_cmd[2],
  reps = as.integer(args_cmd[3]),
  n_cores = as.integer(args_cmd[4])
)

if (length(args_cmd) >= 2) {
  # Optional arguments that are missing or not parseable as integers
  # fall back to 1.
  if (is.na(args$reps)) args$reps <- 1L
  if (is.na(args$n_cores)) args$n_cores <- 1L

  # Register a parallel backend only if requested. n_cores is 1 when the
  # argument is omitted, so only values above 1 count as a request.
  parallel <- args$n_cores > 1L
  if (parallel) {
    registerDoParallel(args$n_cores)
  }

  # Synthesizers selectable by name; looked up by synthesize().
  synthesizers <- list(
    ARF = \(data) rarf(data, finite_bounds = "local", parallel = parallel),
    synthpop = \(data) syn(data, print.flag = FALSE)$syn
  )

  # Synthesize, and release the implicitly created workers afterwards even
  # if synthesis fails.
  invisible(tryCatch(
    synthesize(args$dataset, args$synthesizer, reps = args$reps),
    finally = if (parallel) stopImplicitCluster()
  ))
}