173 lines
6.3 KiB
R
173 lines
6.3 KiB
R
|
|
#!/usr/bin/env Rscript
|
||
|
|
#
|
||
|
|
# Copyright 2015 Google Inc. All rights reserved.
|
||
|
|
#
|
||
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
# you may not use this file except in compliance with the License.
|
||
|
|
# You may obtain a copy of the License at
|
||
|
|
#
|
||
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
#
|
||
|
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
# See the License for the specific language governing permissions and
|
||
|
|
# limitations under the License.
|
||
|
|
|
||
|
|
# Simulates inputs on which association analysis can be run.
|
||
|
|
# Currently assoc_sim.R only supports 2 variables but can
|
||
|
|
# be easily extended to support more.
|
||
|
|
#
|
||
|
|
# Usage:
|
||
|
|
# $ ./assoc_sim.R -n 1000
|
||
|
|
# Inputs: uvals, params, reports, map, num, unif
|
||
|
|
# see how options are parsed below for more information
|
||
|
|
# Outputs:
|
||
|
|
# reports.csv file containing reports
|
||
|
|
# map_{1, 2, ...}.csv file(s) containing maps of variables
|
||
|
|
|
||
|
|
library("optparse")
|
||
|
|
|
||
|
|
options(stringsAsFactors = FALSE)
|
||
|
|
|
||
|
|
if(!interactive()) {
|
||
|
|
option_list <- list(
|
||
|
|
make_option(c("--uvals", "-v"), default = "uvals.csv",
|
||
|
|
help = "Filename for list of values over which
|
||
|
|
distributions are simulated. The file is a list of
|
||
|
|
comma-separated strings each line of which refers
|
||
|
|
to a new variable."),
|
||
|
|
make_option(c("--params", "-p"), default = "params.csv",
|
||
|
|
help = "Filename for RAPPOR parameters"),
|
||
|
|
make_option(c("--reports", "-r"), default = "reports.csv",
|
||
|
|
help = "Filename for reports"),
|
||
|
|
make_option(c("--map", "-m"), default = "map",
|
||
|
|
help = "Filename *prefix* for map(s)"),
|
||
|
|
make_option(c("--num", "-n"), default = 1e05,
|
||
|
|
help = "Number of reports"),
|
||
|
|
make_option(c("--unif", "-u"), default = FALSE,
|
||
|
|
help = "Run simulation with uniform distribution")
|
||
|
|
)
|
||
|
|
opts <- parse_args(OptionParser(option_list = option_list))
|
||
|
|
}
|
||
|
|
|
||
|
|
source("../analysis/R/encode.R")
|
||
|
|
source("../analysis/R/decode.R")
|
||
|
|
source("../analysis/R/simulation.R")
|
||
|
|
source("../analysis/R/read_input.R")
|
||
|
|
source("../analysis/R/association.R")
|
||
|
|
|
||
|
|
# Read unique values of reports from a csv file
|
||
|
|
# Inputs: filename. The file is expected to contain two rows of strings
|
||
|
|
# (one for each variable):
|
||
|
|
# "google.com", "apple.com", ...
|
||
|
|
# "ssl", "nossl", ...
|
||
|
|
# Returns: a list containing strings
|
||
|
|
GetUniqueValsFromFile <- function(filename) {
|
||
|
|
contents <- read.csv(filename, header = FALSE)
|
||
|
|
# Expect 2 rows of unique vals
|
||
|
|
if(nrow(contents) != 2) {
|
||
|
|
stop(paste("Unique vals file", filename, "expected to have
|
||
|
|
two rows of strings."))
|
||
|
|
}
|
||
|
|
# Removes superfluous "" entries if the lists of unique values
|
||
|
|
# differ in length
|
||
|
|
strip_empty <- function(vec) {
|
||
|
|
vec[!vec %in% c("")]
|
||
|
|
}
|
||
|
|
list(var1 = strip_empty(as.vector(t(contents[1,]))),
|
||
|
|
var2 = strip_empty(as.vector(t(contents[2,]))))
|
||
|
|
}
|
||
|
|
|
||
|
|
# Simulate correlated reports and write into reportsfile
|
||
|
|
# Inputs: N = number of reports
|
||
|
|
# uvals = list containing a list of unique values
|
||
|
|
# params = list with RAPPOR parameters
|
||
|
|
# unif = whether to replace poisson with uniform
|
||
|
|
# mapfile = file to write maps into (with .csv suffixes)
|
||
|
|
# reportsfile = file to write reports into (with .csv suffix)
|
||
|
|
SimulateReports <- function(N, uvals, params, unif,
|
||
|
|
mapfile, reportsfile) {
|
||
|
|
# Compute true distribution
|
||
|
|
m <- params$m
|
||
|
|
|
||
|
|
if (unif) {
|
||
|
|
# Draw uniformly from 1 to 10
|
||
|
|
v1_samples <- as.integer(runif(N, 1, 10))
|
||
|
|
} else {
|
||
|
|
# Draw from a Poisson random variable
|
||
|
|
v1_samples <- rpois(N, 1) + rep(1, N)
|
||
|
|
}
|
||
|
|
|
||
|
|
# Pr[var2 = N + 1 | var1 = N] = 0.5
|
||
|
|
# Pr[var2 = N | var1 = N] = 0.5
|
||
|
|
v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
|
||
|
|
|
||
|
|
tmp_samples <- list(v1_samples, v2_samples)
|
||
|
|
|
||
|
|
# Function to pad strings to uval_vec if sample_vec has
|
||
|
|
# larger support than the number of strings in uval_vec
|
||
|
|
# For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
|
||
|
|
# only have "value1", "value2", and "value3", samples now
|
||
|
|
# over support {"value1", "value2", "value3", "str4", ...}
|
||
|
|
PadStrings <- function(sample_vec, uval_vec) {
|
||
|
|
if (max(sample_vec) > length(uval_vec)) {
|
||
|
|
# Padding uvals to required length
|
||
|
|
len <- length(uval_vec)
|
||
|
|
max_of_samples <- max(sample_vec)
|
||
|
|
uval_vec[(len + 1):max_of_samples] <- apply(
|
||
|
|
as.matrix((len + 1):max_of_samples),
|
||
|
|
1,
|
||
|
|
function(i) sprintf("str%d", i))
|
||
|
|
}
|
||
|
|
uval_vec
|
||
|
|
}
|
||
|
|
|
||
|
|
# Pad and update uvals
|
||
|
|
uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
|
||
|
|
uvals[[i]]))
|
||
|
|
|
||
|
|
# Replace integers in tmp_samples with actual sample strings
|
||
|
|
samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])
|
||
|
|
|
||
|
|
# Randomly assign cohorts in each dimension
|
||
|
|
cohorts <- sample(1:m, N, replace = TRUE)
|
||
|
|
|
||
|
|
# Create and write map into mapfile_1.csv and mapfile_2.csv
|
||
|
|
map <- lapply(uvals, function(u) CreateMap(u, params))
|
||
|
|
write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),
|
||
|
|
sep = ",", col.names = FALSE, na = "", quote = FALSE)
|
||
|
|
write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
|
||
|
|
sep = ",", col.names = FALSE, na = "", quote = FALSE)
|
||
|
|
|
||
|
|
# Write reports into a csv file
|
||
|
|
# Format:
|
||
|
|
# cohort, bloom filter var1, bloom filter var2
|
||
|
|
reports <- lapply(1:2, function(i)
|
||
|
|
EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
|
||
|
|
# Organize cohorts and reports into format
|
||
|
|
write_matrix <- cbind(as.matrix(cohorts),
|
||
|
|
as.matrix(lapply(reports[[1]],
|
||
|
|
function(x) paste(x, collapse = ""))),
|
||
|
|
as.matrix(lapply(reports[[2]],
|
||
|
|
function(x) paste(x, collapse = ""))))
|
||
|
|
write.table(write_matrix, file = reportsfile, quote = FALSE,
|
||
|
|
row.names = FALSE, col.names = FALSE, sep = ",")
|
||
|
|
}
|
||
|
|
|
||
|
|
main <- function(opts) {
|
||
|
|
ptm <- proc.time()
|
||
|
|
|
||
|
|
uvals <- GetUniqueValsFromFile(opts$uvals)
|
||
|
|
params <- ReadParameterFile(opts$params)
|
||
|
|
SimulateReports(opts$num, uvals, params, opts$unif, # inputs
|
||
|
|
opts$map, opts$reports) # outputs
|
||
|
|
|
||
|
|
print("PROC.TIME")
|
||
|
|
print(proc.time() - ptm)
|
||
|
|
}
|
||
|
|
|
||
|
|
if(!interactive()) {
|
||
|
|
main(opts)
|
||
|
|
}
|