269 lines
8.1 KiB
R
Executable File
269 lines
8.1 KiB
R
Executable File
# Copyright 2014 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
#
|
|
# RAPPOR simulation library. Contains code for encoding simulated data and
|
|
# creating the map used to encode and decode reports.
|
|
|
|
library(glmnet)
|
|
library(parallel) # mclapply
|
|
|
|
SetOfStrings <- function(num_strings = 100) {
|
|
# Generates a set of strings for simulation purposes.
|
|
strs <- paste0("V_", as.character(1:num_strings))
|
|
strs
|
|
}
|
|
|
|
GetSampleProbs <- function(params) {
|
|
# Generate different underlying distributions for simulations purposes.
|
|
# Args:
|
|
# - params: a list describing the shape of the true distribution:
|
|
# c(num_strings, prop_nonzero_strings, decay_type,
|
|
# rate_exponetial).
|
|
nstrs <- params[[1]]
|
|
nonzero <- params[[2]]
|
|
decay <- params[[3]]
|
|
expo <- params[[4]]
|
|
background <- params[[5]]
|
|
|
|
probs <- rep(0, nstrs)
|
|
ind <- floor(nstrs * nonzero)
|
|
if (decay == "Linear") {
|
|
probs[1:ind] <- (ind:1) / sum(1:ind)
|
|
} else if (decay == "Constant") {
|
|
probs[1:ind] <- 1 / ind
|
|
} else if (decay == "Exponential") {
|
|
temp <- seq(0, nonzero, length.out = ind)
|
|
temp <- exp(-temp * expo)
|
|
temp <- temp + background
|
|
temp <- temp / sum(temp)
|
|
probs[1:ind] <- temp
|
|
} else {
|
|
stop('params[[4]] must be in c("Linear", "Exponenential", "Constant")')
|
|
}
|
|
probs
|
|
}
|
|
|
|
EncodeAll <- function(x, cohorts, map, params, num_cores = 1) {
|
|
# Encodes the ground truth into RAPPOR reports.
|
|
#
|
|
# Args:
|
|
# x: Observed strings for each report, Nx1 vector
|
|
# cohort: Cohort assignment for each report, Nx1 vector
|
|
# map: list of matrices encoding locations of hashes for each
|
|
# string, for each cohort
|
|
# params: System parameters
|
|
#
|
|
# Returns:
|
|
# RAPPOR reports for each piece of data.
|
|
|
|
p <- params$p
|
|
q <- params$q
|
|
f <- params$f
|
|
k <- params$k
|
|
|
|
qstar <- (1 - f / 2) * q + (f / 2) * p
|
|
pstar <- (1 - f / 2) * p + (f / 2) * q
|
|
|
|
candidates <- colnames(map[[1]])
|
|
if (!all(x %in% candidates)) {
|
|
stop("Some strings are not in the map. set(X) - set(candidates): ",
|
|
paste(setdiff(unique(x), candidates), collapse=" "), "\n")
|
|
}
|
|
bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE,
|
|
USE.NAMES = FALSE)
|
|
reports <- mclapply(bfs, function(x) {
|
|
noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar))
|
|
ind <- which(x)
|
|
noise[ind] <- sample(0:1, length(ind), replace = TRUE,
|
|
prob = c(1 - qstar, qstar))
|
|
noise
|
|
}, mc.cores = num_cores)
|
|
|
|
reports
|
|
}
|
|
|
|
CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) {
|
|
# Creates a list of 0/1 matrices corresponding to mapping between the strs and
|
|
# Bloom filters for each instance of the RAPPOR.
|
|
# Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4,
|
|
# the result could look this:
|
|
# [[1]]
|
|
# 1 0 0 0
|
|
# 0 1 0 0
|
|
# 0 0 0 1
|
|
# [[2]]
|
|
# 0 1 0 0
|
|
# 0 0 0 1
|
|
# 0 0 1 0
|
|
#
|
|
# Args:
|
|
# strs: a vector of strings
|
|
# params: a list of parameters in the following format:
|
|
# (k, h, m, p, q, f).
|
|
# generate_pos: Tells whether to generate an object storing the
|
|
# positions of the nonzeros in the matrix
|
|
# basic: Tells whether to use basic RAPPOR (only works if h=1).
|
|
|
|
M <- length(strs)
|
|
map_by_cohort <- list()
|
|
k <- params$k
|
|
h <- params$h
|
|
m <- params$m
|
|
|
|
for (i in 1:m) {
|
|
if (basic && (h == 1) && (k == M)) {
|
|
ones <- 1:M
|
|
} else {
|
|
ones <- sample(1:k, M * h, replace = TRUE)
|
|
}
|
|
cols <- rep(1:M, each = h)
|
|
map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M))
|
|
colnames(map_by_cohort[[i]]) <- strs
|
|
}
|
|
|
|
all_cohorts_map <- do.call("rBind", map_by_cohort)
|
|
if (generate_pos) {
|
|
map_pos <- t(apply(all_cohorts_map, 2, function(x) {
|
|
ind <- which(x == 1)
|
|
n <- length(ind)
|
|
if (n < h * m) {
|
|
ind <- c(ind, rep(NA, h * m - n))
|
|
}
|
|
ind
|
|
}))
|
|
} else {
|
|
map_pos <- NULL
|
|
}
|
|
|
|
list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map,
|
|
map_pos = map_pos)
|
|
}
|
|
|
|
GetSample <- function(N, strs, probs) {
|
|
# Sample for the strs population with distribution probs.
|
|
sample(strs, N, replace = TRUE, prob = probs)
|
|
}
|
|
|
|
GetTrueBits <- function(samp, map, params) {
|
|
# Convert sample generated by GetSample() to Bloom filters where mapping
|
|
# is defined in map.
|
|
# Output:
|
|
# - reports: a matrix of size [num_instances x size] where each row
|
|
# represents the number of times each bit in the Bloom filter
|
|
# was set for a particular instance.
|
|
# Note: reports[, 1] contains the same size for each instance.
|
|
|
|
N <- length(samp)
|
|
k <- params$k
|
|
m <- params$m
|
|
strs <- colnames(map[[1]])
|
|
reports <- matrix(0, m, k + 1)
|
|
inst <- sample(1:m, N, replace = TRUE)
|
|
for (i in 1:m) {
|
|
tab <- table(samp[inst == i])
|
|
tab2 <- rep(0, length(strs))
|
|
tab2[match(names(tab), strs)] <- tab
|
|
counts <- apply(map[[i]], 1, function(x) x * tab2)
|
|
# cat(length(tab2), dim(map[[i]]), dim(counts), "\n")
|
|
reports[i, ] <- c(sum(tab2), apply(counts, 2, sum))
|
|
}
|
|
reports
|
|
}
|
|
|
|
GetNoisyBits <- function(truth, params) {
|
|
# Applies RAPPOR to the Bloom filters.
|
|
# Args:
|
|
# - truth: a matrix generated by GetTrueBits().
|
|
|
|
k <- params$k
|
|
p <- params$p
|
|
q <- params$q
|
|
f <- params$f
|
|
|
|
rappors <- apply(truth, 1, function(x) {
|
|
# The following samples considering 4 cases:
|
|
# 1. Signal and we lie on the bit.
|
|
# 2. Signal and we tell the truth.
|
|
# 3. Noise and we lie.
|
|
# 4. Noise and we tell the truth.
|
|
|
|
# Lies when signal sampled from the binomial distribution.
|
|
lied_signal <- rbinom(k, x[-1], f)
|
|
|
|
# Remaining must be the non-lying bits when signal. Sampled with q.
|
|
truth_signal <- x[-1] - lied_signal
|
|
|
|
# Lies when there is no signal which happens x[1] - x[-1] times.
|
|
lied_nosignal <- rbinom(k, x[1] - x[-1], f)
|
|
|
|
# Trtuh when there's no signal. These are sampled with p.
|
|
truth_nosignal <- x[1] - x[-1] - lied_nosignal
|
|
|
|
# Total lies and sampling lies with 50/50 for either p or q.
|
|
lied <- lied_signal + lied_nosignal
|
|
lied_p <- rbinom(k, lied, .5)
|
|
lied_q <- lied - lied_p
|
|
|
|
# Generating the report where sampling of either p or q occurs.
|
|
rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p)
|
|
})
|
|
|
|
cbind(truth[, 1], t(rappors))
|
|
}
|
|
|
|
GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05,
|
|
prop_missing = 0,
|
|
correction = "Bonferroni") {
|
|
# Simulate N reports with pop_params describing the population and
|
|
# params describing the RAPPOR configuration.
|
|
num_strings = pop_params[[1]]
|
|
|
|
strs <- SetOfStrings(num_strings)
|
|
probs <- GetSampleProbs(pop_params)
|
|
samp <- GetSample(N, strs, probs)
|
|
map <- CreateMap(strs, params)
|
|
truth <- GetTrueBits(samp, map$map_by_cohort, params)
|
|
rappors <- GetNoisyBits(truth, params)
|
|
|
|
strs_apprx <- strs
|
|
map_apprx <- map$all_cohorts_map
|
|
# Remove % of strings to simulate missing variables.
|
|
if (prop_missing > 0) {
|
|
ind <- which(probs > 0)
|
|
removed <- sample(ind, ceiling(prop_missing * length(ind)))
|
|
map_apprx <- map$all_cohorts_map[, -removed]
|
|
strs_apprx <- strs[-removed]
|
|
}
|
|
|
|
# Randomize the columns.
|
|
ind <- sample(1:length(strs_apprx), length(strs_apprx))
|
|
map_apprx <- map_apprx[, ind]
|
|
strs_apprx <- strs_apprx[ind]
|
|
|
|
fit <- Decode(rappors, map_apprx, params, alpha = alpha,
|
|
correction = correction)
|
|
|
|
# Add truth column.
|
|
fit$fit$Truth <- table(samp)[fit$fit$string]
|
|
fit$fit$Truth[is.na(fit$fit$Truth)] <- 0
|
|
|
|
fit$map <- map$map_by_cohort
|
|
fit$truth <- truth
|
|
fit$strs <- strs
|
|
fit$probs <- probs
|
|
|
|
fit
|
|
}
|