312 lines
12 KiB
R
Executable File
312 lines
12 KiB
R
Executable File
# Copyright 2014 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Authors: vpihur@google.com (Vasyl Pihur), fanti@google.com (Giulia Fanti)
|
|
|
|
library(RUnit)
|
|
source("analysis/R/encode.R")
|
|
source("analysis/R/decode.R")
|
|
source("analysis/R/simulation.R")
|
|
source("analysis/R/association.R")
|
|
source("analysis/R/fast_em.R")
|
|
source("analysis/R/util.R")
|
|
|
|
SamplePopulations <- function(N, num_variables = 1, params,
|
|
variable_opts) {
|
|
# Samples a number of variables. User specifies the number of variables
|
|
# and some desired properties of those variables.
|
|
#
|
|
# Args:
|
|
# N: Number of reports to generate.
|
|
# params: RAPPOR parameters, like Bloom filter size, number of
|
|
# hash bits, etc.
|
|
# variable_opts: List of options for generating the ground truth:
|
|
# independent = whether distinct variables should be independently drawn
|
|
# deterministic = whether the variables should be drawn from a
|
|
# Poisson distribution or uniformly assigned across the range
|
|
# of 1:num_strings
|
|
# num_strings: Only does something if deterministic == TRUE, and
|
|
# specifies how many strings to use in the uniform assignment
|
|
# of ground truth strings.
|
|
#
|
|
# Returns:
|
|
# RAPPOR simulated ground truth for each piece of data.
|
|
|
|
m <- params$m
|
|
num_strings <- variable_opts$num_strings
|
|
|
|
if (variable_opts$deterministic) {
|
|
# If a deterministic assignment is desired, evenly distribute
|
|
# strings across all cohorts.
|
|
|
|
reps <- ceiling(N / num_strings)
|
|
variables <- lapply(1:num_variables,
|
|
function(i)
|
|
as.vector(sapply(1:num_strings, function(x)
|
|
rep(x, reps)))[1:N])
|
|
cohorts <- lapply(1:num_variables,
|
|
function(i) rep(1:m, ceiling(N / m))[1:N])
|
|
} else {
|
|
# Otherwise, draw from a Poisson random variable
|
|
variables <- lapply(1:num_variables, function(i) rpois(N, 1) + 1)
|
|
|
|
# Randomly assign cohorts in each dimension
|
|
cohorts <- lapply(1:num_variables,
|
|
function(i) sample(1:params$m, N, replace = TRUE))
|
|
|
|
if (!variable_opts$independent) {
|
|
# If user wants dependent RVs, subsequent variables are closely correlated
|
|
# with the first variable in the foll. manner:
|
|
# variable_i ~ variable_1 + (i-1) Bernoulli(0.5)
|
|
|
|
bernoulli_corr <- function(x) {
|
|
variables[[1]] + (x - 1) * sample(c(0, 1), N, replace = TRUE)}
|
|
|
|
variables[2:num_variables] <- lapply(2:num_variables,
|
|
function(x) bernoulli_corr(x))
|
|
}
|
|
}
|
|
list(variables = variables, cohorts = cohorts)
|
|
}
|
|
|
|
Simulate <- function(N, num_variables, params, variable_opts = NULL,
|
|
truth = NULL, basic = FALSE) {
|
|
if (is.null(truth)) {
|
|
truth <- SamplePopulations(N, num_variables, params,
|
|
variable_opts)
|
|
}
|
|
strs <- lapply(truth$variables, function(x) sort(seq(max(x))))
|
|
# strs <- lapply(truth$variables, function(x) sort(unique(x)))
|
|
# strs <- lapply(truth$variables, function(x) 1:length(unique(x)))
|
|
|
|
# Construct lists of maps and reports
|
|
if (variable_opts$deterministic) {
|
|
# Build the maps
|
|
map <- CreateMap(strs[[1]], params, FALSE, basic = basic)
|
|
maps <- lapply(1:num_variables, function(x) map)
|
|
# Build the reports
|
|
report <- EncodeAll(truth$variables[[1]], truth$cohorts[[1]],
|
|
map$map_by_cohort, params)
|
|
reports <- lapply(1:num_variables, function(x) report)
|
|
} else {
|
|
# Build the maps
|
|
maps <- lapply(1:num_variables, function(x)
|
|
CreateMap(strs[[x]], params, FALSE,
|
|
basic = basic))
|
|
# Build the reports
|
|
reports <- lapply(1:num_variables, function(x)
|
|
EncodeAll(truth$variables[[x]], truth$cohorts[[x]],
|
|
maps[[x]]$map_by_cohort, params))
|
|
}
|
|
|
|
list(reports = reports, cohorts = truth$cohorts,
|
|
truth = truth$variables, maps = maps, strs = strs)
|
|
|
|
}
|
|
|
|
# ----------------Actual testing starts here--------------- #
|
|
TestComputeDistributionEM <- function() {
|
|
# Test various aspects of ComputeDistributionEM in association.R.
|
|
# Tests include:
|
|
# Test 1: Compute a joint distribution of uniformly distributed,
|
|
# perfectly correlated strings
|
|
# Test 2: Compute a marginal distribution of uniformly distributed strings
|
|
# Test 3: Check the "other" category estimation works by removing
|
|
# a string from the known map.
|
|
# Test 4: Test that the variance from EM algorithm is 1/N when there
|
|
# is no noise in the system.
|
|
# Test 5: Check that the right answer is still obtained when f = 0.2.
|
|
|
|
num_variables <- 3
|
|
N <- 100
|
|
|
|
# Initialize the parameters
|
|
params <- list(k = 12, h = 2, m = 4, p = 0, q = 1, f = 0)
|
|
variable_opts <- list(deterministic = TRUE, num_strings = 2,
|
|
independent = FALSE)
|
|
sim <- Simulate(N, num_variables, params, variable_opts)
|
|
|
|
# Test 1: Delta function pmf
|
|
joint_dist <- ComputeDistributionEM(sim$reports,
|
|
sim$cohorts, sim$maps,
|
|
ignore_other = TRUE,
|
|
params = params,
|
|
marginals = NULL,
|
|
estimate_var = FALSE)
|
|
# The recovered distribution should be close to the delta function.
|
|
checkTrue(abs(joint_dist$fit["1", "1", "1"] - 0.5) < 0.01)
|
|
checkTrue(abs(joint_dist$fit["2", "2", "2"] - 0.5) < 0.01)
|
|
|
|
# Test 2: Now compute a marginal using EM
|
|
dist <- ComputeDistributionEM(list(sim$reports[[1]]),
|
|
list(sim$cohorts[[1]]),
|
|
list(sim$maps[[1]]),
|
|
ignore_other = TRUE,
|
|
params = params,
|
|
marginals = NULL,
|
|
estimate_var = FALSE)
|
|
checkTrue(abs(dist$fit["1"] - 0.5) < 0.01)
|
|
|
|
# Test 3: Check that the "other" category is correctly computed
|
|
# Build a modified map with no column 2 (i.e. we only know that string
|
|
# "1" is a valid string
|
|
map <- sim$maps[[1]]
|
|
small_map <- map
|
|
|
|
for (i in 1:params$m) {
|
|
locs <- which(map$map_by_cohort[[i]][, 1])
|
|
small_map$map_by_cohort[[i]] <- sparseMatrix(locs, rep(1, length(locs)),
|
|
dims = c(params$k, 1))
|
|
locs <- which(map$all_cohorts_map[, 1])
|
|
colnames(small_map$map_by_cohort[[i]]) <- sim$strs[1]
|
|
}
|
|
small_map$all_cohorts_map <- do.call("rBind", small_map$map_by_cohort)
|
|
|
|
dist <- ComputeDistributionEM(list(sim$reports[[1]]),
|
|
list(sim$cohorts[[1]]),
|
|
list(small_map),
|
|
ignore_other = FALSE,
|
|
params = params,
|
|
marginals = NULL,
|
|
estimate_var = FALSE)
|
|
|
|
# The recovered distribution should be uniform over 2 strings.
|
|
checkTrue(abs(dist$fit[1] - 0.5) < 0.1)
|
|
|
|
# Test 4: Test the variance is 1/N
|
|
variable_opts <- list(deterministic = TRUE, num_strings = 1)
|
|
sim <- Simulate(N, num_variables = 1, params, variable_opts)
|
|
dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
|
|
sim$maps, ignore_other = TRUE,
|
|
params = params, marginals = NULL,
|
|
estimate_var = TRUE)
|
|
|
|
checkEqualsNumeric(dist$em$var_cov[1, 1], 1 / N)
|
|
|
|
# Test 5: Check that when f=0.2, we still get a good estimate
|
|
params <- list(k = 12, h = 2, m = 2, p = 0, q = 1, f = 0.2)
|
|
variable_opts <- list(deterministic = TRUE, num_strings = 2)
|
|
sim <- Simulate(N, num_variables = 2, params, variable_opts)
|
|
dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
|
|
sim$maps, ignore_other = TRUE,
|
|
params = params, marginals = NULL,
|
|
estimate_var = FALSE)
|
|
|
|
checkTrue(abs(dist$fit["1", "1"] - 0.5) < 0.15)
|
|
checkTrue(abs(dist$fit["2", "2"] - 0.5) < 0.15)
|
|
|
|
# Test 6: Check the computed joint distribution with randomized
|
|
# correlated inputs from the Poisson distribution
|
|
# Expect to have correlation between strings n and n + 1
|
|
N <- 1000
|
|
params <- list(k = 16, h = 2, m = 4, p = 0.1, q = 0.9, f = 0.1)
|
|
variable_opts <- list(deterministic = FALSE, independent = FALSE)
|
|
sim <- Simulate(N, num_variables = 2, params, variable_opts)
|
|
dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
|
|
sim$maps, ignore_other = TRUE,
|
|
params = params, marginals = NULL,
|
|
estimate_var = FALSE)
|
|
|
|
print_dist <- TRUE # to print joint distribution, set to TRUE
|
|
|
|
if (print_dist) {
|
|
# dist$fit[dist$fit<1e-4] <- 0
|
|
# Sort by row names and column names to visually see correlation
|
|
print(dist$fit[sort(rownames(dist$fit)), sort(colnames(dist$fit))])
|
|
}
|
|
|
|
# Check for correlations (constants chosen heuristically to get good
|
|
# test confidence with small # of samples)
|
|
# Should have mass roughly 1/2e and 1/2e each
|
|
checkTrue(abs(dist$fit["1", "1"] - dist$fit["1", "2"]) < 0.1)
|
|
checkTrue(abs(dist$fit["2", "2"] - dist$fit["2", "3"]) < 0.1)
|
|
|
|
# Should have mass roughly 1/4e and 1/4e each
|
|
checkTrue(abs(dist$fit["3", "3"] - dist$fit["3", "4"]) < 0.06)
|
|
|
|
# Check for lack of probability mass
|
|
checkTrue(dist$fit["1", "3"] < 0.02)
|
|
checkTrue(dist$fit["1", "4"] < 0.02)
|
|
checkTrue(dist$fit["2", "1"] < 0.02)
|
|
checkTrue(dist$fit["2", "4"] < 0.02)
|
|
checkTrue(dist$fit["3", "1"] < 0.02)
|
|
checkTrue(dist$fit["3", "2"] < 0.02)
|
|
}
|
|
|
|
MakeCondProb <- function() {
|
|
d = matrix(c(1,1,2,2,3,3), nrow=3, ncol=2)
|
|
d = d / sum(d)
|
|
|
|
e = matrix(c(3,3,2,2,1,1), nrow=3, ncol=2)
|
|
e = e / sum(e)
|
|
|
|
list(d, e, d) # 3 reports
|
|
}
|
|
|
|
# Test the slow version in R.
|
|
RunEmFunction <- function(cond_prob, max_em_iters) {
|
|
cond_prob <- MakeCondProb()
|
|
|
|
# Mechanical test of 4 iterations. em$hist has 5 elements.
|
|
result <- EM(cond_prob, max_em_iters=max_em_iters)
|
|
result$est
|
|
}
|
|
|
|
# Run a test of the EM executable
|
|
RunEmExecutable <- function(em_executable, cond_prob, max_em_iters) {
|
|
print(cond_prob)
|
|
|
|
if (!file.exists(em_executable)) {
|
|
stop(sprintf("EM executable %s doesn't exist (build it?)", em_executable))
|
|
}
|
|
em_iter_func <- ConstructFastEM(em_executable, "/tmp")
|
|
|
|
result <- em_iter_func(cond_prob, max_em_iters=max_em_iters)
|
|
result$est
|
|
}
|
|
|
|
TestCppImplementation <- function() {
|
|
cond_prob <- MakeCondProb()
|
|
max_em_iters <- 10
|
|
fit1 <- RunEmFunction(cond_prob, max_em_iters)
|
|
|
|
# Assume we're in the repo root
|
|
em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em")
|
|
fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters)
|
|
|
|
cpp_diff <- abs(fit1 - fit2)
|
|
print(cpp_diff)
|
|
Log("C++ implementation difference after %d iterations: %e", max_em_iters,
|
|
sum(cpp_diff))
|
|
|
|
# After 10 iterations they should be almost indistinguishable.
|
|
checkTrue(sum(cpp_diff) < 1e-10)
|
|
}
|
|
|
|
TestTensorFlowImplementation <- function() {
|
|
cond_prob <- MakeCondProb()
|
|
max_em_iters <- 10
|
|
fit1 <- RunEmFunction(cond_prob, max_em_iters)
|
|
|
|
em_tf <- file.path(getwd(), "analysis/tensorflow/fast_em.sh")
|
|
fit2 <- RunEmExecutable(em_tf, cond_prob, max_em_iters)
|
|
|
|
tf_diff <- abs(fit1 - fit2)
|
|
print(tf_diff)
|
|
Log("TensorFlow implementation difference after %d iterations: %e",
|
|
max_em_iters, sum(tf_diff))
|
|
|
|
checkTrue(sum(tf_diff) < 1e-10)
|
|
}
|