492 lines
17 KiB
R
Executable File
492 lines
17 KiB
R
Executable File
# Copyright 2014 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
library(parallel) # mclapply
|
|
|
|
source.rappor <- function(rel_path) {
  # Source an R file located relative to the RAPPOR repository root.
  #
  # Args:
  #   rel_path: path relative to the RAPPOR_REPO environment variable.
  #     When RAPPOR_REPO is unset or empty, rel_path is used as-is.
  repo_root <- Sys.getenv("RAPPOR_REPO", "")
  source(paste0(repo_root, rel_path))
}
|
|
|
|
source.rappor("analysis/R/util.R") # for Log
|
|
source.rappor("analysis/R/decode.R") # for ComputeCounts
|
|
|
|
#
|
|
# Tools used to estimate variable distributions of up to three variables
|
|
# in RAPPOR. This contains the functions relevant to estimating joint
|
|
# distributions.
|
|
|
|
GetOtherProbs <- function(counts, map_by_cohort, marginal, params, pstar,
                          qstar) {
  # Computes, per cohort, the probability that each Bloom filter bit was set
  # by strings in the "other" category (strings not among the decoded
  # candidates in 'marginal').
  #
  # Args:
  #   counts: m x (k+1) matrix with counts of each bit for each
  #     cohort (m=#cohorts total, k=# bits in bloom filter), first column
  #     stores the total counts
  #   map_by_cohort: list (indexed by cohort) of matrices encoding which
  #     Bloom filter bits each candidate string sets
  #   marginal: object containing the estimated frequencies of known strings
  #     as well as the strings themselves, variance, etc.
  #   params: RAPPOR encoding parameters
  #   pstar, qstar: RAPPOR noise parameters p* and q* (computed from p, q, f
  #     by the caller)
  #
  # Returns:
  #   List of vectors of probabilities that each bit was set by the "other"
  #   category. The list is indexed by cohort.

  N <- sum(counts[, 1])

  # Counts of known strings to remove from each cohort.
  known_counts <- ceiling(marginal$proportion * N / params$m)
  sum_known <- sum(known_counts)

  # Select only the strings we care about from each cohort.
  # NOTE: drop = FALSE necessary if there is one candidate
  candidate_map <- lapply(map_by_cohort, function(map_for_cohort) {
    map_for_cohort[, marginal$string, drop = FALSE]
  })

  # If no strings were found, all nonzero counts were set by "other"
  if (length(marginal) == 0) {
    probs_other <- apply(counts, 1, function(cohort_row) {
      # Divide each bit count by the cohort's total report count.
      cohort_row[-1] / cohort_row[1]
    })
    return(as.list(as.data.frame(probs_other)))
  }

  # Counts set by known strings without noise considerations.
  known_counts_by_cohort <- sapply(candidate_map, function(map_for_cohort) {
    as.vector(as.matrix(map_for_cohort) %*% known_counts)
  })

  # Protect against R's matrix/vector confusion. This ensures
  # known_counts_by_cohort is a matrix in the k=1 case.
  dim(known_counts_by_cohort) <- c(params$m, params$k)

  # Counts set by known vals: zero bits adjusted by p* plus true bits
  # adjusted by q*.
  known_counts_by_cohort <- (sum_known - known_counts_by_cohort) * pstar +
    known_counts_by_cohort * qstar

  # Add the left hand sums to make it a m x (k+1) "counts" matrix
  known_counts_by_cohort <- cbind(sum_known, known_counts_by_cohort)

  # Counts attributable to the "other" category; clamp negatives arising
  # from estimation error to zero.
  reduced_counts <- counts - known_counts_by_cohort
  reduced_counts[reduced_counts < 0] <- 0
  probs_other <- apply(reduced_counts, 1, function(cohort_row) {
    cohort_row[-1] / cohort_row[1]
  })

  # Protect against R's matrix/vector confusion.
  dim(probs_other) <- c(params$k, params$m)

  # Clamp to [0, 1]; NaN/Inf come from division by a zero total above.
  probs_other[probs_other > 1] <- 1
  probs_other[is.nan(probs_other)] <- 0
  probs_other[is.infinite(probs_other)] <- 0

  # Convert it from a k x m matrix to a list indexed by m cohorts.
  # as.data.frame makes each cohort a column, which can be indexed by
  # probs_other[[cohort]].
  result <- as.list(as.data.frame(probs_other))

  result
}
|
|
|
|
GetCondProbBooleanReports <- function(reports, pstar, qstar, num_cores) {
  # Compute conditional probabilities given a set of Boolean reports.
  #
  # Args:
  #   reports: RAPPOR reports as a list of bit arrays (of length 1, because
  #     this is a boolean report)
  #   pstar, qstar: standard params computed from rappor parameters
  #   num_cores: number of cores to pass to mclapply to parallelize apply
  #
  # Returns:
  #   Conditional probability of all boolean reports corresponding to
  #   candidates (TRUE, FALSE)

  # c(p(report=1|value=TRUE), p(report=1|value=FALSE))
  probs_given_bit_one <- c(qstar, pstar)
  # c(p(report=0|value=TRUE), p(report=0|value=FALSE))
  probs_given_bit_zero <- c(1 - qstar, 1 - pstar)

  mclapply(reports, function(report) {
    if (report[[1]] == 1) probs_given_bit_one else probs_given_bit_zero
  }, mc.cores = num_cores)
}
|
|
|
|
GetCondProbStringReports <- function(reports, cohorts, map, m, pstar, qstar,
                                     marginal, prob_other = NULL, num_cores) {
  # Wrapper around GetCondProb. Given a set of reports, cohorts, map and
  # parameters m, p*, and q*, it first computes bit indices by cohort, and
  # then applies GetCondProb individually to each report.
  #
  # Args:
  #   reports: RAPPOR reports as a list of bit arrays
  #   cohorts: cohorts corresponding to these reports as a list
  #   map: map file
  #   m, pstar, qstar: standard params computed from rappor parameters
  #   marginal: list containing marginal estimates (output of Decode)
  #   prob_other: vector of length k, indicating how often each bit in the
  #     Bloom filter was set by a string in the "other" category.
  #   num_cores: number of cores to pass to mclapply
  #
  # Returns:
  #   Conditional probability of all reports given each of the strings in
  #   marginal$string

  # Get bit indices that are set per candidate per cohort.
  # seq_len is safe even if m were 0 (1:m would yield c(1, 0)).
  bit_indices_by_cohort <- lapply(seq_len(m), function(cohort) {
    map_for_cohort <- map$map_by_cohort[[cohort]]
    # Find the bits set by each candidate string in this cohort.
    lapply(marginal$string, function(x) {
      which(map_for_cohort[, x])
    })
  })

  # Apply GetCondProb over all reports; seq_along avoids the 1:0 trap on an
  # empty report list.
  cond_report_dist <- mclapply(seq_along(reports), function(i) {
    cohort <- cohorts[i]
    bit_indices <- bit_indices_by_cohort[[cohort]]
    GetCondProb(reports[[i]], pstar, qstar, bit_indices,
                prob_other = prob_other[[cohort]])
  }, mc.cores = num_cores)
  cond_report_dist
}
|
|
|
|
|
|
GetCondProb <- function(report, pstar, qstar, bit_indices, prob_other = NULL) {
  # Given the observed bit array, estimate P(report | true value).
  # Probabilities are estimated for all truth values.
  #
  # Args:
  #   report: A single observed RAPPOR report (binary vector of length k).
  #   pstar, qstar: noise parameters derived from the RAPPOR params by the
  #     caller (the docstring previously referred to a nonexistent 'params'
  #     argument).
  #   bit_indices: list with one entry for each candidate. Each entry is an
  #     integer vector of length h, specifying which bits are set for the
  #     candidate in the report's cohort.
  #   prob_other: vector of length k, indicating how often each bit in the
  #     Bloom filter was set by a string in the "other" category.
  #
  # Returns:
  #   Conditional probability of report given each of the candidate strings,
  #   with one extra trailing entry for "other" when prob_other is supplied.

  # NOTE: the original computed unused locals 'ones'/'zeros'; removed.

  # Per-bit probability assuming the candidate does NOT set that bit.
  probs <- ifelse(report == 1, pstar, 1 - pstar)

  # Likelihood of the report given each candidate: bits the candidate sets
  # contribute qstar / (1 - qstar); all other bits contribute via 'probs'.
  prob_obs_vals <- sapply(bit_indices, function(x) {
    prod(c(probs[-x], ifelse(report[x] == 1, qstar, 1 - qstar)))
  })

  # Account for the "other" category
  if (!is.null(prob_other)) {
    prob_other <- prod(c(prob_other[which(report == 1)],
                         (1 - prob_other)[which(report == 0)]))
    c(prob_obs_vals, prob_other)
  } else {
    prob_obs_vals
  }
}
|
|
|
|
UpdatePij <- function(pij, cond_prob) {
  # Maximization step of the EM algorithm: average, over all reports, the
  # normalized posterior weight each cell of the distribution receives.
  #
  # Args:
  #   pij: current estimate of the joint distribution (vector/array)
  #   cond_prob: list of per-report conditional probability arrays
  #
  # Returns:
  #   Updated pij estimate.

  # NOTE: Not using mclapply here because we have a faster C++ implementation.
  # mclapply spawns multiple processes, and each process can take up 3 GB+ or 5
  # GB+ of memory.
  weighted_posteriors <- lapply(cond_prob, function(report_probs) {
    posterior <- report_probs * pij
    posterior <- posterior / sum(posterior)
    # A zero total produces NaN; treat those cells as zero weight.
    posterior[is.nan(posterior)] <- 0
    posterior
  })
  Reduce("+", weighted_posteriors) / length(weighted_posteriors)
}
|
|
|
|
ComputeVar <- function(cond_prob, est) {
  # Computes the variance of the estimated pij's via the information matrix.
  #
  # Args:
  #   cond_prob: list of per-report conditional probability arrays
  #   est: estimated pij's
  #
  # Returns:
  #   List with the variance-covariance matrix, the per-cell standard
  #   deviations (shaped like the conditional arrays), and the information
  #   matrix.

  # Accumulate each report's rank-one contribution to the information matrix.
  contributions <- lapply(cond_prob, function(x) {
    flat <- as.vector(x)
    outer(flat, flat) / (sum(x * est))^2
  })
  inform <- Reduce("+", contributions)

  var_cov <- solve(inform)
  sd <- matrix(sqrt(diag(var_cov)), dim(cond_prob[[1]]))
  list(var_cov = var_cov, sd = sd, inform = inform)
}
|
|
|
|
EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
               max_em_iters = 1000, epsilon = 10^-6, verbose = FALSE) {
  # Runs expectation maximization over per-report conditional probabilities
  # to estimate the joint distribution.
  #
  # Args:
  #   cond_prob: conditional distribution computed previously (list of
  #     per-report arrays, all with the same dimensions)
  #   starting_pij: optional starting estimate; defaults to uniform over the
  #     whole state space
  #   estimate_var: flags whether we should estimate the variance
  #     of our computed distribution
  #   max_em_iters: maximum number of EM iterations
  #   epsilon: convergence threshold on the max absolute per-cell change
  #   verbose: flags whether to display error data
  #     (NOTE(review): not referenced in this body)
  #
  # Returns:
  #   List with the final estimate, standard deviations and covariance (NULL
  #   unless estimate_var), the full iteration history, and the number of EM
  #   iterations performed.

  pij <- list()
  state_space <- dim(cond_prob[[1]])
  if (is.null(starting_pij)) {
    # Uniform starting distribution over every cell of the state space.
    pij[[1]] <- array(1 / prod(state_space), state_space)
  } else {
    pij[[1]] <- starting_pij
  }

  i <- 0  # visible outside loop
  if (nrow(pij[[1]]) > 0) {
    # Run EM. The full history of estimates is kept in pij.
    for (i in 1:max_em_iters) {
      pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
      # Converged when the largest per-cell change drops below epsilon.
      dif <- max(abs(pij[[i + 1]] - pij[[i]]))
      if (dif < epsilon) {
        break
      }
      Log('EM iteration %d, dif = %e', i, dif)
    }
  }
  # Compute the variance of the estimate.
  est <- pij[[length(pij)]]
  if (estimate_var) {
    var_cov <- ComputeVar(cond_prob, est)
    sd <- var_cov$sd
    # NOTE(review): 'inform' is extracted here but not included in the
    # returned list below.
    inform <- var_cov$inform
    var_cov <- var_cov$var_cov
  } else {
    var_cov <- NULL
    inform <- NULL
    sd <- NULL
  }
  list(est = est, sd = sd, var_cov = var_cov, hist = pij, num_em_iters = i)
}
|
|
|
|
TestIndependence <- function(est, inform) {
  # Chi-squared test of independence between the two variables of a joint
  # estimate.
  #
  # Args:
  #   est: estimated pij values (2-dimensional matrix)
  #   inform: information matrix
  #
  # Returns:
  #   List with the chi-squared statistic and the p-value for whether the
  #   two variables are independent.

  # Expected cell probabilities under independence: product of marginals.
  expec <- outer(apply(est, 1, sum), apply(est, 2, sum))
  diffs <- matrix(est - expec, ncol = 1)
  stat <- t(diffs) %*% inform %*% diffs
  df <- (nrow(est) - 1) * (ncol(est) - 1)
  # Spell out lower.tail instead of relying on partial matching of 'lower'.
  list(stat = stat, pval = pchisq(stat, df, lower.tail = FALSE))
}
|
|
|
|
UpdateJointConditional <- function(cond_report_dist, joint_conditional = NULL) {
  # Folds one more variable into the per-report joint conditional
  # distribution. Because the variables are conditionally independent of one
  # another, combining them amounts to an outer product per report.
  #
  # Args:
  #   cond_report_dist: The conditional distribution of variable x, which
  #     will be outer-producted with the current joint conditional.
  #   joint_conditional: The current state of the joint conditional
  #     distribution (a list with as many elements as there are reports), or
  #     NULL when processing the first variable.
  #
  # Returns:
  #   A list of same length as joint_conditional containing the joint
  #   conditional distribution of all variables. If I want
  #   P(X'=x',Y=y'|X=x,Y=y), I will look at joint_conditional[x,x',y,y'].

  if (is.null(joint_conditional)) {
    # First variable: promote each per-report distribution to an array.
    lapply(cond_report_dist, function(dist) array(dist))
  } else {
    # Map keeps the list structure (equivalent to mapply SIMPLIFY = FALSE).
    Map(function(joint, dist) outer(joint, dist),
        joint_conditional, cond_report_dist)
  }
}
|
|
|
|
ComputeDistributionEM <- function(reports, report_cohorts, maps,
                                  ignore_other = FALSE,
                                  params = NULL,
                                  params_list = NULL,
                                  marginals = NULL,
                                  estimate_var = FALSE,
                                  num_cores = 10,
                                  em_iter_func = EM,
                                  max_em_iters = 1000) {
  # Computes the joint distribution of num_variables variables, where
  # num_variables is chosen by the client, using the EM algorithm.
  #
  # Args:
  #   reports: A list of num_variables elements, each a 2-dimensional array
  #     containing the counts of each bin for each report
  #   report_cohorts: A num_variables-element list; the ith element is an array
  #     containing the cohort of jth report for ith variable.
  #   maps: A num_variables-element list containing the map for each variable
  #   ignore_other: A boolean describing whether to compute the "other" category
  #   params: RAPPOR encoding parameters. If set, all variables are assumed to
  #     be encoded with these parameters.
  #   params_list: A list of num_variables elements, each of which is the
  #     RAPPOR encoding parameters for a variable (a list itself). If set,
  #     it must be the same length as 'reports'.
  #   marginals: List of estimated marginals for each variable; when NULL,
  #     marginals are decoded here from the observed counts.
  #   estimate_var: A flag telling whether to estimate the variance.
  #   num_cores: number of cores used for the conditional-probability step
  #   em_iter_func: Function that implements the iterative EM algorithm.
  #   max_em_iters: maximum number of EM iterations
  #
  # Returns:
  #   List with the fitted joint distribution, standard deviations, EM
  #   timing / iteration counts, and the raw EM result. Returns NULL if any
  #   variable decodes to zero candidate values.

  num_variables <- length(reports)

  # Exactly one source of encoding parameters is required.
  if (is.null(params) && is.null(params_list)) {
    stop("Either params or params_list must be passed")
  }

  Log('Computing joint conditional')

  # Compute the counts for each variable and then do conditionals.
  joint_conditional = NULL
  found_strings <- list()

  for (j in (1:num_variables)) {
    Log('Processing var %d', j)

    var_report <- reports[[j]]
    var_cohort <- report_cohorts[[j]]
    var_map <- maps[[j]]
    # Shared 'params' takes precedence over the per-variable list.
    if (!is.null(params)) {
      var_params <- params
    } else {
      var_params <- params_list[[j]]
    }

    var_counts <- NULL
    if (is.null(marginals)) {
      Log('\tSumming bits to gets observed counts')
      var_counts <- ComputeCounts(var_report, var_cohort, var_params)

      Log('\tDecoding marginal')
      marginal <- Decode(var_counts, var_map$all_cohorts_map, var_params,
                         quiet = TRUE)$fit
      Log('\tMarginal for var %d has %d values:', j, nrow(marginal))
      print(marginal[, c('estimate', 'proportion')])  # rownames are the string
      cat('\n')

      # Nothing decoded: the joint estimate cannot proceed for this variable.
      if (nrow(marginal) == 0) {
        Log('ERROR: Nothing decoded for variable %d', j)
        return (NULL)
      }
    } else {
      marginal <- marginals[[j]]
    }
    found_strings[[j]] <- marginal$string

    p <- var_params$p
    q <- var_params$q
    f <- var_params$f
    # pstar and qstar needed to compute other probabilities as well as for
    # inputs to GetCondProb{Boolean, String}Reports subsequently
    pstar <- (1 - f / 2) * p + (f / 2) * q
    qstar <- (1 - f / 2) * q + (f / 2) * p
    k <- var_params$k

    # Ignore other probability if either ignore_other is set or k == 1
    # (Boolean RAPPOR)
    if (ignore_other || (k == 1)) {
      # A list of NULLs, one per cohort, so indexing by cohort stays valid.
      prob_other <- vector(mode = "list", length = var_params$m)
    } else {
      # Compute the probability of the "other" category
      if (is.null(var_counts)) {
        var_counts <- ComputeCounts(var_report, var_cohort, var_params)
      }
      prob_other <- GetOtherProbs(var_counts, var_map$map_by_cohort, marginal,
                                  var_params, pstar, qstar)
      found_strings[[j]] <- c(found_strings[[j]], "Other")
    }

    # Get the joint conditional distribution
    Log('\tGetCondProb for each report (%d cores)', num_cores)

    # TODO(pseudorandom): check RAPPOR type more systematically instead of by
    # checking if k == 1
    if (k == 1) {
      cond_report_dist <- GetCondProbBooleanReports(var_report, pstar, qstar,
                                                    num_cores)
    } else {
      cond_report_dist <- GetCondProbStringReports(var_report,
          var_cohort, var_map, var_params$m, pstar, qstar,
          marginal, prob_other, num_cores)
    }

    Log('\tUpdateJointConditional')

    # Update the joint conditional distribution of all variables
    joint_conditional <- UpdateJointConditional(cond_report_dist,
                                                joint_conditional)
  }

  N <- length(joint_conditional)
  dimensions <- dim(joint_conditional[[1]])
  # e.g. 2 x 3
  dimensions_str <- paste(dimensions, collapse = ' x ')
  total_entries <- prod(c(N, dimensions))

  Log('Starting EM with N = %d matrices of size %s (%d entries)',
      N, dimensions_str, total_entries)

  start_time <- proc.time()[['elapsed']]

  # Run expectation maximization to find joint distribution
  em <- em_iter_func(joint_conditional, max_em_iters=max_em_iters,
                     epsilon = 10 ^ -6, verbose = FALSE,
                     estimate_var = estimate_var)

  em_elapsed_time <- proc.time()[['elapsed']] - start_time

  # Label the result axes with the decoded strings (plus "Other" if added).
  dimnames(em$est) <- found_strings
  # Return results in a usable format
  list(fit = em$est,
       sd = em$sd,
       em_elapsed_time = em_elapsed_time,
       num_em_iters = em$num_em_iters,
       # This last field is implementation-specific; it can be used for
       # interactive debugging.
       em = em)
}
|