126 lines
4.6 KiB
R
Executable File
126 lines
4.6 KiB
R
Executable File
#!/usr/bin/env Rscript
|
|
#
|
|
# Copyright 2015 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Reads map files, report files, and RAPPOR parameters to run
|
|
# an EM algorithm to estimate joint distribution over two or more variables
|
|
#
|
|
# Usage:
|
|
# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
|
|
# -reports reports.csv \
|
|
# Inputs: map1, map2, reports, params
|
|
# see how options are parsed below for more information
|
|
# Outputs:
|
|
# prints a table with estimated joint probability masses
|
|
# over candidate strings
|
|
# Ex.
|
|
# ssl nossl
|
|
# intel 0.1 0.3
|
|
# google 0.5 0.1
|
|
|
|
library("optparse")
|
|
|
|
options(stringsAsFactors = FALSE)
|
|
|
|
if(!interactive()) {
|
|
option_list <- list(
|
|
# Flags
|
|
make_option(c("--map1", "-m1"), default = "map_1.csv",
|
|
help = "Hashed candidates for 1st variable"),
|
|
make_option(c("--map2", "-m2"), default = "map_2.csv",
|
|
help = "Hashed candidates for 2nd variable"),
|
|
make_option(c("--reports", "-r"), default = "reports.csv",
|
|
help = "File with raw reports as <cohort, report1, report2>"),
|
|
make_option(c("--params", "-p"), default = "params.csv",
|
|
help = "Filename for RAPPOR parameters")
|
|
)
|
|
opts <- parse_args(OptionParser(option_list = option_list))
|
|
}
|
|
|
|
source("../analysis/R/encode.R")
|
|
source("../analysis/R/decode.R")
|
|
source("../analysis/R/simulation.R")
|
|
source("../analysis/R/read_input.R")
|
|
source("../analysis/R/association.R")
|
|
|
|
# This function processes the maps loaded using ReadMapFile
|
|
# Association analysis requires a map object with a map
|
|
# field that has the map split into cohorts and an rmap field
|
|
# that has all the cohorts combined
|
|
# Arguments:
|
|
# map = map object with cohorts as sparse matrix in
|
|
# object map$map
|
|
# This is the expected object from ReadMapFile
|
|
# params = data field with parameters
|
|
# TODO(pseudorandom): move this functionality to ReadMapFile
|
|
ProcessMap <- function(map, params) {
|
|
map$rmap <- map$map
|
|
split_map <- function(i, map_struct) {
|
|
numbits <- params$k
|
|
indices <- which(as.matrix(
|
|
map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
|
|
arr.ind = TRUE)
|
|
sparseMatrix(indices[, "row"], indices[, "col"],
|
|
dims = c(numbits, max(indices[, "col"])))
|
|
}
|
|
map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap))
|
|
map
|
|
}
|
|
|
|
main <- function(opts) {
|
|
ptm <- proc.time()
|
|
|
|
params <- ReadParameterFile(opts$params)
|
|
opts_map <- list(opts$map1, opts$map2)
|
|
map <- lapply(opts_map, function(o)
|
|
ProcessMap(ReadMapFile(o, params = params),
|
|
params = params))
|
|
# Reports must be of the format
|
|
# cohort no, rappor bitstring 1, rappor bitstring 2
|
|
reportsObj <- read.csv(opts$reports,
|
|
colClasses = c("integer", "character", "character"),
|
|
header = FALSE)
|
|
|
|
# Parsing reportsObj
|
|
# ComputeDistributionEM allows for different sets of cohorts
|
|
# for each variable. Here, both sets of cohorts are identical
|
|
co <- as.list(reportsObj[1])[[1]]
|
|
cohorts <- list(co, co)
|
|
# Parse reports from reportObj cols 2 and 3
|
|
reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1]))
|
|
|
|
# Split strings into bit arrays (as required by assoc analysis)
|
|
reports <- lapply(1:2, function(i) {
|
|
# apply the following function to each of reports[[1]] and reports[[2]]
|
|
lapply(reports[[i]][[1]], function(x) {
|
|
# function splits strings and converts them to numeric values
|
|
as.numeric(strsplit(x, split = "")[[1]])
|
|
})
|
|
})
|
|
|
|
joint_dist <- ComputeDistributionEM(reports, cohorts, map,
|
|
ignore_other = TRUE,
|
|
params, marginals = NULL,
|
|
estimate_var = FALSE)
|
|
# TODO(pseudorandom): Export the results to a file for further analysis
|
|
print("JOINT_DIST$FIT")
|
|
print(joint_dist$fit)
|
|
print("PROC.TIME")
|
|
print(proc.time() - ptm)
|
|
}
|
|
|
|
if(!interactive()) {
|
|
main(opts)
|
|
} |