unplugged-system/external/rappor/bin/decode_dist.R

#!/usr/bin/env Rscript
#
# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
# Decode() in decode.R.
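#
# Example invocation (the input file names here are illustrative):
#
#   ./decode_dist.R --map map.csv --counts counts.csv --params params.csv \
#       --output-dir results/
#
# The outputs (results.csv, residual.png, metrics.json) are written to the
# directory given by --output-dir.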

library(optparse)

#
# Command line parsing. Do this first before loading libraries to catch errors
# quickly. Loading libraries in R is slow.
#

# For command line error checking.
UsageError <- function(...) {
  cat(sprintf(...))
  cat('\n')
  quit(status = 1)
}

option_list <- list(
  # Inputs
  make_option("--map", default="", help="Map file (required)"),
  make_option("--counts", default="", help="Counts file (required)"),
  make_option("--params", default="", help="Params file (required)"),
  make_option("--output-dir", dest="output_dir", default=".",
              help="Output directory (default .)"),
  make_option("--correction", default="FDR", help="Correction method"),
  make_option("--alpha", default=.05, help="Alpha level"),
  make_option("--adjust-counts-hack", dest="adjust_counts_hack",
              default=FALSE, action="store_true",
              help="Allow the counts file to have more rows than cohorts.
              Most users should not use this.")
)

ParseOptions <- function() {
  # NOTE: This API is bad; if you add positional_arguments, the return value
  # changes! (parse_args() then returns a list with $options and $args instead
  # of the flat options list used here.)
  parser <- OptionParser(option_list = option_list)
  opts <- parse_args(parser)

  if (opts$map == "") {
    UsageError("--map is required.")
  }
  if (opts$counts == "") {
    UsageError("--counts is required.")
  }
  if (opts$params == "") {
    UsageError("--params is required.")
  }
  return(opts)
}

if (!interactive()) {
  opts <- ParseOptions()
}

#
# Load libraries and source our own code.
#
library(RJSONIO)
# So we don't have to change pwd
source.rappor <- function(rel_path) {
  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
  source(abs_path)
}
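# NOTE: paste0() joins with no separator, so if RAPPOR_REPO is set it must end
# with a trailing slash. When it is unset, the analysis/R/ paths below are
# resolved relative to the current working directory.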
source.rappor("analysis/R/read_input.R")
source.rappor("analysis/R/decode.R")
source.rappor("analysis/R/util.R")
source.rappor("analysis/R/alternative.R")
options(stringsAsFactors = FALSE)
main <- function(opts) {
  Log("decode-dist")
  Log("argv:")
  print(commandArgs(TRUE))

  Log("Loading inputs")

  # Run a single model if all inputs are specified.
  params <- ReadParameterFile(opts$params)
  counts <- ReadCountsFile(opts$counts, params,
                           adjust_counts = opts$adjust_counts_hack)
  counts <- AdjustCounts(counts, params)

  # The left-most column has totals.
  num_reports <- sum(counts[, 1])

  map <- LoadMapFile(opts$map, params)

  Log("Decoding %d reports", num_reports)
  res <- Decode(counts, map$map, params, correction = opts$correction,
                alpha = opts$alpha)
  Log("Done decoding")

  if (nrow(res$fit) == 0) {
    Log("FATAL: Analysis returned no strings.")
    quit(status = 1)
  }

  # Write analysis results as CSV.
  results_csv_path <- file.path(opts$output_dir, 'results.csv')
  write.csv(res$fit, file = results_csv_path, row.names = FALSE)

  # Write the residual histogram as a PNG.
  results_png_path <- file.path(opts$output_dir, 'residual.png')
  png(results_png_path)
  breaks <- pretty(res$residual, n = 200)
  histogram <- hist(res$residual, breaks, plot = FALSE)
  histogram$counts <- histogram$counts / sum(histogram$counts)  # convert counts to frequencies
  plot(histogram, main = "Histogram of the residual",
       xlab = sprintf("Residual (observed - explained, %d x %d values)",
                      params$m, params$k))
  dev.off()

  res$metrics$total_elapsed_time <- proc.time()[['elapsed']]

  # Write summary as JSON (scalar values).
  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
  m <- toJSON(res$metrics)
  writeLines(m, con = metrics_json_path)

  Log("Wrote %s, %s, and %s", results_csv_path, results_png_path,
      metrics_json_path)

  # TODO:
  # - These are in a 2-column 'parameters' and 'values' format. Should they
  #   just be a plain list?
  # - Should any of these privacy params be in metrics.json?
  Log("Privacy summary:")
  print(res$privacy)
  cat("\n")

  Log('DONE')
}

if (!interactive()) {
  main(opts)
}