145 lines
3.9 KiB
R
Executable File
145 lines
3.9 KiB
R
Executable File
#!/usr/bin/env Rscript
|
|
#
|
|
# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
|
|
# Decode() in decode.R.
|
|
|
|
library(optparse)
|
|
|
|
#
|
|
# Command line parsing. Do this first before loading libraries to catch errors
|
|
# quickly. Loading libraries in R is slow.
|
|
#
|
|
|
|
# For command line error checking.
|
|
# Report a command line usage error and terminate the script.
#
# Args:
#   ...: A format string and its values, passed unchanged to sprintf().
#
# The formatted message and a trailing newline are written with cat(), then
# the R process quits with exit status 1. This function does not return.
UsageError <- function(...) {
  formatted <- sprintf(...)
  cat(formatted)
  cat('\n')
  quit(status = 1)
}
|
|
|
|
# Flags understood by this tool, in the form expected by OptionParser().
# The three input files default to "" so that ParseOptions() can tell when
# one of them was not supplied.
option_list <- list(
  # Inputs
  make_option("--map", default = "", help = "Map file (required)"),
  make_option("--counts", default = "", help = "Counts file (required)"),
  make_option("--params", default = "", help = "Params file (required)"),
  make_option(
    "--output-dir",
    dest = "output_dir",
    default = ".",
    help = "Output directory (default .)"
  ),

  # Settings forwarded to Decode().
  make_option("--correction", default = "FDR", help = "Correction method"),
  make_option("--alpha", default = 0.05, help = "Alpha level"),

  # Boolean switch: FALSE unless the flag is present on the command line.
  make_option(
    "--adjust-counts-hack",
    dest = "adjust_counts_hack",
    default = FALSE,
    action = "store_true",
    help = "Allow the counts file to have more rows than cohorts.
Most users should not use this."
  )
)
|
|
|
|
# Parse the command line and verify that the required flags were supplied.
#
# Returns:
#   The option values produced by parse_args() for option_list.
#
# Calls UsageError(), which exits the process, for the first of --map,
# --counts and --params whose value is the empty string.
ParseOptions <- function() {
  # NOTE: This API is bad; if you add positional_arguments, the return value
  # changes!
  opts <- parse_args(OptionParser(option_list = option_list))

  # Checked in this order, so the first missing flag is the one reported.
  missing_flag_errors <- c(
    map = "--map is required.",
    counts = "--counts is required.",
    params = "--params is required."
  )
  for (flag in names(missing_flag_errors)) {
    if (opts[[flag]] == "") {
      UsageError(missing_flag_errors[[flag]])
    }
  }

  opts
}
|
|
|
|
# Parse the flags right away when run as a script, so that usage errors are
# reported before the slower library loading below. In an interactive session
# nothing is parsed and `opts` is left undefined.
if (!interactive()) opts <- ParseOptions()
|
|
|
|
#
|
|
# Load libraries and source our own code.
|
|
#
|
|
|
|
library(RJSONIO)
|
|
|
|
# So we don't have to change pwd
|
|
# Source an R file from the RAPPOR repository without changing the working
# directory.
#
# Args:
#   rel_path: Path of the file to source, relative to the repository root.
#
# The repository root is taken from the RAPPOR_REPO environment variable. If
# the variable is unset or empty, rel_path is used as is, i.e. it is resolved
# against the current directory. A path separator is appended to the root when
# it does not already end with one, so both "/path/to/rappor" and
# "/path/to/rappor/" work.
#
# Returns:
#   The value of source(), invisibly.
source.rappor <- function(rel_path) {
  repo_root <- Sys.getenv("RAPPOR_REPO", "")

  # Plain concatenation of a root without a trailing separator would glue the
  # last directory name onto the first path component of rel_path.
  if (nzchar(repo_root) && !grepl("[/\\\\]$", repo_root)) {
    repo_root <- paste0(repo_root, "/")
  }

  abs_path <- paste0(repo_root, rel_path)
  source(abs_path)
}
|
|
|
|
# Source the project's own R files, in this order, through source.rappor().
invisible(lapply(
  c(
    "analysis/R/read_input.R",
    "analysis/R/decode.R",
    "analysis/R/util.R",
    "analysis/R/alternative.R"
  ),
  source.rappor
))

# Keep strings as character vectors instead of converting them to factors.
options(stringsAsFactors = FALSE)
|
|
|
|
|
|
# Decode one RAPPOR data set and write the results to the output directory.
#
# Args:
#   opts: Parsed command line options as returned by ParseOptions(). The
#     fields read here are params, counts, map, output_dir, correction, alpha
#     and adjust_counts_hack.
#
# Writes results.csv (the fit), residual.png (histogram of the residual) and
# metrics.json (the metrics plus total elapsed time) into opts$output_dir,
# creating that directory if it does not exist yet. Progress is reported
# through Log(), and the privacy summary is printed at the end.
#
# Quits the R process with status 1 if the analysis returns no strings.
main <- function(opts) {
  Log("decode-dist")
  Log("argv:")
  print(commandArgs(TRUE))

  # Make sure the output directory exists before doing any expensive work, so
  # that a missing directory does not surface only after decoding has
  # finished. An existing directory is left untouched; if creation fails, the
  # first write below reports the error.
  dir.create(opts$output_dir, recursive = TRUE, showWarnings = FALSE)

  Log("Loading inputs")

  # Run a single model if all inputs are specified.
  params <- ReadParameterFile(opts$params)
  counts <- ReadCountsFile(opts$counts, params,
                           adjust_counts = opts$adjust_counts_hack)
  counts <- AdjustCounts(counts, params)

  # The left-most column has totals.
  num_reports <- sum(counts[, 1])

  map <- LoadMapFile(opts$map, params)

  Log("Decoding %d reports", num_reports)
  res <- Decode(counts, map$map, params, correction = opts$correction,
                alpha = opts$alpha)
  Log("Done decoding")

  if (nrow(res$fit) == 0) {
    Log("FATAL: Analysis returned no strings.")
    quit(status = 1)
  }

  # Write analysis results as CSV.
  results_csv_path <- file.path(opts$output_dir, 'results.csv')
  write.csv(res$fit, file = results_csv_path, row.names = FALSE)

  # Write residual histogram as a png.
  results_png_path <- file.path(opts$output_dir, 'residual.png')
  breaks <- pretty(res$residual, n = 200)
  histogram <- hist(res$residual, breaks, plot = FALSE)
  # Convert the histogram to frequencies.
  histogram$counts <- histogram$counts / sum(histogram$counts)

  png(results_png_path)
  # Close the device even if plotting fails, so that an interactive caller is
  # not left with a dangling graphics device.
  tryCatch(
    plot(histogram, main = "Histogram of the residual",
         xlab = sprintf("Residual (observed - explained, %d x %d values)",
                        params$m, params$k)),
    finally = dev.off()
  )

  res$metrics$total_elapsed_time <- proc.time()[['elapsed']]

  # Write summary as JSON (scalar values).
  metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
  m <- toJSON(res$metrics)
  writeLines(m, con = metrics_json_path)
  Log("Wrote %s, %s, and %s", results_csv_path, results_png_path,
      metrics_json_path)

  # TODO:
  # - These are in a 2 column 'parameters' and 'values' format. Should these
  #   just be a plain list?
  # - Should any of these privacy params be in metrics.json?

  Log("Privacy summary:")
  print(res$privacy)
  cat("\n")

  Log('DONE')
}
|
|
|
|
# Script entry point: run the decoder with the flags parsed earlier. Nothing
# runs automatically in an interactive session.
if (!interactive()) main(opts)
|