155 lines
4.5 KiB
R
Executable File
155 lines
4.5 KiB
R
Executable File
# Copyright 2014 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
#
|
|
# Read parameter, counts and map files.
|
|
|
|
library(Matrix)
|
|
|
|
source.rappor <- function(rel_path) {
|
|
abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
|
|
source(abs_path)
|
|
}
|
|
|
|
source.rappor("analysis/R/util.R") # for Log
|
|
|
|
|
|
ReadParameterFile <- function(params_file) {
|
|
# Read parameter file. Format:
|
|
# k, h, m, p, q, f
|
|
# 128, 2, 8, 0.5, 0.75, 0.75
|
|
|
|
params <- as.list(read.csv(params_file))
|
|
if (length(params) != 6) {
|
|
stop("There should be exactly 6 columns in the parameter file.")
|
|
}
|
|
if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) {
|
|
stop("Parameter names must be k,h,m,p,q,f.")
|
|
}
|
|
params
|
|
}
|
|
|
|
# Handle the case of redundant cohorts, i.e. the counts file needs to be
|
|
# further aggregated to obtain counts for the number of cohorts specified in
|
|
# the params file.
|
|
#
|
|
# NOTE: Why is this happening?
|
|
AdjustCounts <- function(counts, params) {
|
|
apply(counts, 2, function(x) {
|
|
tapply(x, rep(1:params$m, nrow(counts) / params$m), sum)
|
|
})
|
|
}
|
|
|
|
ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) {
|
|
# Read in the counts file.
|
|
if (!file.exists(counts_file)) {
|
|
return(NULL)
|
|
}
|
|
counts <- read.csv(counts_file, header = FALSE)
|
|
|
|
if (adjust_counts) {
|
|
counts <- AdjustCounts(counts, params)
|
|
}
|
|
|
|
if (nrow(counts) != params$m) {
|
|
stop(sprintf("Got %d rows in the counts file, expected m = %d",
|
|
nrow(counts), params$m))
|
|
}
|
|
|
|
if ((ncol(counts) - 1) != params$k) {
|
|
stop(paste0("Counts file: number of columns should equal to k + 1: ",
|
|
ncol(counts)))
|
|
}
|
|
|
|
if (any(counts < 0)) {
|
|
stop("Counts file: all counts must be positive.")
|
|
}
|
|
|
|
# Turn counts from a data frame into a matrix. (In R a data frame and matrix
|
|
# are sometimes interchangeable, but sometimes we need it to be matrix.)
|
|
as.matrix(counts)
|
|
}
|
|
|
|
ReadMapFile <- function(map_file, params) {
|
|
# Read in the map file which is in the following format (two hash functions):
|
|
# str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ...
|
|
# str2, ...
|
|
# Output:
|
|
# map: a sparse representation of set bits for each candidate string.
|
|
# strs: a vector of all candidate strings.
|
|
|
|
Log("Parsing %s", map_file)
|
|
|
|
map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE)
|
|
strs <- map_pos[, 1]
|
|
strs[strs == ""] <- "Empty"
|
|
|
|
# Remove duplicated strings.
|
|
ind <- which(!duplicated(strs))
|
|
strs <- strs[ind]
|
|
map_pos <- map_pos[ind, ]
|
|
|
|
n <- ncol(map_pos) - 1
|
|
if (n != (params$h * params$m)) {
|
|
stop(paste0("Map file: number of columns should equal hm + 1:",
|
|
n, "_", params$h * params$m))
|
|
}
|
|
|
|
row_pos <- unlist(map_pos[, -1], use.names = FALSE)
|
|
col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1)
|
|
|
|
# TODO: When would this ever happen?
|
|
removed <- which(is.na(row_pos))
|
|
if (length(removed) > 0) {
|
|
Log("Removed %d entries", length(removed))
|
|
row_pos <- row_pos[-removed]
|
|
col_pos <- col_pos[-removed]
|
|
}
|
|
|
|
map <- sparseMatrix(row_pos, col_pos,
|
|
dims = c(params$m * params$k, length(strs)))
|
|
|
|
colnames(map) <- strs
|
|
list(map = map, strs = strs, map_pos = map_pos)
|
|
}
|
|
|
|
LoadMapFile <- function(map_file, params) {
|
|
# Reads the map file, caching an .rda (R binary data) version of it to speed
|
|
# up future loads.
|
|
|
|
rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE)
|
|
# This must be unique per process, so concurrent processes don't try to
|
|
# write the same file.
|
|
tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid())
|
|
|
|
# First save to a temp file, and then atomically rename to the destination.
|
|
if (file.exists(rda_path)) {
|
|
Log("Loading %s", rda_path)
|
|
load(rda_path, .GlobalEnv) # creates the 'map' variable in the global env
|
|
} else {
|
|
map <- ReadMapFile(map_file, params)
|
|
|
|
Log("Saving %s as an rda file for faster access", map_file)
|
|
tryCatch({
|
|
save(map, file = tmp_path)
|
|
file.rename(tmp_path, rda_path)
|
|
}, warning = function(w) {
|
|
Log("WARNING: %s", w)
|
|
}, error = function(e) {
|
|
Log("ERROR: %s", e)
|
|
})
|
|
}
|
|
return(map)
|
|
}
|