unplugged-system/external/rappor/tests/assoc_sim.R

#!/usr/bin/env Rscript
#
# Copyright 2015 Google Inc. All rights reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Simulates inputs on which association analysis can be run.
# Currently assoc_sim.R only supports 2 variables but can
# be easily extended to support more.
# 
# Usage:
#       $ ./assoc_sim.R -n 1000
# Inputs: uvals, params, reports, map, num, unif
#         see how options are parsed below for more information
# Outputs:
#         reports.csv file containing reports
#         map_{1, 2, ...}.csv file(s) containing maps of variables

library("optparse")

options(stringsAsFactors = FALSE)

if(!interactive()) {
  option_list <- list(
    make_option(c("--uvals", "-v"), default = "uvals.csv",
                help = "Filename for list of values over which
                distributions are simulated. The file is a list of
                comma-separated strings each line of which refers
                to a new variable."),
    make_option(c("--params", "-p"), default = "params.csv",
                help = "Filename for RAPPOR parameters"),
    make_option(c("--reports", "-r"), default = "reports.csv",
                help = "Filename for reports"),
    make_option(c("--map", "-m"), default = "map",
                help = "Filename *prefix* for map(s)"),
    make_option(c("--num", "-n"), default = 1e05,
                help = "Number of reports"),
    make_option(c("--unif", "-u"), default = FALSE,
                help = "Run simulation with uniform distribution")
  )
  opts <- parse_args(OptionParser(option_list = option_list))
}    

source("../analysis/R/encode.R")
source("../analysis/R/decode.R")
source("../analysis/R/simulation.R")
source("../analysis/R/read_input.R")
source("../analysis/R/association.R")

# Read unique values of reports from a csv file
# Inputs: filename. The file is expected to contain two rows of strings
#         (one for each variable):
#         "google.com", "apple.com", ...
#         "ssl", "nossl", ...
# Returns: a list containing strings
GetUniqueValsFromFile <- function(filename) {
  contents <- read.csv(filename, header = FALSE)
  # Expect 2 rows of unique vals
  if(nrow(contents) != 2) {
    stop(paste("Unique vals file", filename, "expected to have
               two rows of strings."))
  }
  # Removes superfluous "" entries if the lists of unique values
  # differ in length
  strip_empty <- function(vec) {
    vec[!vec %in% c("")]
  }
  list(var1 = strip_empty(as.vector(t(contents[1,]))),
       var2 = strip_empty(as.vector(t(contents[2,]))))
}

# Simulate correlated reports and write into reportsfile
# Inputs: N = number of reports
#         uvals = list containing a list of unique values
#         params = list with RAPPOR parameters
#         unif = whether to replace poisson with uniform
#         mapfile = file to write maps into (with .csv suffixes)
#         reportsfile = file to write reports into (with .csv suffix)
SimulateReports <- function(N, uvals, params, unif,
                            mapfile, reportsfile) {
  # Compute true distribution
  m <- params$m  

  if (unif) {
    # Draw uniformly from 1 to 10
    v1_samples <- as.integer(runif(N, 1, 10))
  } else {
    # Draw from a Poisson random variable
    v1_samples <- rpois(N, 1) + rep(1, N)
  }
  
  # Pr[var2 = N + 1 | var1 = N] = 0.5
  # Pr[var2 = N     | var1 = N] = 0.5
  v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
  
  tmp_samples <- list(v1_samples, v2_samples)
  
  # Function to pad strings to uval_vec if sample_vec has
  # larger support than the number of strings in uval_vec
  # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
  # only have "value1", "value2", and "value3", samples now
  # over support {"value1", "value2", "value3", "str4", ...}
  PadStrings <- function(sample_vec, uval_vec) {
    if (max(sample_vec) > length(uval_vec)) {
      # Padding uvals to required length
      len <- length(uval_vec)
      max_of_samples <- max(sample_vec)
      uval_vec[(len + 1):max_of_samples] <- apply(
        as.matrix((len + 1):max_of_samples),
        1,
        function(i) sprintf("str%d", i))
    }
    uval_vec
  }
  
  # Pad and update uvals
  uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
                                              uvals[[i]]))

  # Replace integers in tmp_samples with actual sample strings
  samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])

  # Randomly assign cohorts in each dimension
  cohorts <- sample(1:m, N, replace = TRUE)
  
  # Create and write map into mapfile_1.csv and mapfile_2.csv
  map <- lapply(uvals, function(u) CreateMap(u, params))
  write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),
              sep = ",", col.names = FALSE, na = "", quote = FALSE)
  write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
              sep = ",", col.names = FALSE, na = "", quote = FALSE)
  
  # Write reports into a csv file
  # Format:
  #     cohort, bloom filter var1, bloom filter var2
  reports <- lapply(1:2, function(i)
    EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
  # Organize cohorts and reports into format
  write_matrix <- cbind(as.matrix(cohorts),
                        as.matrix(lapply(reports[[1]], 
                            function(x) paste(x, collapse = ""))),
                        as.matrix(lapply(reports[[2]],
                            function(x) paste(x, collapse = ""))))
  write.table(write_matrix, file = reportsfile, quote = FALSE,
              row.names = FALSE, col.names = FALSE, sep = ",")
}

main <- function(opts) {
  ptm <- proc.time()
  
  uvals <- GetUniqueValsFromFile(opts$uvals)
  params <- ReadParameterFile(opts$params)
  SimulateReports(opts$num, uvals, params,  opts$unif, # inputs
                  opts$map, opts$reports)              # outputs
  
  print("PROC.TIME")
  print(proc.time() - ptm)
}

if(!interactive()) {
  main(opts)
}
Initial commit: AOSP 14 with modifications for Unplugged OS 2025-10-06 13:59:42 +00:00			`#!/usr/bin/env Rscript`
			`#`
			`# Copyright 2015 Google Inc. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`# Simulates inputs on which association analysis can be run.`
			`# Currently assoc_sim.R only supports 2 variables but can`
			`# be easily extended to support more.`
			`#`
			`# Usage:`
			`# $ ./assoc_sim.R -n 1000`
			`# Inputs: uvals, params, reports, map, num, unif`
			`# see how options are parsed below for more information`
			`# Outputs:`
			`# reports.csv file containing reports`
			`# map_{1, 2, ...}.csv file(s) containing maps of variables`

			`library("optparse")`

			`options(stringsAsFactors = FALSE)`

			`if(!interactive()) {`
			`option_list <- list(`
			`make_option(c("--uvals", "-v"), default = "uvals.csv",`
			`help = "Filename for list of values over which`
			`distributions are simulated. The file is a list of`
			`comma-separated strings each line of which refers`
			`to a new variable."),`
			`make_option(c("--params", "-p"), default = "params.csv",`
			`help = "Filename for RAPPOR parameters"),`
			`make_option(c("--reports", "-r"), default = "reports.csv",`
			`help = "Filename for reports"),`
			`make_option(c("--map", "-m"), default = "map",`
			`help = "Filename prefix for map(s)"),`
			`make_option(c("--num", "-n"), default = 1e05,`
			`help = "Number of reports"),`
			`make_option(c("--unif", "-u"), default = FALSE,`
			`help = "Run simulation with uniform distribution")`
			`)`
			`opts <- parse_args(OptionParser(option_list = option_list))`
			`}`

			`source("../analysis/R/encode.R")`
			`source("../analysis/R/decode.R")`
			`source("../analysis/R/simulation.R")`
			`source("../analysis/R/read_input.R")`
			`source("../analysis/R/association.R")`

			`# Read unique values of reports from a csv file`
			`# Inputs: filename. The file is expected to contain two rows of strings`
			`# (one for each variable):`
			`# "google.com", "apple.com", ...`
			`# "ssl", "nossl", ...`
			`# Returns: a list containing strings`
			`GetUniqueValsFromFile <- function(filename) {`
			`contents <- read.csv(filename, header = FALSE)`
			`# Expect 2 rows of unique vals`
			`if(nrow(contents) != 2) {`
			`stop(paste("Unique vals file", filename, "expected to have`
			`two rows of strings."))`
			`}`
			`# Removes superfluous "" entries if the lists of unique values`
			`# differ in length`
			`strip_empty <- function(vec) {`
			`vec[!vec %in% c("")]`
			`}`
			`list(var1 = strip_empty(as.vector(t(contents[1,]))),`
			`var2 = strip_empty(as.vector(t(contents[2,]))))`
			`}`

			`# Simulate correlated reports and write into reportsfile`
			`# Inputs: N = number of reports`
			`# uvals = list containing a list of unique values`
			`# params = list with RAPPOR parameters`
			`# unif = whether to replace poisson with uniform`
			`# mapfile = file to write maps into (with .csv suffixes)`
			`# reportsfile = file to write reports into (with .csv suffix)`
			`SimulateReports <- function(N, uvals, params, unif,`
			`mapfile, reportsfile) {`
			`# Compute true distribution`
			`m <- params$m`

			`if (unif) {`
			`# Draw uniformly from 1 to 10`
			`v1_samples <- as.integer(runif(N, 1, 10))`
			`} else {`
			`# Draw from a Poisson random variable`
			`v1_samples <- rpois(N, 1) + rep(1, N)`
			`}`

			`# Pr[var2 = N + 1 \| var1 = N] = 0.5`
			`# Pr[var2 = N \| var1 = N] = 0.5`
			`v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)`

			`tmp_samples <- list(v1_samples, v2_samples)`

			`# Function to pad strings to uval_vec if sample_vec has`
			`# larger support than the number of strings in uval_vec`
			`# For e.g., if samples have support {1, 2, 3, 4, ...} and uvals`
			`# only have "value1", "value2", and "value3", samples now`
			`# over support {"value1", "value2", "value3", "str4", ...}`
			`PadStrings <- function(sample_vec, uval_vec) {`
			`if (max(sample_vec) > length(uval_vec)) {`
			`# Padding uvals to required length`
			`len <- length(uval_vec)`
			`max_of_samples <- max(sample_vec)`
			`uval_vec[(len + 1):max_of_samples] <- apply(`
			`as.matrix((len + 1):max_of_samples),`
			`1,`
			`function(i) sprintf("str%d", i))`
			`}`
			`uval_vec`
			`}`

			`# Pad and update uvals`
			`uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],`
			`uvals[[i]]))`

			`# Replace integers in tmp_samples with actual sample strings`
			`samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])`

			`# Randomly assign cohorts in each dimension`
			`cohorts <- sample(1:m, N, replace = TRUE)`

			`# Create and write map into mapfile_1.csv and mapfile_2.csv`
			`map <- lapply(uvals, function(u) CreateMap(u, params))`
			`write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),`
			`sep = ",", col.names = FALSE, na = "", quote = FALSE)`
			`write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),`
			`sep = ",", col.names = FALSE, na = "", quote = FALSE)`

			`# Write reports into a csv file`
			`# Format:`
			`# cohort, bloom filter var1, bloom filter var2`
			`reports <- lapply(1:2, function(i)`
			`EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))`
			`# Organize cohorts and reports into format`
			`write_matrix <- cbind(as.matrix(cohorts),`
			`as.matrix(lapply(reports[[1]],`
			`function(x) paste(x, collapse = ""))),`
			`as.matrix(lapply(reports[[2]],`
			`function(x) paste(x, collapse = ""))))`
			`write.table(write_matrix, file = reportsfile, quote = FALSE,`
			`row.names = FALSE, col.names = FALSE, sep = ",")`
			`}`

			`main <- function(opts) {`
			`ptm <- proc.time()`

			`uvals <- GetUniqueValsFromFile(opts$uvals)`
			`params <- ReadParameterFile(opts$params)`
			`SimulateReports(opts$num, uvals, params, opts$unif, # inputs`
			`opts$map, opts$reports) # outputs`

			`print("PROC.TIME")`
			`print(proc.time() - ptm)`
			`}`

			`if(!interactive()) {`
			`main(opts)`
			`}`