unplugged-system/external/rappor/tests/rappor_sim.py

#!/usr/bin/python
#
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Run the RAPPOR Python client on simulated input.

It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5
column CSV of RAPPOR'd data.

Input columns: client,cohort,value
Output columns: client,cohort,bloom,prr,irr

TODO:
- cohort should be in the input _input.csv file.

See http://google.github.io/rappor/doc/data-flow.html for details.
"""

import csv
import collections
import optparse
import os
import random
import sys
import time

import rappor  # client library
try:
  import fastrand
except ImportError:
  # The native extension is optional: main() checks this name and falls back
  # to rappor.SecureIrrRand when it is None.
  # sys.stderr.write() is used instead of the `print >>sys.stderr` statement
  # because the latter only works on Python 2.
  sys.stderr.write(
      "Native fastrand module not imported; see README for speedups\n")
  fastrand = None


def log(msg, *args):
  """Write one diagnostic line to stderr.

  Args:
    msg: Message text.  It is treated as a %-format string only when args are
      given, so a literal '%' is safe in a message logged without arguments.
    *args: Optional values interpolated into msg.
  """
  if args:
    msg = msg % args
  # sys.stderr.write() behaves the same on Python 2 and 3, unlike the
  # Python-2-only `print >>sys.stderr, msg` statement form.
  sys.stderr.write('%s\n' % (msg,))


def CreateOptionsParser():
  """Build the optparse parser holding all command-line flags of this script.

  Returns:
    An optparse.OptionParser with the Bloom filter shape flags (--num-bits,
    --num-hashes, --num-cohorts), the probability flags (-p, -q, -f),
    --assoc-testdata, and the randomness selector -r.
  """
  parser = optparse.OptionParser()

  # (flag, dest, default, help) for the integer-valued shape flags.
  int_flags = (
      ('--num-bits', 'num_bits', 16, 'Number of bloom filter bits.'),
      ('--num-hashes', 'num_hashes', 2, 'Number of hashes.'),
      ('--num-cohorts', 'num_cohorts', 64, 'Number of cohorts.'),
  )
  for flag, dest, default, help_text in int_flags:
    parser.add_option(
        flag, type='int', metavar='INT', dest=dest, default=default,
        help=help_text)

  # The three probability flags differ only in their letter.
  for letter in ('p', 'q', 'f'):
    parser.add_option(
        '-' + letter, type='float', metavar='FLOAT', dest='prob_' + letter,
        default=1, help='Probability ' + letter)

  parser.add_option(
      '--assoc-testdata', type='int', dest='assoc_testdata', default=0,
      help='Generate association testdata from true values on stdin.')

  random_modes = ['simple', 'fast']
  parser.add_option(
      '-r', type='choice', metavar='STR', dest='random_mode',
      default='fast', choices=random_modes,
      help='Random algorithm (%s)' % '|'.join(random_modes))

  return parser


def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
                     csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  The data rows are replicated assoc_testdata_count times.  In each row the
  first value is encoded as a string with params1, and the second is
  converted with int() and encoded with params2.  TODO: Generalize this.

  Args:
    params1: rappor.Params used to encode the first column.
    params2: rappor.Params used to encode the second column.
    irr_rand: Randomness object handed to every rappor.Encoder.
    assoc_testdata_count: Number of times to replay the data rows.
    csv_in: Iterable of 2-column rows; the first row is the header, whose
      column names are reused in the output header.
    csv_out: Object with a writerow() method (e.g. csv.writer).

  Raises:
    RuntimeError: If csv_in is empty, i.e. there is no header row.
  """
  row_iter = iter(csv_in)
  try:
    v1_name, v2_name = next(row_iter)
  except StopIteration:
    # Without this check the header names would be unbound and the function
    # would die with a NameError that the script's top level doesn't handle.
    raise RuntimeError('Expected a header row, got empty input')

  # Unpacking also enforces that every data row has exactly two columns.
  rows = [(true_value1, true_value2) for true_value1, true_value2 in row_iter]

  # Use the same column names
  csv_out.writerow(('client', 'cohort', v1_name, v2_name))

  report_index = 0
  for _ in range(assoc_testdata_count):
    for v1, v2 in rows:
      # Every emitted report gets its own client id.
      client_str = 'c%d' % report_index

      # randint(a, b) gives i such that a <= i <= b
      cohort = random.randint(0, params1.num_cohorts - 1)

      string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand)
      bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand)

      irr1 = string_encoder.encode(v1)

      # TODO: Convert to bool and encode with basic RAPPOR
      irr2 = bool_encoder.encode_bits(int(v2))

      irr1_str = rappor.bit_string(irr1, params1.num_bloombits)
      irr2_str = rappor.bit_string(irr2, params2.num_bloombits)

      csv_out.writerow((client_str, cohort, irr1_str, irr2_str))

      report_index += 1


def RapporClientSim(params, irr_rand, csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  Args:
    params: rappor.Params handed to every rappor.Encoder.
    irr_rand: Randomness object handed to every rappor.Encoder.
    csv_in: Iterable of 3-column rows (client, cohort, value).  The first row
      must be the literal header 'client', 'cohort', 'value'.
    csv_out: Object with a writerow() method (e.g. csv.writer).  It receives
      a header row followed by one (client, cohort, bloom, prr, irr) row per
      input data row.

  Raises:
    RuntimeError: If the first input row is not the expected header.
  """
  expected_header = ('client', 'cohort', 'value')

  csv_out.writerow(('client', 'cohort', 'bloom', 'prr', 'irr'))

  # TODO: It would be more instructive/efficient to construct an encoder
  # instance up front per client, rather than one per row below.
  start_time = time.time()

  for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
    if i == 0:
      # Validate the header column by column, then skip it.
      actual_header = (client_str, cohort_str, true_value)
      for expected, actual in zip(expected_header, actual_header):
        if actual != expected:
          raise RuntimeError(
              'Expected %s header, got %s' % (expected, actual))
      continue

    if i % 10000 == 0:
      elapsed = time.time() - start_time
      log('Processed %d inputs in %.2f seconds', i, elapsed)

    # The client string doubles as the encoder's secret.
    e = rappor.Encoder(params, int(cohort_str), client_str, irr_rand)

    # Real users should call e.encode().  For testing purposes, we also want
    # the PRR.
    bloom, prr, irr = e._internal_encode(true_value)

    num_bits = params.num_bloombits
    csv_out.writerow((
        client_str,
        cohort_str,
        rappor.bit_string(bloom, num_bits),
        rappor.bit_string(prr, num_bits),
        rappor.bit_string(irr, num_bits),
    ))


def main(argv):
  """Encode CSV rows read from stdin and write the result as CSV to stdout.

  Depending on --assoc-testdata, this runs either GenAssocTestdata or
  RapporClientSim.

  Args:
    argv: Argument list handed to the option parser.  Positional arguments
      left over after parsing are ignored.
  """
  opts, _ = CreateOptionsParser().parse_args(argv)

  def params_from_flags(num_bloombits):
    # Copy flags into a fresh rappor.Params with the given number of bits.
    result = rappor.Params()
    result.num_bloombits = num_bloombits
    result.num_hashes = opts.num_hashes
    result.num_cohorts = opts.num_cohorts
    result.prob_p = opts.prob_p
    result.prob_q = opts.prob_q
    result.prob_f = opts.prob_f
    return result

  params = params_from_flags(opts.num_bits)

  mode = opts.random_mode
  if mode not in ('simple', 'fast'):
    raise AssertionError

  if mode == 'fast' and fastrand:
    log('Using fastrand extension')
    # NOTE: This doesn't take 'rand'.  It's seeded in C with srand().
    irr_rand = fastrand.FastIrrRand(params)
  else:
    if mode == 'fast':
      # 'fast' was requested but the native module is missing.
      log('Warning: fastrand module not importable; see README for build '
          'instructions.  Falling back to simple randomness.')
    irr_rand = rappor.SecureIrrRand(params)
  # Other possible implementations:
  # - random.SystemRandom (probably uses /dev/urandom on Linux)
  # - HMAC-SHA256 with another secret?  This could match C++ byte for byte.
  #   - or srand(0) might do it.

  reader = csv.reader(sys.stdin)
  writer = csv.writer(sys.stdout)

  if not opts.assoc_testdata:
    RapporClientSim(params, irr_rand, reader, writer)
    return

  # Association test data: the first variable uses the flag-configured number
  # of bits, the second one is boolean.
  string_params = params_from_flags(opts.num_bits)
  bool_params = params_from_flags(1)  # 1 bit for boolean

  GenAssocTestdata(
      string_params, bool_params, irr_rand, opts.assoc_testdata, reader,
      writer)


if __name__ == "__main__":
  try:
    main(sys.argv)
  except RuntimeError as e:
    # `except ... as e` works on Python 2.6+ and Python 3; the old comma form
    # is a syntax error on Python 3.
    log('rappor_sim.py: FATAL: %s', e)
    # Report the failure to the calling shell/script instead of exiting with
    # status 0 after a fatal error.
    sys.exit(1)