243 lines
7.4 KiB
Python
Executable File
243 lines
7.4 KiB
Python
Executable File
#!/usr/bin/python
|
|
#
|
|
# Copyright 2014 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Run the RAPPOR Python client on simulated input.
|
|
|
|
It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5
|
|
column CSV of RAPPOR'd data.
|
|
|
|
Input columns: client,cohort,value
|
|
Output columns: client,cohort,bloom,prr,irr
|
|
|
|
TODO:
|
|
- cohort should be in the input _input.csv file.
|
|
|
|
See http://google.github.io/rappor/doc/data-flow.html for details.
|
|
"""
|
|
|
|
import csv
|
|
import collections
|
|
import optparse
|
|
import os
|
|
import random
|
|
import sys
|
|
import time
|
|
|
|
import rappor # client library
|
|
# fastrand is an optional native extension.  When it cannot be imported the
# name is bound to None so the rest of the module can test for it.
try:
  import fastrand
except ImportError:
  # sys.stderr.write() instead of the Python 2-only "print >>" statement, so
  # the notice is emitted the same way under Python 2 and Python 3.
  sys.stderr.write(
      "Native fastrand module not imported; see README for speedups\n")
  fastrand = None
|
|
|
|
|
|
def log(msg, *args):
  """Write one diagnostic line to stderr.

  Args:
    msg: Message text.  It is used as a %-format string only when args are
      given, so a literal '%' is safe in a message passed without args.
    *args: Optional values interpolated into msg with the % operator.
  """
  if args:
    msg = msg % args
  # The statement form "print >>sys.stderr, msg" only works on Python 2;
  # writing to the stream directly behaves the same on Python 2 and 3.
  sys.stderr.write('%s\n' % (msg,))
|
|
|
|
|
|
def CreateOptionsParser():
  """Return an optparse.OptionParser describing the simulation's flags.

  The flags are the encoding parameters (--num-bits, --num-hashes,
  --num-cohorts, -p, -q, -f), the --assoc-testdata count, and -r, which
  selects the random algorithm ('simple' or 'fast').
  """
  random_modes = ['simple', 'fast']

  # (flag, optparse keyword arguments) pairs, registered in this order.
  flag_table = [
      ('--num-bits',
       dict(type='int', metavar='INT', dest='num_bits', default=16,
            help='Number of bloom filter bits.')),
      ('--num-hashes',
       dict(type='int', metavar='INT', dest='num_hashes', default=2,
            help='Number of hashes.')),
      ('--num-cohorts',
       dict(type='int', metavar='INT', dest='num_cohorts', default=64,
            help='Number of cohorts.')),
      ('-p',
       dict(type='float', metavar='FLOAT', dest='prob_p', default=1,
            help='Probability p')),
      ('-q',
       dict(type='float', metavar='FLOAT', dest='prob_q', default=1,
            help='Probability q')),
      ('-f',
       dict(type='float', metavar='FLOAT', dest='prob_f', default=1,
            help='Probability f')),
      ('--assoc-testdata',
       dict(type='int', dest='assoc_testdata', default=0,
            help='Generate association testdata from true values on stdin.')),
      ('-r',
       dict(type='choice', metavar='STR', dest='random_mode', default='fast',
            choices=random_modes,
            help='Random algorithm (%s)' % '|'.join(random_modes))),
  ]

  parser = optparse.OptionParser()
  for flag, attrs in flag_table:
    parser.add_option(flag, **attrs)
  return parser
|
|
|
|
|
|
def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
                     csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  The first input row is a header naming the two variables; every following
  row is a (string value, integer-like value) pair.  The whole set of data
  rows is replicated assoc_testdata_count times.  Each emitted report gets a
  fresh client id ('c0', 'c1', ...) and a cohort drawn uniformly from
  [0, params1.num_cohorts - 1].  TODO: Generalize this.

  Args:
    params1: rappor.Params used to encode the first column.
    params2: rappor.Params used to encode the second column.
    irr_rand: Randomness object handed to each rappor.Encoder.
    assoc_testdata_count: Number of times to replicate the input rows.
    csv_in: Iterable of 2-field rows, header first (e.g. a csv.reader).
    csv_out: Object with a writerow() method (e.g. a csv.writer).

  Raises:
    RuntimeError: If csv_in yields no rows at all, so there is no header.
    ValueError: If a row does not have exactly two fields, or the second
      field of a data row is not accepted by int().
  """
  rows_in = iter(csv_in)
  try:
    # The header supplies the output names of the two encoded columns.
    v1_name, v2_name = next(rows_in)
  except StopIteration:
    # Without this check the column names would be unbound further down.
    raise RuntimeError('Expected a header row on input, got no rows')

  # Materialize the data rows: they are iterated once per replication.
  true_values = [(v1, v2) for v1, v2 in rows_in]

  # Use the same column names
  csv_out.writerow(('client', 'cohort', v1_name, v2_name))

  report_index = 0
  for _ in range(assoc_testdata_count):
    for v1, v2 in true_values:
      client_str = 'c%d' % report_index

      # randint(a, b) gives i such that a <= i <= b
      cohort = random.randint(0, params1.num_cohorts - 1)

      # Both encoders share the cohort and client secret of this report.
      string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand)
      bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand)

      irr1 = string_encoder.encode(v1)

      # TODO: Convert to bool and encode with basic RAPPOR
      irr2 = bool_encoder.encode_bits(int(v2))

      irr1_str = rappor.bit_string(irr1, params1.num_bloombits)
      irr2_str = rappor.bit_string(irr2, params2.num_bloombits)

      csv_out.writerow((client_str, cohort, irr1_str, irr2_str))

      report_index += 1
|
|
|
|
|
|
def RapporClientSim(params, irr_rand, csv_in, csv_out):
  """Read true values from csv_in and output encoded values on csv_out.

  Args:
    params: rappor.Params passed to every encoder; its num_bloombits is also
      the width used when formatting the bit strings.
    irr_rand: Randomness object handed to each rappor.Encoder.
    csv_in: Iterable of 3-field rows (e.g. a csv.reader).  The first row must
      be the header client,cohort,value; the rest are data rows.
    csv_out: Object with a writerow() method (e.g. a csv.writer).  It receives
      the header client,cohort,bloom,prr,irr and then one row per data row.

  Raises:
    RuntimeError: If the first input row is not the expected header.
    ValueError: If a row does not have exactly three fields, or a cohort
      field is not accepted by int().
  """
  csv_out.writerow(('client', 'cohort', 'bloom', 'prr', 'irr'))

  # TODO: It would be more instructive/efficient to construct an encoder
  # instance up front per client, rather than one per row below.
  start_time = time.time()

  expected_header = ('client', 'cohort', 'value')

  for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
    if i == 0:
      # Validate the header field by field, then skip it.
      actual_header = (client_str, cohort_str, true_value)
      for expected, actual in zip(expected_header, actual_header):
        if actual != expected:
          raise RuntimeError(
              'Expected %s header, got %s' % (expected, actual))
      continue

    if i % 10000 == 0:
      elapsed = time.time() - start_time
      log('Processed %d inputs in %.2f seconds', i, elapsed)

    cohort = int(cohort_str)
    # The client id is passed to the encoder as its secret.
    secret = client_str
    e = rappor.Encoder(params, cohort, secret, irr_rand)

    # Real users should call e.encode().  For testing purposes, we also want
    # the PRR.
    bloom, prr, irr = e._internal_encode(true_value)

    bloom_str = rappor.bit_string(bloom, params.num_bloombits)
    prr_str = rappor.bit_string(prr, params.num_bloombits)
    irr_str = rappor.bit_string(irr, params.num_bloombits)

    csv_out.writerow((client_str, cohort_str, bloom_str, prr_str, irr_str))
|
|
|
|
|
|
def main(argv):
  """Parse flags, then encode the CSV on stdin and write the result to stdout.

  With a non-zero --assoc-testdata, association test data is generated via
  GenAssocTestdata; otherwise the input is run through RapporClientSim.

  Args:
    argv: Full argument list, as in sys.argv.
  """
  (opts, argv) = CreateOptionsParser().parse_args(argv)

  def params_from_flags(num_bloombits):
    """Return a new rappor.Params filled in from the parsed flags."""
    flag_params = rappor.Params()
    flag_params.num_bloombits = num_bloombits
    flag_params.num_hashes = opts.num_hashes
    flag_params.num_cohorts = opts.num_cohorts
    flag_params.prob_p = opts.prob_p
    flag_params.prob_q = opts.prob_q
    flag_params.prob_f = opts.prob_f
    return flag_params

  # Copy flags into params
  params = params_from_flags(opts.num_bits)

  mode = opts.random_mode
  if mode not in ('simple', 'fast'):
    raise AssertionError

  if mode == 'fast' and fastrand:
    log('Using fastrand extension')
    # NOTE: This doesn't take 'rand'.  It's seeded in C with srand().
    irr_rand = fastrand.FastIrrRand(params)
  else:
    if mode == 'fast':
      log('Warning: fastrand module not importable; see README for build '
          'instructions.  Falling back to simple randomness.')
    irr_rand = rappor.SecureIrrRand(params)
  # Other possible implementations:
  # - random.SystemRandom (probably uses /dev/urandom on Linux)
  # - HMAC-SHA256 with another secret?  This could match C++ byte for byte.
  #   - or srand(0) might do it.

  csv_in = csv.reader(sys.stdin)
  csv_out = csv.writer(sys.stdout)

  if not opts.assoc_testdata:
    RapporClientSim(params, irr_rand, csv_in, csv_out)
    return

  # First variable uses the flag-configured number of bits; the second one
  # is boolean, so it gets 1 bit.
  params1 = params_from_flags(opts.num_bits)
  params2 = params_from_flags(1)

  GenAssocTestdata(
      params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out)
|
|
|
|
|
|
if __name__ == "__main__":
  try:
    main(sys.argv)
  except RuntimeError as e:
    # "except X as e" is accepted by Python 2.6+ and Python 3, unlike the
    # comma form.  Exit non-zero so callers can detect the fatal error.
    log('rappor_sim.py: FATAL: %s', e)
    sys.exit(1)
|