unplugged-system/external/rappor/client/python/rappor.py

# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""RAPPOR client library.

Privacy is ensured without a third party by only sending RAPPOR'd data over the
network (as opposed to raw client data).

Note that we use MD5 for the Bloom filter hash function. (security not required)
"""
import csv
import hashlib
import hmac
import json
import struct
import sys

from random import SystemRandom

class Error(Exception):
  pass


def log(msg, *args):
  if args:
    msg = msg % args
  print >>sys.stderr, msg


class Params(object):
  """RAPPOR encoding parameters.

  These affect privacy/anonymity.  See the paper for details.
  """
  def __init__(self):
    self.num_bloombits = 16      # Number of bloom filter bits (k)
    self.num_hashes = 2          # Number of bloom filter hashes (h)
    self.num_cohorts = 64        # Number of cohorts (m)
    self.prob_p = 0.50           # Probability p
    self.prob_q = 0.75           # Probability q
    self.prob_f = 0.50           # Probability f

  # For testing
  def __eq__(self, other):
    return self.__dict__ == other.__dict__

  def __repr__(self):
    return repr(self.__dict__)

  def to_json(self):
    """Convert this instance to JSON.

    The names are be compatible with the apps/api server.
    """
    return json.dumps({
        'numBits': self.num_bloombits,
        'numHashes': self.num_hashes,
        'numCohorts': self.num_cohorts,
        'probPrr': self.prob_f,
        'probIrr0': self.prob_p,
        'probIrr1': self.prob_q,
    })

  # NOTE:
  # - from_csv is currently used in sum_bits.py
  # - to_csv is in rappor_sim.print_params
  @staticmethod
  def from_csv(f):
    """Read the RAPPOR parameters from a CSV file.

    Args:
      f: file handle

    Returns:
      Params instance.

    Raises:
      rappor.Error: when the file is malformed.
    """
    c = csv.reader(f)
    ok = False
    p = Params()
    for i, row in enumerate(c):

      if i == 0:
        if row != ['k', 'h', 'm', 'p', 'q', 'f']:
          raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row)

      elif i == 1:
        try:
          # NOTE: May raise exceptions
          p.num_bloombits = int(row[0])
          p.num_hashes = int(row[1])
          p.num_cohorts = int(row[2])
          p.prob_p = float(row[3])
          p.prob_q = float(row[4])
          p.prob_f = float(row[5])
        except (ValueError, IndexError) as e:
          raise Error('Row is malformed: %s' % e)
        ok = True

      else:
        raise Error('Params file should only have two rows')

    if not ok:
      raise Error("Expected second row with params")

    return p


class _SecureRandom(object):
  """Returns an integer where each bit has probability p of being 1."""

  def __init__(self, prob_one, num_bits):
    self.prob_one = prob_one
    self.num_bits = num_bits

  def __call__(self):
    p = self.prob_one
    rand = SystemRandom()
    r = 0

    for i in xrange(self.num_bits):
      bit = rand.random() < p
      r |= (bit << i)  # using bool as int
    return r


class SecureIrrRand(object):
  """Python's os.random()"""

  def __init__(self, params):
    """
    Args:
      params: rappor.Params
    """
    num_bits = params.num_bloombits
    # IRR probabilities

    self.p_gen = _SecureRandom(params.prob_p, num_bits)
    self.q_gen = _SecureRandom(params.prob_q, num_bits)


def to_big_endian(i):
  """Convert an integer to a 4 byte big endian string.  Used for hashing."""
  # https://docs.python.org/2/library/struct.html
  # - Big Endian (>) for consistent network byte order.
  # - L means 4 bytes when using >
  return struct.pack('>L', i)


def get_bloom_bits(word, cohort, num_hashes, num_bloombits):
  """Return an array of bits to set in the bloom filter.

  In the real report, we bitwise-OR them together.  In hash candidates, we put
  them in separate entries in the "map" matrix.
  """
  value = to_big_endian(cohort) + word  # Cohort is 4 byte prefix.
  md5 = hashlib.md5(value)

  digest = md5.digest()

  # Each has is a byte, which means we could have up to 256 bit Bloom filters.
  # There are 16 bytes in an MD5, in which case we can have up to 16 hash
  # functions per Bloom filter.
  if num_hashes > len(digest):
    raise RuntimeError("Can't have more than %d hashes" % md5)

  #log('hash_input %r', value)
  #log('Cohort %d', cohort)
  #log('MD5 %s', md5.hexdigest())

  return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)]


def get_prr_masks(secret, word, prob_f, num_bits):
  h = hmac.new(secret, word, digestmod=hashlib.sha256)
  #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())

  # Now go through each byte
  digest_bytes = h.digest()
  assert len(digest_bytes) == 32

  # Use 32 bits.  If we want 64 bits, it may be fine to generate another 32
  # bytes by repeated HMAC.  For arbitrary numbers of bytes it's probably
  # better to use the HMAC-DRBG algorithm.
  if num_bits > len(digest_bytes):
    raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d))

  threshold128 = prob_f * 128

  uniform = 0
  f_mask = 0

  for i in xrange(num_bits):
    ch = digest_bytes[i]
    byte = ord(ch)

    u_bit = byte & 0x01  # 1 bit of entropy
    uniform |= (u_bit << i)  # maybe set bit in mask

    rand128 = byte >> 1  # 7 bits of entropy
    noise_bit = (rand128 < threshold128)
    f_mask |= (noise_bit << i)  # maybe set bit in mask

  return uniform, f_mask


def bit_string(irr, num_bloombits):
  """Like bin(), but uses leading zeroes, and no '0b'."""
  s = ''
  bits = []
  for bit_num in xrange(num_bloombits):
    if irr & (1 << bit_num):
      bits.append('1')
    else:
      bits.append('0')
  return ''.join(reversed(bits))


class Encoder(object):
  """Obfuscates values for a given user using the RAPPOR privacy algorithm."""

  def __init__(self, params, cohort, secret, irr_rand):
    """
    Args:
      params: RAPPOR Params() controlling privacy
      cohort: integer cohort, for Bloom hashing.
      secret: secret string, for the PRR to be a deterministic function of the
        reported value.
      irr_rand: IRR randomness interface.
    """
    # RAPPOR params.  NOTE: num_cohorts isn't used.  p and q are used by
    # irr_rand.
    self.params = params
    self.cohort = cohort  # associated: MD5
    self.secret = secret  # associated: HMAC-SHA256
    self.irr_rand = irr_rand  # p and q used

  def _internal_encode_bits(self, bits):
    """Helper function for simulation / testing.

    Returns:
      The PRR and IRR.  The PRR should never be sent over the network.
    """
    # Compute Permanent Randomized Response (PRR).
    uniform, f_mask = get_prr_masks(
        self.secret, to_big_endian(bits), self.params.prob_f,
        self.params.num_bloombits)

    # Suppose bit i of the Bloom filter is B_i.  Then bit i of the PRR is
    # defined as:
    #
    # 1   with prob f/2
    # 0   with prob f/2
    # B_i with prob 1-f

    # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with
    # probability f.  So in the expression below:
    #
    # - Bits in (uniform & f_mask) are 1 with probability f/2.
    # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability
    # f, so we get B_i with probability 1-f.
    # - The remaining bits are 0, with remaining probability f/2.

    prr = (bits & ~f_mask) | (uniform & f_mask)

    #log('U %s / F %s', bit_string(uniform, num_bits),
    #    bit_string(f_mask, num_bits))

    #log('B %s / PRR %s', bit_string(bloom_bits, num_bits),
    #    bit_string(prr, num_bits))

    # Compute Instantaneous Randomized Response (IRR).
    # If PRR bit is 0, IRR bit is 1 with probability p.
    # If PRR bit is 1, IRR bit is 1 with probability q.
    p_bits = self.irr_rand.p_gen()
    q_bits = self.irr_rand.q_gen()

    irr = (p_bits & ~prr) | (q_bits & prr)

    return prr, irr  # IRR is the rappor

  def _internal_encode(self, word):
    """Helper function for simulation / testing.

    Returns:
      The Bloom filter bits, PRR, and IRR.  The first two values should never
      be sent over the network.
    """
    bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes,
                                self.params.num_bloombits)

    bloom = 0
    for bit_to_set in bloom_bits:
      bloom |= (1 << bit_to_set)

    prr, irr = self._internal_encode_bits(bloom)
    return bloom, prr, irr

  def encode_bits(self, bits):
    """Encode a string with RAPPOR.

    Args:
      bits: An integer representing bits to encode.

    Returns:
      An integer that is the IRR (Instantaneous Randomized Response).
    """
    _, irr = self._internal_encode_bits(bits)
    return irr

  def encode(self, word):
    """Encode a string with RAPPOR.

    Args:
      word: the string that should be privately transmitted.

    Returns:
      An integer that is the IRR (Instantaneous Randomized Response).
    """
    _, _, irr = self._internal_encode(word)
    return irr