417 lines
12 KiB
C++
417 lines
12 KiB
C++
|
|
// Copyright 2015 Google Inc. All rights reserved.
|
||
|
|
//
|
||
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
// you may not use this file except in compliance with the License.
|
||
|
|
// You may obtain a copy of the License at
|
||
|
|
//
|
||
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
//
|
||
|
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
// See the License for the specific language governing permissions and
|
||
|
|
// limitations under the License.
|
||
|
|
|
||
|
|
#include "encoder.h"
#include "openssl_hash_impl.h"

#include <assert.h>
#include <stdarg.h> // va_list, etc.
#include <stdint.h>
#include <stdio.h>
#include <string.h>  // memcpy

#include <string>
#include <vector>
|
||
|
|
|
||
|
|
namespace rappor {
|
||
|
|
|
||
|
|
// Writes a printf-style formatted message to stderr and terminates it with a
// newline.
void log(const char* fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);
  fputc('\n', stderr);
}
|
||
|
|
|
||
|
|
//
|
||
|
|
// Functions for debugging
|
||
|
|
//
|
||
|
|
|
||
|
|
// Debugging aid: writes every byte of h to stderr as two lowercase hex digits
// and then ends the line.
static void PrintHex(const std::vector<uint8_t>& h) {
  typedef std::vector<uint8_t>::const_iterator ByteIter;
  for (ByteIter it = h.begin(); it != h.end(); ++it) {
    fprintf(stderr, "%02x", *it);
  }
  fputc('\n', stderr);
}
|
||
|
|
|
||
|
|
// We use 1 *byte* of a HMAC-SHA256 value per BIT to generate the PRR. SHA256
|
||
|
|
// has 32 bytes, so the max is 32 bits.
|
||
|
|
// AssignCohort and GetPrrMasks also compare the HMAC output length (in bytes)
// against this value.
static const int kMaxBits = 32;
|
||
|
|
|
||
|
|
// Can't be more than the number of bytes in MD5.
|
||
|
|
// The Bits overload of MakeBloomFilter reads one hash byte per hash function.
static const int kMaxHashes = 16;
|
||
|
|
|
||
|
|
// Probabilities should be in the interval [0.0, 1.0].
|
||
|
|
static void CheckValidProbability(float prob, const char* var_name) {
|
||
|
|
if (prob < 0.0f || prob > 1.0f) {
|
||
|
|
log("%s should be between 0.0 and 1.0 inclusive (got %.2f)", var_name,
|
||
|
|
prob);
|
||
|
|
assert(false);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Serializes a 32-bit value as exactly four bytes, most significant byte
// first.  Used both for the cohort string and for raw bits fed to the HMAC.
// Returned by value because the result is tiny.
static std::string ToBigEndian(uint32_t u) {
  std::string encoded;
  encoded.reserve(4);
  // Walk from the top byte down to the bottom byte.
  for (int shift = 24; shift >= 0; shift -= 8) {
    encoded.push_back(static_cast<char>((u >> shift) & 0xFF));
  }
  return encoded;
}
|
||
|
|
|
||
|
|
// Value handed to hmac_func_ by AssignCohort when deriving the cohort.
// NOTE(review): "\x00" is a C string whose first character is NUL.  If
// hmac_func_ takes its value argument as a std::string (GetPrrMasks passes a
// std::string in the same position), the implicit conversion presumably
// yields an EMPTY string rather than a one-byte prefix.  Confirm before
// changing: altering it would change the cohort derived from a given secret.
static const char* kHmacCohortPrefix = "\x00";
|
||
|
|
// Prepended to the HMAC input (ahead of encoder_id_) when deriving the PRR
// masks.
static const char* kHmacPrrPrefix = "\x01";
|
||
|
|
|
||
|
|
|
||
|
|
//
|
||
|
|
// Encoder
|
||
|
|
//
|
||
|
|
|
||
|
|
uint32_t Encoder::AssignCohort(const Deps& deps, int num_cohorts) {
|
||
|
|
std::vector<uint8_t> sha256;
|
||
|
|
if (!deps.hmac_func_(deps.client_secret_, kHmacCohortPrefix, &sha256)) {
|
||
|
|
log("HMAC failed");
|
||
|
|
assert(false);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Either we are using SHA256 to have exactly 32 bytes,
|
||
|
|
// or we're using HmacDrbg for any number of bytes.
|
||
|
|
if ((sha256.size() == kMaxBits)
|
||
|
|
|| (deps.hmac_func_ == rappor::HmacDrbg)) {
|
||
|
|
// Hash size ok.
|
||
|
|
} else {
|
||
|
|
log("Bad hash size.");
|
||
|
|
assert(false);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Interpret first 4 bytes of sha256 as a uint32_t.
|
||
|
|
uint32_t c = *(reinterpret_cast<uint32_t*>(sha256.data()));
|
||
|
|
// e.g. for 128 cohorts, 0x80 - 1 = 0x7f
|
||
|
|
uint32_t cohort_mask = num_cohorts - 1;
|
||
|
|
return c & cohort_mask;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Builds an encoder and validates its parameters.
//
// The cohort is derived from the client secret in the initializer list, i.e.
// before any of the checks in the body run.  Every failed check logs a
// message and trips an assert.
Encoder::Encoder(const std::string& encoder_id, const Params& params,
                 const Deps& deps)
    : encoder_id_(encoder_id),
      params_(params),
      deps_(deps),
      cohort_(AssignCohort(deps, params.num_cohorts_)),
      cohort_str_(ToBigEndian(cohort_)) {
  // Snapshot the integer parameters once; the checks below only read them.
  const int bit_count = params_.num_bits_;
  const int hash_count = params_.num_hashes_;
  const int cohort_count = params_.num_cohorts_;

  // Lower bounds.
  if (bit_count <= 0) {
    log("num_bits must be positive");
    assert(false);
  }
  if (hash_count <= 0) {
    log("num_hashes must be positive");
    assert(false);
  }
  if (cohort_count <= 0) {
    log("num_cohorts must be positive");
    assert(false);
  }

  // Constraint on num_bits, which depends on the HMAC function in use:
  // HmacDrbg needs whole bytes, the SHA256 path is capped at kMaxBits.
  const bool using_hmac_drbg = (deps_.hmac_func_ == rappor::HmacDrbg);
  if (using_hmac_drbg && (bit_count % 8 != 0)) {
    log("num_bits (%d) must be divisible by 8 when using HmacDrbg.",
        bit_count);
    assert(false);
  }
  if (!using_hmac_drbg && (bit_count > kMaxBits)) {
    log("num_bits (%d) can't be greater than %d", bit_count, kMaxBits);
    assert(false);
  }

  if (hash_count > kMaxHashes) {
    log("num_hashes (%d) can't be greater than %d", hash_count, kMaxHashes);
    assert(false);
  }

  // A power of two has a single bit set, so clearing its lowest set bit
  // leaves zero.
  if ((cohort_count & (cohort_count - 1)) != 0) {
    log("num_cohorts (%d) must be a power of 2 (and not 0)", cohort_count);
    assert(false);
  }
  // TODO: check max cohorts?

  CheckValidProbability(params_.prob_f_, "prob_f");
  CheckValidProbability(params_.prob_p_, "prob_p");
  CheckValidProbability(params_.prob_q_, "prob_q");
}
|
||
|
|
|
||
|
|
bool Encoder::MakeBloomFilter(const std::string& value, Bits* bloom_out) const {
|
||
|
|
const int num_bits = params_.num_bits_;
|
||
|
|
const int num_hashes = params_.num_hashes_;
|
||
|
|
|
||
|
|
Bits bloom = 0;
|
||
|
|
|
||
|
|
// 4 byte cohort string + true value
|
||
|
|
std::string hash_input(cohort_str_ + value);
|
||
|
|
|
||
|
|
// First do hashing.
|
||
|
|
std::vector<uint8_t> hash_output;
|
||
|
|
deps_.hash_func_(hash_input, &hash_output);
|
||
|
|
|
||
|
|
// Error check
|
||
|
|
if (hash_output.size() < static_cast<size_t>(num_hashes)) {
|
||
|
|
log("Hash function didn't return enough bytes");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
// To determine which bit to set in the bloom filter, use a byte of the MD5.
|
||
|
|
for (int i = 0; i < num_hashes; ++i) {
|
||
|
|
int bit_to_set = hash_output[i] % num_bits;
|
||
|
|
bloom |= 1 << bit_to_set;
|
||
|
|
}
|
||
|
|
|
||
|
|
*bloom_out = bloom;
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Write a Bloom filter into a vector of bytes, used for num_bits > 32.
|
||
|
|
bool Encoder::MakeBloomFilter(const std::string& value,
|
||
|
|
std::vector<uint8_t>* bloom_out) const {
|
||
|
|
const int num_bits = params_.num_bits_;
|
||
|
|
const int num_hashes = params_.num_hashes_;
|
||
|
|
|
||
|
|
bloom_out->resize(params_.num_bits_ / 8, 0);
|
||
|
|
|
||
|
|
// Generate the hash.
|
||
|
|
std::vector<uint8_t> hash_output;
|
||
|
|
deps_.hash_func_(std::string(cohort_str_ + value), &hash_output);
|
||
|
|
|
||
|
|
// Check that we have enough bytes of hash available.
|
||
|
|
int exponent = 0;
|
||
|
|
int bytes_needed = 0;
|
||
|
|
while ((1 << exponent) < num_bits) {
|
||
|
|
exponent++;
|
||
|
|
}
|
||
|
|
bytes_needed = ((exponent - 1) / 8) + 1;
|
||
|
|
if (bytes_needed > 4) {
|
||
|
|
log("Can only use 4 bytes of hash at a time, needed %d "
|
||
|
|
"to address %d bits.", bytes_needed, num_bits);
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
if (hash_output.size() < static_cast<size_t>(bytes_needed * num_hashes)) {
|
||
|
|
log("Hash function returned %d bytes, but we needed "
|
||
|
|
"%d bytes * %d hashes. Choose lower num_hashes or "
|
||
|
|
"a different hash function.",
|
||
|
|
hash_output.size(), bytes_needed, num_hashes);
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
// To determine which bit to set in the Bloom filter, use 1 or more
|
||
|
|
// bytes of the MD5.
|
||
|
|
int hash_byte = 0;
|
||
|
|
for (int i = 0; i < num_hashes; ++i) {
|
||
|
|
int bit_to_set = 0;
|
||
|
|
for (int j = 0; j < bytes_needed; ++j) {
|
||
|
|
bit_to_set |= hash_output[hash_byte] << (j * 8);
|
||
|
|
++hash_byte;
|
||
|
|
}
|
||
|
|
bit_to_set %= num_bits;
|
||
|
|
// Start at end of array to be consistent with the Bits implementation.
|
||
|
|
int index = (bloom_out->size() - 1) - (bit_to_set / 8);
|
||
|
|
(*bloom_out)[index] |= 1 << (bit_to_set % 8);
|
||
|
|
}
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Helper method for PRR
|
||
|
|
bool Encoder::GetPrrMasks(const Bits bits, Bits* uniform_out,
|
||
|
|
Bits* f_mask_out) const {
|
||
|
|
// Create HMAC(secret, value), and use its bits to construct f_mask and
|
||
|
|
// uniform bits.
|
||
|
|
std::vector<uint8_t> sha256;
|
||
|
|
|
||
|
|
std::string hmac_value = kHmacPrrPrefix + encoder_id_ + ToBigEndian(bits);
|
||
|
|
|
||
|
|
deps_.hmac_func_(deps_.client_secret_, hmac_value, &sha256);
|
||
|
|
if (sha256.size() != kMaxBits) { // sanity check
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
// We should have already checked this.
|
||
|
|
if (params_.num_bits_ > kMaxBits) {
|
||
|
|
log("num_bits exceeds maximum.");
|
||
|
|
assert(false);
|
||
|
|
}
|
||
|
|
|
||
|
|
uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
|
||
|
|
|
||
|
|
Bits uniform = 0;
|
||
|
|
Bits f_mask = 0;
|
||
|
|
|
||
|
|
for (int i = 0; i < params_.num_bits_; ++i) {
|
||
|
|
uint8_t byte = sha256[i];
|
||
|
|
|
||
|
|
uint8_t u_bit = byte & 0x01; // 1 bit of entropy
|
||
|
|
uniform |= (u_bit << i); // maybe set bit in mask
|
||
|
|
|
||
|
|
uint8_t rand128 = byte >> 1; // 7 bits of entropy
|
||
|
|
uint8_t noise_bit = (rand128 < threshold128);
|
||
|
|
f_mask |= (noise_bit << i); // maybe set bit in mask
|
||
|
|
}
|
||
|
|
|
||
|
|
*uniform_out = uniform;
|
||
|
|
*f_mask_out = f_mask;
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
bool Encoder::_EncodeBitsInternal(const Bits bits, Bits* prr_out,
|
||
|
|
Bits* irr_out) const {
|
||
|
|
// Compute Permanent Randomized Response (PRR).
|
||
|
|
Bits uniform;
|
||
|
|
Bits f_mask;
|
||
|
|
if (!GetPrrMasks(bits, &uniform, &f_mask)) {
|
||
|
|
log("GetPrrMasks failed");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
Bits prr = (bits & ~f_mask) | (uniform & f_mask);
|
||
|
|
*prr_out = prr;
|
||
|
|
|
||
|
|
// Compute Instantaneous Randomized Response (IRR).
|
||
|
|
|
||
|
|
// NOTE: These can fail if say a read() from /dev/urandom fails.
|
||
|
|
Bits p_bits;
|
||
|
|
Bits q_bits;
|
||
|
|
if (!deps_.irr_rand_.GetMask(params_.prob_p_, params_.num_bits_, &p_bits)) {
|
||
|
|
log("PMask failed");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
if (!deps_.irr_rand_.GetMask(params_.prob_q_, params_.num_bits_, &q_bits)) {
|
||
|
|
log("QMask failed");
|
||
|
|
return false;
|
||
|
|
};
|
||
|
|
|
||
|
|
Bits irr = (p_bits & ~prr) | (q_bits & prr);
|
||
|
|
*irr_out = irr;
|
||
|
|
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
bool Encoder::_EncodeStringInternal(const std::string& value, Bits* bloom_out,
|
||
|
|
Bits* prr_out, Bits* irr_out) const {
|
||
|
|
if (!MakeBloomFilter(value, bloom_out)) {
|
||
|
|
log("Bloom filter calculation failed");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
return _EncodeBitsInternal(*bloom_out, prr_out, irr_out);
|
||
|
|
}
|
||
|
|
|
||
|
|
bool Encoder::EncodeBits(const Bits bits, Bits* irr_out) const {
|
||
|
|
Bits unused_prr;
|
||
|
|
return _EncodeBitsInternal(bits, &unused_prr, irr_out);
|
||
|
|
}
|
||
|
|
|
||
|
|
bool Encoder::EncodeString(const std::string& value, Bits* irr_out) const {
|
||
|
|
Bits unused_bloom;
|
||
|
|
Bits unused_prr;
|
||
|
|
return _EncodeStringInternal(value, &unused_bloom, &unused_prr, irr_out);
|
||
|
|
}
|
||
|
|
|
||
|
|
// For an array of bytes, select the appropriate byte from a 4-byte integer
// value.  Bytes are enumerated in big-endian order, i.e. index = 0 is the
// MSB, index = 3 is the LSB; larger indices wrap around modulo 4.
static uint8_t shifted(const Bits& bits, const int& index) {
  const int byte_pos = index % 4;
  // Byte 0 shifts by 24 bits, byte 1 by 16, and so on down to 0.
  const int shift = (3 - byte_pos) * 8;
  return static_cast<uint8_t>((bits >> shift) & 0xFF);
}
|
||
|
|
|
||
|
|
bool Encoder::EncodeString(const std::string& value,
|
||
|
|
std::vector<uint8_t>* irr_out) const {
|
||
|
|
std::vector<uint8_t> bloom_out;
|
||
|
|
std::vector<uint8_t> hmac_out;
|
||
|
|
std::vector<uint8_t> uniform;
|
||
|
|
std::vector<uint8_t> f_mask;
|
||
|
|
const int num_bits = params_.num_bits_;
|
||
|
|
|
||
|
|
uniform.resize(num_bits / 8, 0);
|
||
|
|
f_mask.resize(num_bits / 8, 0);
|
||
|
|
irr_out->resize(num_bits / 8, 0);
|
||
|
|
|
||
|
|
// Set bloom_out.
|
||
|
|
if (!MakeBloomFilter(value, &bloom_out)) {
|
||
|
|
log("Bloom filter calculation failed");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Set hmac_out.
|
||
|
|
hmac_out.resize(num_bits); // Signal to HmacDrbg about desired output size.
|
||
|
|
// Call HmacDrbg
|
||
|
|
std::string hmac_value = kHmacPrrPrefix + encoder_id_;
|
||
|
|
for (int i = 0; i < bloom_out.size(); ++i) {
|
||
|
|
hmac_value.append(reinterpret_cast<char *>(&bloom_out[i]), 1);
|
||
|
|
}
|
||
|
|
deps_.hmac_func_(deps_.client_secret_, hmac_value, &hmac_out);
|
||
|
|
if (hmac_out.size() != num_bits) {
|
||
|
|
log("Needed %d bytes from Hmac function, received %d bytes.",
|
||
|
|
num_bits, hmac_out.size());
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
// We'll be using 7 bits of each byte of the MAC as our random
|
||
|
|
// number for the f_mask.
|
||
|
|
uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
|
||
|
|
|
||
|
|
// Construct uniform and f_mask bitwise.
|
||
|
|
for (int i = 0; i < num_bits; i++) {
|
||
|
|
uint8_t byte = hmac_out[i];
|
||
|
|
uint8_t u_bit = byte & 0x01; // 1 bit of entropy.
|
||
|
|
int vector_index = (num_bits - 1 - i) / 8;
|
||
|
|
uint8_t rand128 = byte >> 1; // 7 bits of entropy.
|
||
|
|
uint8_t noise_bit = (rand128 < threshold128);
|
||
|
|
uniform[vector_index] |= (u_bit << (i % 8));
|
||
|
|
f_mask[vector_index] |= (noise_bit << (i % 8));
|
||
|
|
}
|
||
|
|
|
||
|
|
for (int i = 0; i < bloom_out.size(); i++) {
|
||
|
|
Bits p_bits;
|
||
|
|
Bits q_bits;
|
||
|
|
uint8_t prr;
|
||
|
|
prr = (bloom_out[i] & ~f_mask[i]) | (uniform[i] & f_mask[i]);
|
||
|
|
// GetMask operates on Uint32, so we generate a new p_bits every 4
|
||
|
|
// bytes, and use each of its bytes once.
|
||
|
|
if (i % 4 == 0) {
|
||
|
|
// Need new p_bits, q_bits values to work with.
|
||
|
|
if (!deps_.irr_rand_.GetMask(params_.prob_p_, 32, &p_bits)) {
|
||
|
|
log("PMask failed");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
if (!deps_.irr_rand_.GetMask(params_.prob_q_, 32, &q_bits)) {
|
||
|
|
log("QMask failed");
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
(*irr_out)[i] = (shifted(p_bits, i) & ~prr)
|
||
|
|
| (shifted(q_bits, i) & prr);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
void Encoder::set_cohort(uint32_t cohort) {
|
||
|
|
cohort_ = cohort;
|
||
|
|
cohort_str_ = ToBigEndian(cohort_);
|
||
|
|
}
|
||
|
|
|
||
|
|
} // namespace rappor
|