378 lines
14 KiB
R
Executable File
378 lines
14 KiB
R
Executable File
# Copyright 2014 Google Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
#
|
|
# This file has functions that aid in the estimation of a distribution when the
|
|
# dictionary is unknown. There are functions for estimating pairwise joint
|
|
# ngram distributions, pruning out false positives, and combining the two
|
|
# steps.
|
|
|
|
FindPairwiseCandidates <- function(report_data, N, ngram_params, params) {
|
|
# Finds the pairwise most likely ngrams.
|
|
#
|
|
# Args:
|
|
# report_data: Object containing data relevant to reports:
|
|
# $inds: The indices of reports collected using various pairs
|
|
# $cohorts: The cohort of each report
|
|
# $map: The map used for all the ngrams
|
|
# $reports: The reports used for each ngram and full string
|
|
# N: Number of reports collected
|
|
# ngram_params: Parameters related to ngram size
|
|
# params: Parameter list.
|
|
#
|
|
# Returns:
|
|
# List: list of matrices, list of pairwise distributions.
|
|
|
|
inds <- report_data$inds
|
|
cohorts <- report_data$cohorts
|
|
num_ngrams_collected <- ngram_params$num_ngrams_collected
|
|
map <- report_data$map
|
|
reports <- report_data$reports
|
|
|
|
# Cycle over all the unique pairs of ngrams being collected
|
|
found_candidates <- list()
|
|
|
|
# Generate the map list to be used for all ngrams
|
|
maps <- lapply(1:num_ngrams_collected, function(x) map)
|
|
num_candidate_ngrams <- length(inds)
|
|
|
|
.ComputeDist <- function(i, inds, cohorts, reports, maps, params,
|
|
num_ngrams_collected) {
|
|
library(glmnet)
|
|
ind <- inds[[i]]
|
|
cohort_subset <- lapply(1:num_ngrams_collected, function(x)
|
|
cohorts[ind])
|
|
report_subset <- reports[[i]]
|
|
new_dist <- ComputeDistributionEM(report_subset,
|
|
cohort_subset,
|
|
maps, ignore_other = FALSE,
|
|
params = params, estimate_var = FALSE)
|
|
new_dist
|
|
}
|
|
|
|
# Compute the pairwise distributions (could be parallelized)
|
|
dists <- lapply(seq(num_candidate_ngrams), function(i)
|
|
.ComputeDist(i, inds, cohorts, reports, maps,
|
|
params, num_ngrams_collected))
|
|
|
|
dists_null <- sapply(dists, function(x) is.null(x))
|
|
if (any(dists_null)) {
|
|
return (list(found_candidates = list(), dists = dists))
|
|
}
|
|
cat("Found the pairwise ngram distributions.\n")
|
|
|
|
# Find the threshold for choosing "significant" ngram pairs
|
|
f <- params$f; q <- params$q; p <- params$p
|
|
q2 <- .5 * f * (p + q) + (1 - f) * q
|
|
p2 <- .5 * f * (p + q) + (1 - f) * p
|
|
std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
|
|
(threshold <- std_dev_counts / N)
|
|
threshold <- 0.04
|
|
|
|
# Filter joints to remove infrequently co-occurring ngrams.
|
|
candidate_strs <- lapply(1:num_candidate_ngrams, function(i) {
|
|
fit <- dists[[i]]$fit
|
|
edges <- which(fit > threshold, arr.ind = TRUE, FALSE)
|
|
|
|
# Recover the list of strings that seem significant
|
|
found_candidates <- sapply(1:ncol(edges), function(x) {
|
|
chunks <- sapply(edges[, x],
|
|
function(j) dimnames(fit)[[x]][j])
|
|
chunks
|
|
})
|
|
# sapply returns either "character" vector (for n=1) or a matrix. Convert
|
|
# it to a matrix. This can be seen as follows:
|
|
#
|
|
# > class(sapply(1:5, function(x) "a"))
|
|
# [1] "character"
|
|
# > class(sapply(1:5, function(x) c("a", "b")))
|
|
# [1] "matrix"
|
|
found_candidates <- rbind(found_candidates)
|
|
|
|
# Remove the "others"
|
|
others <- which(found_candidates == "Other")
|
|
if (length(others) > 0) {
|
|
other <- which(found_candidates == "Other", arr.ind = TRUE)[, 1]
|
|
# drop = FALSE necessary to keep it a matrix
|
|
found_candidates <- found_candidates[-other, , drop = FALSE]
|
|
}
|
|
|
|
found_candidates
|
|
})
|
|
if (any(lapply(found_candidates, function(x) length(x)) == 0)) {
|
|
return (NULL)
|
|
}
|
|
|
|
list(candidate_strs = candidate_strs, dists = dists)
|
|
}
|
|
|
|
FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams,
|
|
ngram_size) {
|
|
# Uses the list of strings found by the pairwise comparisons to build
|
|
# a list of full feasible strings. This relies on the iterative,
|
|
# graph-based approach.
|
|
#
|
|
# Args:
|
|
# found_candidates: list of candidates found by each pairwise decoding
|
|
# pairings: Matrix of size 2x(num_ngrams choose 2) listing all the
|
|
# ngram position pairings.
|
|
# num_ngrams: The total number of ngrams per word.
|
|
# ngram_size: Number of characters per ngram
|
|
#
|
|
# Returns:
|
|
# List of full string candidates.
|
|
|
|
# Which ngram pairs are adjacent, i.e. of the form (i,i+1)
|
|
adjacent <- sapply(seq(num_ngrams - 1), function(x) {
|
|
c(1 + (x - 1) * ngram_size, x * ngram_size + 1)
|
|
})
|
|
|
|
adjacent_pairs <- apply(adjacent, 2, function(x) {
|
|
which(apply(pairings, 1, function(y) identical(y, x)))
|
|
})
|
|
|
|
# The first set of candidates are ngrams found in positions 1 and 2
|
|
active_cands <- found_candidates[[adjacent_pairs[1]]]
|
|
if (class(active_cands) == "list") {
|
|
return (list())
|
|
} else {
|
|
active_cands <- as.data.frame(active_cands)
|
|
}
|
|
|
|
# Now check successive ngrams to find consistent combinations
|
|
# i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc.
|
|
for (i in 2:length(adjacent_pairs)) {
|
|
if (nrow(active_cands) == 0) {
|
|
return (list())
|
|
}
|
|
new_cands <- found_candidates[[adjacent_pairs[i]]]
|
|
new_cands <- as.data.frame(new_cands)
|
|
# Builds the set of possible candidates based only on ascending
|
|
# candidate pairs
|
|
active_cands <- BuildCandidates(active_cands, new_cands)
|
|
}
|
|
|
|
if (nrow(active_cands) == 0) {
|
|
return (list())
|
|
}
|
|
# Now refine these candidates using non-adjacent bigrams
|
|
remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)]
|
|
# For each non-adjacent pair, make sure that all the candidates are
|
|
# consistent (in this phase, candidates can ONLY be eliminated)
|
|
|
|
for (i in remaining) {
|
|
new_cands <- found_candidates[[i]]
|
|
new_cands <- as.data.frame(new_cands)
|
|
# Prune out all candidates that do not agree with new_cands
|
|
active_cands <- PruneCandidates(active_cands, pairings[i, ],
|
|
ngram_size,
|
|
new_cands = new_cands)
|
|
}
|
|
# Consolidate the string ngrams into a full string representation
|
|
if (length(active_cands) > 0) {
|
|
active_cands <- sort(apply(active_cands, 1,
|
|
function(x) paste0(x, collapse = "")))
|
|
}
|
|
unname(active_cands)
|
|
}
|
|
|
|
BuildCandidates <- function(active_cands, new_cands) {
|
|
# Takes in a data frame where each row is a valid sequence of ngrams
|
|
# checks which of the new_cands ngram pairs are consistent with
|
|
# the original active_cands ngram sequence.
|
|
#
|
|
# Args:
|
|
# active_cands: data frame of ngram sequence candidates (1 candidate
|
|
# sequence per row)
|
|
# new_cands: An rx2 data frame with a new list of candidate ngram
|
|
# pairs that might fit in with the previous list of candidates
|
|
#
|
|
# Returns:
|
|
# Updated active_cands, with another column if valid extensions are
|
|
# found.
|
|
|
|
# Get the trailing ngrams from the current candidates
|
|
to_check <- as.vector(tail(t(active_cands), n = 1))
|
|
# Check which of the elements in to_check are leading ngrams among the
|
|
# new candidates
|
|
present <- sapply(to_check, function(x) any(x == new_cands))
|
|
# Remove the strings that are not represented among the new candidates
|
|
to_check <- to_check[present]
|
|
# Now insert the new candidates where they belong
|
|
active_cands <- active_cands[present, , drop = FALSE]
|
|
active_cands <- cbind(active_cands, col = NA)
|
|
num_cands <- nrow(active_cands)
|
|
hit_list <- c()
|
|
for (j in 1:num_cands) {
|
|
inds <- which(new_cands[, 1] == to_check[j])
|
|
if (length(inds) == 0) {
|
|
hit_list <- c(hit_list, j)
|
|
next
|
|
}
|
|
# If there are multiple candidates fitting with an ngram, include
|
|
# each /full/ string as a candidate
|
|
extra <- length(inds) - 1
|
|
if (extra > 0) {
|
|
rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra))
|
|
to_paste <- active_cands[j, ]
|
|
# Add the new candidates to the bottom
|
|
for (p in 1:extra) {
|
|
active_cands <- rbind(active_cands, to_paste)
|
|
}
|
|
} else {
|
|
rep_inds <- c(j)
|
|
}
|
|
active_cands[rep_inds, ncol(active_cands)] <-
|
|
as.vector(new_cands[inds, 2])
|
|
new_num_cands <- nrow(active_cands)
|
|
}
|
|
# If there were some false candidates in the original set, remove them
|
|
if (length(hit_list) > 0) {
|
|
active_cands <- active_cands[-hit_list, , drop = FALSE]
|
|
}
|
|
active_cands
|
|
}
|
|
|
|
PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) {
|
|
# Takes in a data frame where each row is a valid sequence of ngrams
|
|
# checks which of the new_cands ngram pairs are consistent with
|
|
# the original active_cands ngram sequence. This can ONLY remove
|
|
# candidates presented in active_cands.
|
|
#
|
|
# Args:
|
|
# active_cands: data frame of ngram sequence candidates (1 candidate
|
|
# sequence per row)
|
|
# pairing: A length-2 list storing which two ngrams are measured
|
|
# ngram_size: Number of characters per ngram
|
|
# new_cands: An rx2 data frame with a new list of candidate ngram
|
|
# pairs that might fit in with the previous list of candidates
|
|
#
|
|
# Returns:
|
|
# Updated active_cands, with a reduced number of rows.
|
|
|
|
# Convert the pairing to an ngram index
|
|
cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1)
|
|
|
|
cands_to_check <- active_cands[, cols, drop = FALSE]
|
|
# Find the candidates that are inconsistent with the new data
|
|
hit_list <- sapply(1:nrow(cands_to_check), function(j) {
|
|
to_kill <- FALSE
|
|
if (nrow(new_cands) == 0) {
|
|
return (TRUE)
|
|
}
|
|
if (!any(apply(new_cands, 1, function(x)
|
|
all(cands_to_check[j, , drop = FALSE] == x)))) {
|
|
to_kill <- TRUE
|
|
}
|
|
to_kill
|
|
})
|
|
|
|
# Determine which rows are false positives
|
|
hit_indices <- which(hit_list)
|
|
# Remove the false positives
|
|
if (length(hit_indices) > 0) {
|
|
active_cands <- active_cands[-hit_indices, ]
|
|
}
|
|
active_cands
|
|
}
|
|
|
|
EstimateDictionary <- function(report_data, N, ngram_params, params) {
|
|
# Takes in a list of report data and returns a list of string
|
|
# estimates of the dictionary.
|
|
#
|
|
# Args:
|
|
# report_data: Object containing data relevant to reports:
|
|
# $inds: The indices of reports collected using various pairs
|
|
# $cohorts: The cohort of each report
|
|
# $map: THe map used for all the ngrams
|
|
# $reports: The reports used for each ngram and full string
|
|
# N: the number of individuals sending reports
|
|
# ngram_params: Parameters related to ngram length, etc
|
|
# params: Parameter vector with RAPPOR noise levels, cohorts, etc
|
|
#
|
|
# Returns:
|
|
# List: list of found candidates, list of pairwise candidates
|
|
|
|
pairwise_candidates <- FindPairwiseCandidates(report_data, N,
|
|
ngram_params,
|
|
params)$candidate_strs
|
|
cat("Found the pairwise candidates. \n")
|
|
if (is.null(pairwise_candidates)) {
|
|
return (list())
|
|
}
|
|
found_candidates <- FindFeasibleStrings(pairwise_candidates,
|
|
report_data$pairings,
|
|
ngram_params$num_ngrams,
|
|
ngram_params$ngram_size)
|
|
cat("Found all the candidates. \n")
|
|
list(found_candidates = found_candidates,
|
|
pairwise_candidates = pairwise_candidates)
|
|
}
|
|
|
|
WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams,
|
|
ngram_size) {
|
|
# Args:
|
|
# conn: R connection to write to. Should be opened with mode w+.
|
|
# pairwise_candidates: list of matrices. Each matrix represents a subgraph;
|
|
# it contains the edges between partitions i and j, so there are (k choose
|
|
# 2) matrices. Each matrix has dimension 2 x E, where E is the number of
|
|
# edges.
|
|
# pairings: 2 x (k choose 2) matrix of character positions. Each row
|
|
# corresponds to a subgraph; it has 1-based character index of partitions
|
|
# i and j.
|
|
# num_ngrams: length of pairwise_candidates, or the number of partitions in
|
|
# the k-partite graph
|
|
|
|
# File Format:
|
|
#
|
|
# num_partitions 3
|
|
# ngram_size 2
|
|
# 0.ab 1.cd
|
|
# 0.ab 2.ef
|
|
#
|
|
# The first line specifies the number of partitions (k).
|
|
# The remaining lines are edges, where each node is <partition>.<bigram>.
|
|
#
|
|
# Partitions are numbered from 0. The partition of the left node will be
|
|
# less than the partition of the right node.
|
|
|
|
# First two lines are metadata
|
|
cat(sprintf('num_partitions %d\n', num_ngrams), file = conn)
|
|
cat(sprintf('ngram_size %d\n', ngram_size), file = conn)
|
|
|
|
for (i in 1:length(pairwise_candidates)) {
|
|
# The two pairwise_candidates for this subgraph.
|
|
# Turn 1-based character positions into 0-based partition numbers,
|
|
# e.g. (3, 5) -> (1, 2)
|
|
|
|
pos1 <- pairings[[i, 1]]
|
|
pos2 <- pairings[[i, 2]]
|
|
part1 <- (pos1 - 1) / ngram_size
|
|
part2 <- (pos2 - 1) / ngram_size
|
|
cat(sprintf("Writing partition (%d, %d)\n", part1, part2))
|
|
|
|
p <- pairwise_candidates[[i]]
|
|
# each row is an edge
|
|
for (j in 1:nrow(p)) {
|
|
n1 <- p[[j, 1]]
|
|
n2 <- p[[j, 2]]
|
|
line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2)
|
|
# NOTE: It would be faster to preallocate 'lines', but we would have to
|
|
# make a two passes through pairwise_candidates.
|
|
cat(line, file = conn)
|
|
}
|
|
}
|
|
}
|
|
|