// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"hash/crc32"
	"math"
	"sort"

	"github.com/davecgh/go-spew/spew"
)

// searchSet is a set of q-grams that have hashes associated with them,
// making it fast to search for potential matches.
type searchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens []indexedToken
	// Hashes is a map of checksums to a range of tokens.
	Hashes hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenRanges
	origin         string // A debugging identifier to label what this searchset is associated with

	nodes []*node
	q     int // The length of q-grams in this searchset.
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}

// newSearchSet creates a new searchSet object. A searchset generates all
// possible q-grams of tokens. These q-grams of tokens can be correlated to
// determine where a section of text from one source may appear in another
// source.
func newSearchSet(s *indexedDocument, q int) *searchSet {
	// Start generating hash values for all q-grams within the text.
	h := make(hash)
	if len(s.Tokens) < q {
		// We can't have a q-gram size larger than the number of tokens, so
		// shrink q to fit.
		q = len(s.Tokens)
	}
	checksums, tokenRanges := generateHashes(h, q, s.Tokens, s.dict)
	sset := &searchSet{
		Tokens:         s.Tokens,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
		q:              q,
	}
	sset.generateNodeList()
	return sset
}
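
// Illustrative sketch (hypothetical tokens, not part of the original code):
// for a token stream ["permission", "notice", "shall", "be"] and q = 2, the
// searchset hashes the q-grams "permission notice", "notice shall", and
// "shall be", recording the token ranges [0, 2), [1, 3), and [2, 4).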

// tokenRange indicates the range of tokens that map to a particular checksum.
type tokenRange struct {
	Start int
	End   int
}

func (t *tokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// tokenRanges is a slice of tokenRange pointers.
type tokenRanges []*tokenRange

// generateHashes computes a hash using CRC-32 for each q-gram encountered in the provided tokens.
func generateHashes(h hash, q int, toks []indexedToken, dict *dictionary) ([]uint32, tokenRanges) {
	if q == 0 {
		return nil, nil
	}
	var css []uint32
	var tr tokenRanges
	crc := crc32.NewIEEE()
	for offset := 0; offset+q <= len(toks); offset++ {
		crc.Reset()
		for i := 0; i < q; i++ {
			crc.Write([]byte(dict.getWord(toks[offset+i].ID)))
			crc.Write([]byte{' '})
		}
		cs := crc.Sum32()
		css = append(css, cs)
		tr = append(tr, &tokenRange{offset, offset + q})
		h.add(cs, offset, offset+q)
	}

	return css, tr
}
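
// Illustrative sketch (not part of the original code): hashing the 2-gram
// "permission notice" from the earlier example is equivalent to the
// following standalone computation, since each word is written to the CRC
// followed by a single space separator:
//
//	crc := crc32.NewIEEE()
//	crc.Write([]byte("permission"))
//	crc.Write([]byte{' '})
//	crc.Write([]byte("notice"))
//	crc.Write([]byte{' '})
//	cs := crc.Sum32() // checksum recorded for token range [0, 2)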

// generateNodeList creates a node list out of the search set.
func (s *searchSet) generateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}

// matchRange is the range within the source text that is a match to the range
// in the target text.
type matchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
	// TokensClaimed tracks the number of positively matched tokens in this
	// range. For initially created matchRanges, this is equal to the extent of
	// the range. However, as matchRanges get merged together and error is
	// potentially introduced, this tracks the precise number of tokens that
	// exist in the range.
	TokensClaimed int
}

// in reports whether the receiver's target range is entirely enclosed within
// the other match range's target range.
func (m *matchRange) in(other *matchRange) bool {
	return m.TargetStart >= other.TargetStart && m.TargetEnd <= other.TargetEnd
}

func (m *matchRange) String() string {
	return fmt.Sprintf("S: [%v, %v)-> T: [%v, %v) => %v [%v]", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd, m.TargetStart-m.SrcStart, m.TokensClaimed)
}

// matchRanges is a list of "matchRange"s. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text. They are sorted to support greedy matching with the
// longest runs of q-grams appearing first in the sort.
type matchRanges []*matchRange

func (m matchRanges) Len() int      { return len(m) }
func (m matchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m matchRanges) Less(i, j int) bool {
	if m[i].TokensClaimed != m[j].TokensClaimed {
		return m[i].TokensClaimed > m[j].TokensClaimed
	}

	if m[i].TargetStart != m[j].TargetStart {
		return m[i].TargetStart < m[j].TargetStart
	}
	return m[i].SrcStart < m[j].SrcStart
}

// findPotentialMatches returns the ranges in the target (unknown) text that
// are best potential matches to the source (known) text.
func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
	matchedRanges := c.getMatchedRanges(src, target, confidence, src.q)
	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges))
	}
	if len(matchedRanges) == 0 {
		return nil
	}

	// After computing all potential matches, we only output ranges that contain
	// enough tokens to clear the confidence threshold. As noted, this number can
	// be too high, yielding false positives, but cannot yield false negatives.
	threshold := int(confidence * float64(len(src.Tokens)))

	for i, m := range matchedRanges {
		if m.TokensClaimed < threshold {
			matchedRanges = matchedRanges[:i]
			break
		}
	}

	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("finalized matchedRanges for %s: %d = %s", src.origin, len(src.Tokens), spew.Sdump(matchedRanges))
	}
	return matchedRanges
}
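
// Worked example (illustrative numbers, not from the original code): with
// confidence = 0.8 and a 200-token source, threshold = int(0.8*200) = 160.
// Because matchedRanges is sorted with the largest TokensClaimed first, the
// slice is truncated at the first range claiming fewer than 160 tokens.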

// fuseRanges analyzes the source matches, attempting to combine hits without
// errors into larger hits with tolerable amounts of error to produce matches
// that contain enough tokens to be considered for exact matching against a
// target document. This routine intentionally does not accurately track error
// contributions from merging runs, trading false positives (but not false
// negatives) for faster performance.
func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
	var claimed matchRanges
	errorMargin := int(math.Round(float64(size) * (1.0 - confidence)))

	filter := make([]bool, targetSize)
	for _, m := range runs {
		for i := m.SrcStart; i < m.SrcEnd; i++ {
			// Only consider these offsets if they fit into the target document, since
			// the target may be smaller than the source being analyzed.
			if i < targetSize {
				filter[i] = true
			}
		}
	}

	filterDrops := 0
	filterPasses := 0

	// For each hit detected, compare it against all other previous hits to see if it can be part of a match
	// or represents a group that is eligible for matching and having other hits contribute to it.
	for _, m := range matched {
		off := m.TargetStart - m.SrcStart

		// If the offset is negative, but within error margins, we associate it
		// with the first index to see if it could contribute to a run. If the
		// absolute offset is larger than the error margin, it can't possibly
		// contribute and will be dropped. This puts more potential error into the zero
		// index, but that just slightly increases the rate of false positives. In
		// practice, this would only be an issue if there are major substrings of a
		// source in a target that aren't part of a real hit. We see many small
		// references (the name of a license) but not large chunks of the license.
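		// Worked example (illustrative numbers, not from the original code):
		// a hit with SrcStart = 10 and TargetStart = 4 has off = -6. With
		// size = 100 source tokens and confidence = 0.95, errorMargin = 5 and
		// the hit is dropped; with confidence = 0.9, errorMargin = 10 and the
		// hit is folded into offset 0 and checked against the filter there.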
		if off < 0 {
			if -off <= errorMargin {
				off = 0
			} else {
				continue
			}
		}

		// If the filter is false, there was not sufficient token density in that
		// part of the target document for a viable match, so this match is a
		// spurious hit and can be discarded.
		if !filter[off] {
			filterDrops++
			continue
		}

		filterPasses++
		unclaimed := true

		for _, c := range claimed {
			moff := m.TargetStart - m.SrcStart
			coff := c.TargetStart - c.SrcStart
			sampleError := int(math.Round(math.Abs(float64(moff - coff))))
			withinError := sampleError < errorMargin

			// The contribution needs to add more value than accumulated error. This protects
			// against spurious matches of a reference to a license incorrectly overextending the
			// match range.
			if withinError && m.TokensClaimed > int(sampleError) {
				if m.in(c) {
					// This can cause too many tokens to be claimed, but that's fine since we want to avoid
					// undercounting and missing content.
					c.TokensClaimed += m.TokensClaimed
					unclaimed = false
				} else {
					// See if the claims can be merged. If the error tolerances allow for it,
					// we merge the claims and the ranges. This doesn't accumulate error
					// accurately, so it's possible that repeated merges can introduce too
					// much error to actually make a match, but we won't get false
					// negatives from this approach. The error tolerances allow for a
					// merge, but we only want to merge if it increases the range of
					// tokens being covered. If this match is already handled by an
					// existing (stronger by definition) claim, we don't merge this one,
					// but treat it as a new claim. This allows for the case where a
					// highly fragmented text will be matched by a long series of low
					// score matches.
					if m.TargetStart < c.TargetStart && m.SrcStart < c.SrcStart {
						c.TargetStart = m.TargetStart
						c.SrcStart = m.SrcStart
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					} else if m.TargetEnd > c.TargetEnd && m.SrcEnd > c.SrcEnd {
						c.TargetEnd = m.TargetEnd
						c.SrcEnd = m.SrcEnd
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					}
					// This claim does not extend any existing block, and it may be claimed in its own
					// right.
				}
			}
			if !unclaimed {
				break
			}
		}
		// Only create a claim if this claim is likely relevant. If we had some higher quality hits,
		// it's likely this is spurious noise. If we haven't had any significantly better hits, we'll keep
		// this around.
		if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed {
			claimed = append(claimed, m)
		}
	}
	sort.Sort(claimed)
	if c.tc.traceSearchset(origin) {
		c.tc.trace("filterPasses = %+v", filterPasses)
		c.tc.trace("filterDrops = %+v", filterDrops)
		c.tc.trace("claimed = %s", spew.Sdump(claimed))
	}
	return claimed
}

// getMatchedRanges finds the ranges in the target text that match the source
// text. The ranges returned are ordered from the entries with the most matched
// tokens to the least.
func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
	shouldTrace := c.tc.traceSearchset(src.origin)

	if shouldTrace {
		c.tc.trace("src.origin = %+v", src.origin)
	}
	// Assemble a list of all the matched q-grams without any consideration of
	// error tolerances.
	matched := targetMatchedRanges(src, target)
	if shouldTrace {
		c.tc.trace("matched = %s", spew.Sdump(matched))
	}
	if len(matched) == 0 {
		return nil
	}

	// Perform neighborhood matching to figure out which clusters of q-grams have
	// sufficient likelihood to be a potential match to the source. For an error
	// confidence threshold of X, we require that a sequence of N target tokens
	// must contain N*X (X <= 1.0) ordered source tokens in order to be a viable
	// match.
	//
	// It's much easier to compute potential ranges in the target if we disregard
	// proper ordering of source tokens initially, and see which source q-grams
	// appear in sufficient quantities to be a potential match. We can then
	// disregard any q-gram that falls outside of that range. This helps
	// significantly since processing token matches is an N^2 (or worse)
	// operation, so reducing N is a big win.
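	// For instance (illustrative numbers, not from the original code): with
	// X = 0.9 and a 100-token source, a 100-token window in the target must
	// contain at least 90 matched source tokens before it is treated as a
	// candidate run worth examining further.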

	runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)

	if shouldTrace {
		c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs))
	}

	// If there are no target runs of source tokens, we're done.
	if len(runs) == 0 {
		return nil
	}

	// Using the runs as a filter to ignore irrelevant matches, fuse the source
	// match ranges into larger matches (with possible errors) to see if we can
	// produce large enough runs that pass the confidence threshold.
	fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
	if shouldTrace {
		c.tc.trace("fr = %s", spew.Sdump(fr))
	}
	return fr
}

// detectRuns slides a window the length of the source document across the
// target's hit map and records the window positions where the number of
// matched tokens clears the confidence threshold, coalescing consecutive
// positions into ranges.
func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
	shouldTrace := c.tc.traceSearchset(origin)
	hits := make([]bool, targetLength)
	for _, m := range matched {
		for idx := m.TargetStart; idx < m.TargetEnd; idx++ {
			hits[idx] = true
		}
	}

	if len(hits) == 0 {
		return nil
	}
	var out []int

	total := 0
	target := int(float64(subsetLength) * threshold)
	if shouldTrace {
		c.tc.trace("target = %+v", target)
		c.tc.trace("targetLength = %+v", targetLength)
		c.tc.trace("subsetLength = %+v", subsetLength)
	}

	// If we don't have at least one subset (i.e. the target is shorter than the
	// source), just analyze what we have.
	if len(hits) < subsetLength {
		if shouldTrace {
			c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits))
		}
		subsetLength = len(hits)
	}
	// Initialize our sliding window value.
	for i := 0; i < subsetLength; i++ {
		if hits[i] {
			total++
		}
	}

	if total >= target {
		out = append(out, 0)
	}

	// Now move through the window adjusting the total by subtracting out the
	// last bit and adding in the new bit.
	for i := 1; i < len(hits); i++ {
		if hits[i-1] {
			total--
		}
		end := i + subsetLength - 1
		if end < len(hits) && hits[i+subsetLength-1] {
			total++
		}
		if total >= target {
			out = append(out, i)
		}
	}
	if len(out) == 0 {
		return nil
	}

	final := []matchRange{
		{
			SrcStart: out[0],
			SrcEnd:   out[0] + q,
		},
	}
	for i := 1; i < len(out); i++ {
		if out[i] != 1+out[i-1] {
			final = append(final, matchRange{
				SrcStart: out[i],
				SrcEnd:   out[i] + q,
			})
		} else {
			final[len(final)-1].SrcEnd = out[i] + q
		}
	}

	return final
}
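
// Worked example for detectRuns (illustrative values, not from the original
// code): with subsetLength = 4 and threshold = 0.75, a window must hold at
// least int(4*0.75) = 3 hits. For hits = [T T T F T F F F], only the windows
// starting at indices 0 and 1 qualify, so out = [0, 1], and those adjacent
// indices collapse into the single range [0, 1+q) in the returned slice.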

// targetMatchedRanges returns the ranges of target q-grams whose checksums
// match source q-grams, coalescing consecutive hits that share the same
// target/source offset into larger ranges.
func targetMatchedRanges(src, target *searchSet) matchRanges {
	offsetMappings := make(map[int][]*matchRange)

	var matched matchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok {
			continue
		}

		tv := tgtNode.tokens
		for _, sv := range sr {
			offset := tv.Start - sv.Start
			if om, ok := offsetMappings[offset]; ok {
				// See if this extends the most recent existing mapping.
				lastIdx := len(om) - 1
				if om[lastIdx].TargetEnd == tv.End-1 {
					// This new value extends. Update the value in place.
					om[lastIdx].SrcEnd = sv.End
					om[lastIdx].TargetEnd = tv.End
					continue
				}
			}
			offsetMappings[offset] = append(offsetMappings[offset], &matchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			})
		}
	}

	// Compute the number of tokens claimed in each run and flatten into a single slice.
	for _, mr := range offsetMappings {
		for _, m := range mr {
			m.TokensClaimed = m.TargetEnd - m.TargetStart
		}
		matched = append(matched, mr...)
	}
	sort.Sort(matched)
	return matched
}
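
// Illustrative sketch (hypothetical positions, not from the original code):
// if the q-gram at target position 7 matches source position 2, and the
// q-gram at target position 8 matches source position 3, both hits share
// offset 7-2 = 5, so the second hit extends the first into a single
// matchRange covering source tokens [2, 3+q) and target tokens [7, 8+q).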

// hash maps a q-gram checksum to the token ranges in the document that
// produce that checksum.
type hash map[uint32]tokenRanges

func (h hash) add(checksum uint32, start, end int) {
	h[checksum] = append(h[checksum], &tokenRange{Start: start, End: end})
}
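
// Note (illustrative, not from the original code): because add appends rather
// than overwrites, a checksum produced by a q-gram that occurs more than once
// in a document maps to every token range that produced it, and
// targetMatchedRanges above considers each of those ranges as a separate
// candidate source position.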