// Copyright 2020 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package classifier
|
|
|
|
import (
|
|
"fmt"
|
|
"reflect"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/davecgh/go-spew/spew"
|
|
"github.com/google/go-cmp/cmp"
|
|
)
|
|
|
|
// hundredLicenseText is a baseline for debugging precise ordering issues to demonstrate that the token-run assembly process can identify maximally fragmented entries successfully.
var hundredLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// prefixMissingText exercises the error margin detection case at the beginning of a body of text.
var prefixMissingText = `
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// suffixMissingText exercises the error margin detection case at the end of a body of text.
var suffixMissingText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80`

// fragmentedText is worst-case fragmentation that requires maximum reassembly with full error tolerance.
var fragmentedText = `
1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X`

// bigChunkText has a gap of maximal length to ensure that reassembly works.
var bigChunkText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100`

// The 50 license text leverages repeated ordered tokens to exercise reassembly options of repeated text in a worst-case situation.
var fiftyLicenseText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50`

// repeatedWithBreakagesText contains a repeated token sequence with interspersed
// breakage tokens (X) to exercise reassembly of imperfect repeated text.
var repeatedWithBreakagesText = `
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X`
|
|
|
func TestSearchSet_New(t *testing.T) {
|
|
tests := []struct {
|
|
description string
|
|
text string
|
|
q int
|
|
want *searchSet
|
|
}{
|
|
{
|
|
description: "Empty string",
|
|
text: "",
|
|
q: 4,
|
|
want: &searchSet{
|
|
Tokens: nil,
|
|
Hashes: make(hash),
|
|
Checksums: nil,
|
|
ChecksumRanges: nil,
|
|
},
|
|
},
|
|
{
|
|
description: "Small string",
|
|
text: "Hello world",
|
|
q: 4,
|
|
want: &searchSet{
|
|
Tokens: []indexedToken{
|
|
{Line: 1, ID: 1},
|
|
{Line: 1, ID: 2},
|
|
},
|
|
Hashes: hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
|
|
Checksums: []uint32{1957950203},
|
|
ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}},
|
|
nodes: []*node{{1957950203, &tokenRange{Start: 0, End: 2}}},
|
|
q: 2,
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
var trace strings.Builder
|
|
c := NewClassifier(.8) // This value doesn't affect the test.
|
|
c.SetTraceConfiguration(&TraceConfiguration{
|
|
TraceLicenses: "*",
|
|
TracePhases: "*",
|
|
Tracer: func(f string, args ...interface{}) {
|
|
trace.WriteString(fmt.Sprintf(f, args...))
|
|
},
|
|
})
|
|
c.AddContent("", "text", "", []byte(tt.text))
|
|
if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) {
|
|
t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want))
|
|
t.Errorf("Trace:\n%s", trace.String())
|
|
}
|
|
}
|
|
}
|
|
func TestFindPotentialMatches(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
src string
|
|
target string
|
|
confidence float64
|
|
expectedHits int
|
|
}{
|
|
{
|
|
name: "maximally fragmented",
|
|
src: hundredLicenseText,
|
|
target: fragmentedText,
|
|
confidence: .8,
|
|
expectedHits: 1,
|
|
},
|
|
{
|
|
name: "prefix missing",
|
|
src: hundredLicenseText,
|
|
target: prefixMissingText,
|
|
confidence: .8,
|
|
expectedHits: 1,
|
|
},
|
|
{
|
|
name: "suffix missing",
|
|
src: hundredLicenseText,
|
|
target: suffixMissingText,
|
|
confidence: .8,
|
|
expectedHits: 1,
|
|
},
|
|
{
|
|
name: "maximum-length error",
|
|
src: hundredLicenseText,
|
|
target: bigChunkText,
|
|
confidence: .8,
|
|
expectedHits: 1,
|
|
},
|
|
}
|
|
|
|
for _, test := range tests {
|
|
t.Run(test.name, func(t *testing.T) {
|
|
var trace strings.Builder
|
|
c := NewClassifier(test.confidence)
|
|
c.SetTraceConfiguration(&TraceConfiguration{
|
|
TraceLicenses: "*",
|
|
TracePhases: "*",
|
|
Tracer: func(f string, args ...interface{}) {
|
|
trace.WriteString(fmt.Sprintf(f, args...))
|
|
},
|
|
})
|
|
c.AddContent("", "source", "", []byte(test.src))
|
|
|
|
doc := c.createTargetIndexedDocument([]byte(test.target))
|
|
doc.generateSearchSet(c.q)
|
|
hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence)
|
|
if actual := len(hits); actual != test.expectedHits {
|
|
t.Errorf("got %d hits, wanted %d", actual, test.expectedHits)
|
|
t.Errorf("Trace:\n%s", trace.String())
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestFuseRanges(t *testing.T) {
|
|
// This test verifies that target ordering doesn't affect how ranges
|
|
// get fused. The data that is input for fuseRanges is not always
|
|
// ordered deterministically since it's the product of a map iteration.
|
|
// This was a bug that occurred during development.
|
|
tests := []struct {
|
|
name string
|
|
in matchRanges
|
|
out matchRanges
|
|
conf float64
|
|
size int
|
|
}{
|
|
{
|
|
name: "in target order",
|
|
conf: .8,
|
|
size: 100,
|
|
in: matchRanges{
|
|
&matchRange{
|
|
SrcStart: 50,
|
|
SrcEnd: 93,
|
|
TargetStart: 0,
|
|
TargetEnd: 43,
|
|
TokensClaimed: 43,
|
|
},
|
|
&matchRange{
|
|
SrcStart: 0,
|
|
SrcEnd: 43,
|
|
TargetStart: 0,
|
|
TargetEnd: 43,
|
|
TokensClaimed: 43,
|
|
},
|
|
&matchRange{
|
|
SrcStart: 10,
|
|
SrcEnd: 47,
|
|
TargetStart: 60,
|
|
TargetEnd: 97,
|
|
TokensClaimed: 37,
|
|
},
|
|
&matchRange{
|
|
SrcStart: 60,
|
|
SrcEnd: 97,
|
|
TargetStart: 60,
|
|
TargetEnd: 97,
|
|
TokensClaimed: 37,
|
|
},
|
|
},
|
|
out: matchRanges{
|
|
&matchRange{
|
|
SrcStart: 0,
|
|
SrcEnd: 97,
|
|
TargetStart: 0,
|
|
TargetEnd: 97,
|
|
TokensClaimed: 80,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
name: "not in-target order",
|
|
conf: .8,
|
|
size: 100,
|
|
in: matchRanges{
|
|
&matchRange{
|
|
SrcStart: 0,
|
|
SrcEnd: 43,
|
|
TargetStart: 0,
|
|
TargetEnd: 43,
|
|
TokensClaimed: 43,
|
|
},
|
|
&matchRange{
|
|
SrcStart: 50,
|
|
SrcEnd: 93,
|
|
TargetStart: 0,
|
|
TargetEnd: 43,
|
|
TokensClaimed: 43,
|
|
},
|
|
&matchRange{
|
|
SrcStart: 60,
|
|
SrcEnd: 97,
|
|
TargetStart: 60,
|
|
TargetEnd: 97,
|
|
TokensClaimed: 37,
|
|
},
|
|
&matchRange{
|
|
SrcStart: 10,
|
|
SrcEnd: 47,
|
|
TargetStart: 60,
|
|
TargetEnd: 97,
|
|
TokensClaimed: 37,
|
|
},
|
|
},
|
|
out: matchRanges{
|
|
&matchRange{
|
|
SrcStart: 0,
|
|
SrcEnd: 97,
|
|
TargetStart: 0,
|
|
TargetEnd: 97,
|
|
TokensClaimed: 80,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
for _, test := range tests {
|
|
t.Run(test.name, func(t *testing.T) {
|
|
var trace strings.Builder
|
|
c := NewClassifier(.8)
|
|
c.SetTraceConfiguration(&TraceConfiguration{
|
|
TraceLicenses: "*",
|
|
TracePhases: "*",
|
|
Tracer: func(f string, args ...interface{}) {
|
|
trace.WriteString(fmt.Sprintf(f, args...))
|
|
},
|
|
})
|
|
runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4)
|
|
actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100)
|
|
if !cmp.Equal(actual, test.out) {
|
|
t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out))
|
|
t.Errorf("Trace:\n%s", trace.String())
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestDetectRuns(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
matched matchRanges
|
|
targetLength, subsetLength, q int
|
|
threshold float64
|
|
expected []matchRange
|
|
}{
|
|
{
|
|
// For an exact match on 100 accurate tokens, the first q-gram
|
|
// is the only possible location hit we can return.
|
|
name: "precise matching on perfect runs",
|
|
threshold: 1.0,
|
|
targetLength: 100,
|
|
subsetLength: 100,
|
|
q: 4,
|
|
matched: matchRanges{
|
|
&matchRange{TargetStart: 0, TargetEnd: 100},
|
|
},
|
|
expected: []matchRange{
|
|
{SrcStart: 0, SrcEnd: 4},
|
|
},
|
|
},
|
|
{
|
|
// For an 80% match on 100 accurate tokens, the first 20 token
|
|
// positions represent possible matches.
|
|
name: "approximate matching on perfect runs",
|
|
threshold: 0.8,
|
|
targetLength: 100,
|
|
subsetLength: 100,
|
|
q: 4,
|
|
matched: matchRanges{
|
|
&matchRange{TargetStart: 0, TargetEnd: 100},
|
|
},
|
|
expected: []matchRange{
|
|
{SrcStart: 0, SrcEnd: 24},
|
|
},
|
|
},
|
|
{
|
|
name: "multiple runs in a single target",
|
|
threshold: 0.8,
|
|
targetLength: 100,
|
|
subsetLength: 10,
|
|
q: 4,
|
|
matched: matchRanges{
|
|
&matchRange{TargetStart: 0, TargetEnd: 10},
|
|
&matchRange{TargetStart: 20, TargetEnd: 25},
|
|
&matchRange{TargetStart: 50, TargetEnd: 60},
|
|
&matchRange{TargetStart: 70, TargetEnd: 77},
|
|
},
|
|
expected: []matchRange{
|
|
// Runs end on 4-gram boundaries
|
|
{SrcStart: 0, SrcEnd: 6},
|
|
// The run starts early because of error tolerance
|
|
{SrcStart: 48, SrcEnd: 56},
|
|
},
|
|
},
|
|
{
|
|
name: "bridge broken runs in a single target",
|
|
threshold: 0.8,
|
|
targetLength: 100,
|
|
subsetLength: 10,
|
|
q: 4,
|
|
matched: matchRanges{
|
|
&matchRange{TargetStart: 20, TargetEnd: 25},
|
|
&matchRange{TargetStart: 26, TargetEnd: 30},
|
|
&matchRange{TargetStart: 60, TargetEnd: 67},
|
|
&matchRange{TargetStart: 68, TargetEnd: 72},
|
|
},
|
|
expected: []matchRange{
|
|
// Runs end on 4-gram boundaries and start early because
|
|
// of error tolerance.
|
|
{SrcStart: 19, SrcEnd: 25},
|
|
{SrcStart: 59, SrcEnd: 67},
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, test := range tests {
|
|
t.Run(test.name, func(t *testing.T) {
|
|
var trace strings.Builder
|
|
c := NewClassifier(.8)
|
|
c.SetTraceConfiguration(&TraceConfiguration{
|
|
TraceLicenses: "*",
|
|
TracePhases: "*",
|
|
Tracer: func(f string, args ...interface{}) {
|
|
trace.WriteString(fmt.Sprintf(f, args...))
|
|
},
|
|
})
|
|
if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) {
|
|
t.Errorf(cmp.Diff(got, test.expected))
|
|
t.Errorf("Trace:\n%s", trace.String())
|
|
}
|
|
|
|
})
|
|
}
|
|
}
|