// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
||
|
|
|
||
|
|
package classifier
|
||
|
|
|
||
|
|
import (
|
||
|
|
"fmt"
|
||
|
|
"strings"
|
||
|
|
"testing"
|
||
|
|
|
||
|
|
"github.com/sergi/go-diff/diffmatchpatch"
|
||
|
|
)
|
||
|
|
|
||
|
|
func TestLevenshteinDiff(t *testing.T) {
|
||
|
|
tests := []struct {
|
||
|
|
name string
|
||
|
|
diffs []diffmatchpatch.Diff
|
||
|
|
expected int
|
||
|
|
}{
|
||
|
|
{
|
||
|
|
name: "identical text",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "equivalent text",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: 0,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "changed text",
|
||
|
|
// Adjacent inverse changes get scored with the maximum of the 2 change scores
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffDelete,
|
||
|
|
Text: "removed words",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffInsert,
|
||
|
|
Text: "inserted text here",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: 3,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "inserted text",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "identical words",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffInsert,
|
||
|
|
Text: "inserted",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: 1,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "deleted text",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffDelete,
|
||
|
|
Text: "many extraneous deleted words",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "before the equivalent text",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: 4,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, test := range tests {
|
||
|
|
t.Run(test.name, func(t *testing.T) {
|
||
|
|
if got := diffLevenshteinWord(test.diffs); got != test.expected {
|
||
|
|
t.Errorf("got %d wanted %d", got, test.expected)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
func TestScoreDiffs(t *testing.T) {
|
||
|
|
tests := []struct {
|
||
|
|
name string
|
||
|
|
license string
|
||
|
|
diffs []diffmatchpatch.Diff
|
||
|
|
expected int
|
||
|
|
}{
|
||
|
|
{
|
||
|
|
name: "identical text",
|
||
|
|
license: "License/MIT/license.txt",
|
||
|
|
diffs: nil,
|
||
|
|
expected: 0,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "acceptable change",
|
||
|
|
license: "License/MIT/license.txt",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "license",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffInsert,
|
||
|
|
Text: "as needed",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffDelete,
|
||
|
|
Text: "when necessary",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: 2,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "version change",
|
||
|
|
license: "License/MIT/license.txt",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "version",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffInsert,
|
||
|
|
Text: "2",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: versionChange,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "license name change by deletion",
|
||
|
|
license: "License/MIT/license.txt",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "gnu",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffDelete,
|
||
|
|
Text: "lesser",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: lesserGPLChange,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "license name change by insertion",
|
||
|
|
license: "License/MIT/license.txt",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "gnu",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffInsert,
|
||
|
|
Text: "lesser",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: lesserGPLChange,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "license name change by name insertion",
|
||
|
|
license: "License/ImageMagick/license.txt",
|
||
|
|
diffs: []diffmatchpatch.Diff{
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffEqual,
|
||
|
|
Text: "license",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
Type: diffmatchpatch.DiffInsert,
|
||
|
|
Text: "imagemagick",
|
||
|
|
},
|
||
|
|
},
|
||
|
|
expected: introducedPhraseChange,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, test := range tests {
|
||
|
|
t.Run(test.name, func(t *testing.T) {
|
||
|
|
if got := scoreDiffs(test.license, test.diffs); got != test.expected {
|
||
|
|
t.Errorf("got %d, want %d", got, test.expected)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
func TestConfidencePercentage(t *testing.T) {
|
||
|
|
tests := []struct {
|
||
|
|
name string
|
||
|
|
klen, distance int
|
||
|
|
expected float64
|
||
|
|
}{
|
||
|
|
{
|
||
|
|
name: "empty text",
|
||
|
|
klen: 0,
|
||
|
|
distance: 0,
|
||
|
|
expected: 1.0,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "99% match",
|
||
|
|
klen: 100,
|
||
|
|
distance: 1,
|
||
|
|
expected: 0.99,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, test := range tests {
|
||
|
|
t.Run(test.name, func(t *testing.T) {
|
||
|
|
if got := confidencePercentage(test.klen, test.distance); got != test.expected {
|
||
|
|
t.Errorf("got %v want %v", got, test.expected)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
func TestScore(t *testing.T) {
|
||
|
|
tests := []struct {
|
||
|
|
name string
|
||
|
|
known, unknown string
|
||
|
|
expectedConf float64
|
||
|
|
expectedStart, expectedEnd int
|
||
|
|
}{
|
||
|
|
{
|
||
|
|
name: "identical text",
|
||
|
|
known: "here is some sample text",
|
||
|
|
unknown: "here is some sample text",
|
||
|
|
expectedConf: 1.00,
|
||
|
|
expectedStart: 0,
|
||
|
|
expectedEnd: 0,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "close match with matching sizes",
|
||
|
|
known: "here is some sample text",
|
||
|
|
unknown: "here is different sample text",
|
||
|
|
expectedConf: .8,
|
||
|
|
expectedStart: 0,
|
||
|
|
expectedEnd: 0,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "close match with different sizes",
|
||
|
|
known: "here is some sample text",
|
||
|
|
unknown: "padding before here is different sample text",
|
||
|
|
expectedConf: .8,
|
||
|
|
expectedStart: 2,
|
||
|
|
expectedEnd: 0,
|
||
|
|
},
|
||
|
|
{
|
||
|
|
name: "no match due to unacceptable diff",
|
||
|
|
known: "here is some sample text for version 2 of the license",
|
||
|
|
unknown: "padding before here is different sample text for version 3 of the licenses",
|
||
|
|
expectedConf: 0.0,
|
||
|
|
expectedStart: 0,
|
||
|
|
expectedEnd: 0,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
|
||
|
|
for _, test := range tests {
|
||
|
|
t.Run(test.name, func(t *testing.T) {
|
||
|
|
var trace strings.Builder
|
||
|
|
c := NewClassifier(.8)
|
||
|
|
c.SetTraceConfiguration(&TraceConfiguration{
|
||
|
|
TraceLicenses: "*",
|
||
|
|
TracePhases: "*",
|
||
|
|
Tracer: func(f string, args ...interface{}) {
|
||
|
|
trace.WriteString(fmt.Sprintf(f, args...))
|
||
|
|
},
|
||
|
|
})
|
||
|
|
c.AddContent("", "known", "", []byte(test.known))
|
||
|
|
kd := c.getIndexedDocument("", "known", "")
|
||
|
|
ud := c.createTargetIndexedDocument([]byte(test.unknown))
|
||
|
|
// The name for the test needs to look like an asset path so we prepend
|
||
|
|
// the directory.
|
||
|
|
conf, so, eo := c.score("License/"+test.name, ud, kd, 0, ud.size())
|
||
|
|
|
||
|
|
success := true
|
||
|
|
if conf != test.expectedConf {
|
||
|
|
t.Errorf("conf: got %v want %v", conf, test.expectedConf)
|
||
|
|
success = false
|
||
|
|
}
|
||
|
|
if so != test.expectedStart {
|
||
|
|
t.Errorf("start offset: got %v want %v", so, test.expectedStart)
|
||
|
|
success = false
|
||
|
|
}
|
||
|
|
if eo != test.expectedEnd {
|
||
|
|
t.Errorf("end offset: got %v want %v", so, test.expectedEnd)
|
||
|
|
success = false
|
||
|
|
}
|
||
|
|
|
||
|
|
if !success {
|
||
|
|
t.Errorf("Trace:\n%s", trace.String())
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
}
|