// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Select test data comes from
// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue

package stringclassifier
|
||
|
||
import (
|
||
"reflect"
|
||
"regexp"
|
||
"sort"
|
||
"testing"
|
||
|
||
"github.com/sergi/go-diff/diffmatchpatch"
|
||
)
|
||
|
||
// Text fixtures shared by the tests in this file.
var (
	// gettysburg is the reference Gettysburg Address excerpt.
	gettysburg = `Four score and seven years ago our fathers brought forth
on this continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.`

	// modifiedGettysburg is gettysburg with "a new nation" reworded.
	modifiedGettysburg = `Four score and seven years ago our fathers brought forth
on this continent, a nation that was new and improved, conceived in Liberty, and
dedicated to the proposition that all men are created equal.`

	// gettysburgExtraWord is gettysburg with "Foobar" glued onto the final
	// period, without any separating whitespace.
	gettysburgExtraWord = `Four score and seven years ago our fathers brought forth
on this continent, a new nation, conceived in Liberty, and dedicated to the
proposition that all men are created equal.Foobar`

	// declaration is the reference Declaration of Independence excerpt.
	declaration = `When in the Course of human events, it becomes necessary
for one people to dissolve the political bands which have connected them with
another, and to assume among the powers of the earth, the separate and equal
station to which the Laws of Nature and of Nature's God entitle them, a decent
respect to the opinions of mankind requires that they should declare the causes
which impel them to the separation.`

	// loremipsum is the reference filler-text excerpt.
	loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`

	// modifiedLorem is loremipsum with one word dropped, one word moved and
	// one word inserted.
	modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla
varius enim mattis, lectus id, aliquet rhoncus sem. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet sit nisl in placerat. Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`

	// lessModifiedLorem is loremipsum with a single word ("sem") dropped.
	lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla
varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor
feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim
vulputate, tempus leo commodo, accumsan nulla.`

	// humourOfIreland is unrelated prose that the tests place around the
	// texts they expect to be matched.
	humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic
philosophy from liquor; they are “elevated,” not depressed, and do not deem
it essential to the production of a poem that its author should be a cynic or
an evil prophet. One of the best attributes of Irish poetry is its constant
expression of the natural emotions. Previous to the close of the
seventeenth[xvi] century, it is said, drunkenness was not suggested by the
poets as common in Ireland—the popularity of Bacchanalian songs since that
date seems to prove that the vice soon became a virtue. Maginn is the
noisiest of modern revellers, and easily roars the others down.
`

	// fellowInTheGoatSkin is unrelated prose that the tests use as the
	// leading part of their unknown inputs.
	fellowInTheGoatSkin = `There was a poor widow living down there near the Iron
Forge when the country was all covered with forests, and you might walk on
the tops of trees from Carnew to the Lady’s Island, and she had one boy. She
was very poor, as I said before, and was not able to buy clothes for her son.
So when she was going out she fixed him snug and combustible in the ash-pit,
and piled the warm ashes about him. The boy knew no better, and was as happy
as the day was long; and he was happier still when a neighbour[10] gave his
mother a kid to keep him company when herself was abroad. The kid and the lad
played like two may-boys; and when she was old enough to give milk, wasn’t it
a godsend to the little family? You won’t prevent the boy from growing up
into a young man, but not a screed of clothes had he then no more than when
he was a gorsoon.
`

	// oldCrowYoungCrow is unrelated prose that the tests use as the trailing
	// part of their unknown inputs.
	oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and
he said to him, “Now, my son,” says he, “listen to the advice I’m going to
give you. If you see a person coming near you and stooping, mind yourself,
and be on your keeping; he’s stooping for a stone to throw at you.”

“But tell me,” says the young crow, “what should I do if he had a stone
already down in his pocket?”

“Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the
devil another learning I’m able to give you.”
`

	// nullifiable is made up almost entirely of punctuation; the
	// "Nullifiable text" case expects it to yield no matches.
	nullifiable = `[[ , _ , _ , _
? _ : _
? _ : _
? _ : _
]
}
`
)

// nonWords matches one or more consecutive punctuation characters.
var nonWords = regexp.MustCompile(`[[:punct:]]+`)
|
||
|
||
// removeNonWords removes non-words from the string, replacing them with empty
|
||
// string. (This is meant to exercise tokenization problems.)
|
||
func removeNonWords(s string) string {
|
||
return nonWords.ReplaceAllString(s, "")
|
||
}
|
||
|
||
func TestClassify_NearestMatch(t *testing.T) {
|
||
c := New(DefaultConfidenceThreshold, FlattenWhitespace)
|
||
c.AddValue("gettysburg", gettysburg)
|
||
c.AddValue("declaration", declaration)
|
||
c.AddValue("loremipsum", loremipsum)
|
||
|
||
tests := []struct {
|
||
description string
|
||
input string // input string to match
|
||
name string // name of expected nearest match
|
||
minConf float64 // the lowest confidence accepted for the match
|
||
maxConf float64 // the highest confidence we expect for this match
|
||
}{
|
||
{
|
||
description: "Full Declaration",
|
||
input: declaration,
|
||
name: "declaration",
|
||
minConf: 1.0,
|
||
maxConf: 1.0,
|
||
},
|
||
{
|
||
description: "Modified Lorem",
|
||
input: modifiedLorem,
|
||
name: "loremipsum",
|
||
minConf: 0.90,
|
||
maxConf: 0.91,
|
||
},
|
||
{
|
||
description: "Modified Gettysburg",
|
||
input: modifiedGettysburg,
|
||
name: "gettysburg",
|
||
minConf: 0.86,
|
||
maxConf: 0.87,
|
||
},
|
||
}
|
||
|
||
for _, tt := range tests {
|
||
m := c.NearestMatch(tt.input)
|
||
|
||
if got, want := m.Name, tt.name; got != want {
|
||
t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want)
|
||
}
|
||
if got, want := m.Confidence, tt.minConf; got < want {
|
||
t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want)
|
||
}
|
||
if got, want := m.Confidence, tt.maxConf; got > want {
|
||
t.Errorf("NearestMatch(%q) = %v, want maxiumum of %v", tt.description, got, want)
|
||
}
|
||
}
|
||
}
|
||
|
||
// result describes a single match that a test case expects to get back.
type result struct {
	key    string // key of expected nearest match
	offset int    // offset of match in unknown string

	// Confidence window accepted for the match: minConf is the lowest
	// confidence accepted and maxConf the highest we expect. The bounds were
	// obtained by simply running the classifier and noting its output. A
	// value above maxConf is fine and the tests can be adjusted to account
	// for it; a value below minConf should be carefully scrutinized before
	// the tests are adjusted.
	minConf, maxConf float64
}
|
||
|
||
func TestClassify_MultipleMatch(t *testing.T) {
|
||
c := New(DefaultConfidenceThreshold, FlattenWhitespace)
|
||
c.AddValue("gettysburg", gettysburg)
|
||
c.AddValue("declaration", declaration)
|
||
c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:])
|
||
c.AddValue("loremipsum", loremipsum)
|
||
|
||
cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords)
|
||
cNormalize.AddValue("gettysburg", gettysburg)
|
||
|
||
tests := []struct {
|
||
description string
|
||
c *Classifier
|
||
input string // input string to match
|
||
want []result
|
||
}{
|
||
{
|
||
description: "Exact text match",
|
||
c: c,
|
||
input: fellowInTheGoatSkin + declaration + humourOfIreland,
|
||
want: []result{
|
||
{
|
||
key: "declaration",
|
||
offset: 845,
|
||
minConf: 1.0,
|
||
maxConf: 1.0,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Partial text match",
|
||
c: c,
|
||
input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland,
|
||
want: []result{
|
||
{
|
||
key: "loremipsum",
|
||
offset: 845,
|
||
minConf: 0.90,
|
||
maxConf: 0.91,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Two partial matches",
|
||
c: c,
|
||
input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow,
|
||
want: []result{
|
||
{
|
||
key: "loremipsum",
|
||
offset: 845,
|
||
minConf: 0.90,
|
||
maxConf: 0.91,
|
||
},
|
||
{
|
||
key: "gettysburg",
|
||
offset: 1750,
|
||
minConf: 0.86,
|
||
maxConf: 0.87,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Partial matches of similar text",
|
||
c: c,
|
||
input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow,
|
||
want: []result{
|
||
{
|
||
key: "loremipsum",
|
||
offset: 1750,
|
||
minConf: 0.98,
|
||
maxConf: 0.99,
|
||
},
|
||
{
|
||
key: "loremipsum",
|
||
offset: 845,
|
||
minConf: 0.90,
|
||
maxConf: 0.91,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Nullifiable text",
|
||
c: c,
|
||
input: nullifiable,
|
||
want: nil,
|
||
},
|
||
{
|
||
description: "No match",
|
||
c: c,
|
||
input: fellowInTheGoatSkin + humourOfIreland,
|
||
want: nil,
|
||
},
|
||
{
|
||
description: "Exact text match, with extra word and non-word normalizer",
|
||
c: cNormalize,
|
||
input: fellowInTheGoatSkin + gettysburgExtraWord + humourOfIreland,
|
||
want: []result{
|
||
{
|
||
key: "gettysburg",
|
||
offset: 825,
|
||
minConf: 1.0,
|
||
maxConf: 1.0,
|
||
},
|
||
},
|
||
},
|
||
}
|
||
|
||
for _, tt := range tests {
|
||
matches := tt.c.MultipleMatch(tt.input)
|
||
if len(matches) != len(tt.want) {
|
||
t.Errorf("MultipleMatch(%q) not enough matches = %v, want %v", tt.description, len(matches), len(tt.want))
|
||
}
|
||
|
||
for i := 0; i < len(matches); i++ {
|
||
m := matches[i]
|
||
w := tt.want[i]
|
||
if got, want := m.Name, w.key; got != want {
|
||
t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want)
|
||
}
|
||
if got, want := m.Confidence, w.minConf; got < want {
|
||
t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want)
|
||
}
|
||
if got, want := m.Confidence, w.maxConf; got > want {
|
||
t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want)
|
||
}
|
||
if got, want := m.Offset, w.offset; got != want {
|
||
t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestClassify_DiffRatio(t *testing.T) {
|
||
tests := []struct {
|
||
x, y string
|
||
want float64
|
||
}{
|
||
{"", "", 1.0},
|
||
{"a", "b", 1.0},
|
||
{"", "abc", 0},
|
||
{"ab", "c", 0.5},
|
||
{"a", "bc", 0.5},
|
||
{"a", "bcde", 0.25},
|
||
}
|
||
|
||
for _, tt := range tests {
|
||
if got, want := diffRatio(tt.x, tt.y), tt.want; got != want {
|
||
t.Errorf("diffRatio(%q, %q) = %f, want %f", tt.x, tt.y, got, want)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestClassify_Matches(t *testing.T) {
|
||
tests := []struct {
|
||
description string
|
||
matches Matches
|
||
want Matches
|
||
}{
|
||
{
|
||
description: "Different names, same confidences, same offset",
|
||
matches: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
want: Matches{
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Same names, different confidences, same offset",
|
||
matches: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.90,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
want: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.90,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Same names, same confidences, different offsets",
|
||
matches: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 42,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
want: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 42,
|
||
},
|
||
},
|
||
},
|
||
|
||
{
|
||
description: "Different names, different confidences, same offset",
|
||
matches: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.90,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
want: Matches{
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.90,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Different names, same confidences, different offset",
|
||
matches: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 37,
|
||
},
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
want: Matches{
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.42,
|
||
Offset: 37,
|
||
},
|
||
},
|
||
},
|
||
{
|
||
description: "Different names, different confidences, different offset",
|
||
matches: Matches{
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.90,
|
||
Offset: 37,
|
||
},
|
||
},
|
||
want: Matches{
|
||
&Match{
|
||
Name: "b",
|
||
Confidence: 0.90,
|
||
Offset: 37,
|
||
},
|
||
&Match{
|
||
Name: "a",
|
||
Confidence: 0.42,
|
||
Offset: 0,
|
||
},
|
||
},
|
||
},
|
||
}
|
||
|
||
for _, tt := range tests {
|
||
sort.Sort(tt.matches)
|
||
if !reflect.DeepEqual(tt.matches, tt.want) {
|
||
for _, x := range tt.matches {
|
||
t.Errorf("got: %v", x)
|
||
}
|
||
for _, x := range tt.want {
|
||
t.Errorf("want: %v", x)
|
||
}
|
||
t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestClassify_DiffRangeEnd(t *testing.T) {
|
||
dmp := diffmatchpatch.New()
|
||
tests := []struct {
|
||
description string
|
||
unknown string
|
||
known string
|
||
end int
|
||
}{
|
||
{
|
||
description: "identical",
|
||
unknown: declaration,
|
||
known: declaration,
|
||
end: 1,
|
||
},
|
||
{
|
||
description: "lorem",
|
||
unknown: lessModifiedLorem,
|
||
known: loremipsum,
|
||
end: 3,
|
||
},
|
||
{
|
||
description: "gettysburg",
|
||
unknown: modifiedGettysburg,
|
||
known: gettysburg,
|
||
end: 19,
|
||
},
|
||
}
|
||
|
||
for _, tt := range tests {
|
||
diffs := dmp.DiffMain(tt.unknown, tt.known, true)
|
||
if e := diffRangeEnd(tt.known, diffs); e != tt.end {
|
||
t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end)
|
||
}
|
||
}
|
||
}
|
||
|
||
func BenchmarkClassifier(b *testing.B) {
|
||
c := New(DefaultConfidenceThreshold, FlattenWhitespace)
|
||
c.AddValue("gettysburg", gettysburg)
|
||
c.AddValue("declaration", declaration)
|
||
c.AddValue("loremipsum", loremipsum)
|
||
|
||
b.ResetTimer()
|
||
for i := 0; i < b.N; i++ {
|
||
c.NearestMatch(modifiedLorem)
|
||
}
|
||
}
|