// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"bytes"
	"io"
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
)
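
// TestCleanupToken checks that cleanupToken strips punctuation and stray
// characters from a token while preserving version-like numbers such as
// "1.2.3" and "12345-6789".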
func TestCleanupToken(t *testing.T) {
	tests := []struct {
		input  string
		output string
	}{
		{
			input:  "cleanup!",
			output: "cleanup",
		},
		{
			input:  "12345",
			output: "12345",
		},
		{
			input:  "r1@zx42-",
			output: "rzx",
		},
		{
			input:  "12345,",
			output: "12345",
		},
		{
			input:  "12345-6789",
			output: "12345-6789",
		},
		{
			input:  "1(a)",
			output: "1",
		},
		{
			input:  "1.2.3",
			output: "1.2.3",
		},
	}
	for _, test := range tests {
		if got := cleanupToken(0, test.input, true); got != test.output {
			t.Errorf("%q: got %q want %q", test.input, got, test.output)
		}
	}
}
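
// TestTokenize checks tokenizeStream's full output document: token IDs and
// line numbers, the normalized text, and any copyright matches detected
// along the way.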
func TestTokenize(t *testing.T) {
	tests := []struct {
		name   string
		input  string
		output *indexedDocument
	}{
		{
			name: "hyphenization recovery",
			input: `basket-
ball`,
			output: &indexedDocument{
				Tokens: []indexedToken{
					{
						ID:   1,
						Line: 1,
					},
				},
				Norm: "basketball",
			},
		},
		{
			name: "basic scenario",
			input: `The AWESOME Project LICENSE

Modifi-
cations prohibited

Copyright 1996-2002, 2006 by A. Developer

Introduction

The AWESOME Project`,
			output: &indexedDocument{
				Tokens: []indexedToken{
					{
						ID:   1,
						Line: 1,
					},
					{
						ID:   2,
						Line: 1,
					},
					{
						ID:   3,
						Line: 1,
					},
					{
						ID:   4,
						Line: 1,
					},
					{
						ID:   5,
						Line: 3,
					},
					{
						ID:   6,
						Line: 4,
					},
					{
						ID:   7,
						Line: 8,
					},
					{
						ID:   1,
						Line: 10,
					},
					{
						ID:   2,
						Line: 10,
					},
					{
						ID:   3,
						Line: 10,
					},
				},
				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
			},
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
			if err != nil {
				t.Errorf("%s failed: got unexpected error %v", test.name, err)
			}
			if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
				t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
			}
		})
	}
}
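
// mockReader feeds the tokenizer a stream of 'a' bytes, failing the test if
// the sizes of the reads it receives do not match the expected schedule.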
type mockReader struct {
	t        *testing.T
	schedule []int
	cur      int
}

func (m *mockReader) Read(buf []byte) (int, error) {
	if m.cur > len(m.schedule) {
		m.t.Fatal("Unexpected read on mock")
	}

	if m.cur == len(m.schedule) {
		return 0, io.EOF
	}

	if len(buf) != m.schedule[m.cur] {
		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
	}
	m.cur++

	for i := range buf {
		buf[i] = 'a'
	}

	return len(buf), nil
}
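
// TestTokenizerBuffering ensures tokenizeStream reads its input in the
// expected chunk sizes and that bytes spanning buffer boundaries are
// assembled into a single token.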
func TestTokenizerBuffering(t *testing.T) {
	dict := newDictionary()
	mr := mockReader{
		t:        t,
		schedule: []int{1024, 1020, 1020},
	}
	d, err := tokenizeStream(&mr, true, dict, true)
	if err != nil {
		t.Errorf("Read returned unexpected error: %v", err)
	}

	// Do a basic test to make sure the data returned is sound
	if len(d.Tokens) != 1 {
		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
	}

	if len(d.Norm) != 3064 {
		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
	}
}
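
// TestTokenizer exercises tokenizeStream across a variety of punctuation,
// numbering, and comment-formatting cases.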
func TestTokenizer(t *testing.T) {
	// This test focuses primarily on the textual content extracted and does not look
	// at the other parts of the document.
	tests := []struct {
		name   string
		input  string
		output string
	}{
		{
			name:   "Basic Tokens",
			input:  "Here are some words. ",
			output: "here are some words",
		},
		{
			name:   "skips bullet headers",
			input:  "* item the first\n· item the second",
			output: "item the first item the second",
		},
		{
			name:   "preserves version numbers but not header numbers",
			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
			output: "sample rules python 2.7.8 is a version of the language",
		},
		{
			name:   "preserves version numbers across line breaks",
			input:  "Python version\n2.7.8 is a version of the language.",
			output: "python version 2.7.8 is a version of the language",
		},
		{
			name:   "preserves punctuation",
			input:  "Bill, Larry, and Sergey agree precision is critical!",
			output: "bill larry and sergey agree precision is critical",
		},
		{
			name:   "ignores comment characters and bullet formatting",
			input:  "/* * item the first",
			output: "item the first",
		},
		{
			name:   "produces blank line as needed",
			input:  "/* *",
			output: "",
		},
		{
			name:   "clobbers header looking thing as appropriate",
			input:  " iv. this is a test",
			output: "this is a test",
		},
		{
			name:   "clobbers header looking thing as appropriate even in comment",
			input:  "/* 1.2.3. this is a test",
			output: "this is a test",
		},
		{
			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
			input:  "This is version 1.1.",
			output: "this is version 1.1",
		},
		{
			name:   "copyright inside a comment",
			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
			output: "",
		},
		{
			name: "FTL copyright text",
			input: `The FreeType Project LICENSE

2006-Jan-27
2006-01-27

Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg

Introduction

The FreeType Project`,
			output: "the freetype project license introduction the freetype project",
		},
		{
			name: "Separated text",
			input: `distribution and modifi‐
cation follow.`,
			output: "distribution and modification follow",
		},
		{
			name:   "preserve internal references, even on line break",
			input:  "(ii) should be preserved as (ii) is preserved",
			output: "ii should be preserved as ii is preserved",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			dict := newDictionary()
			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
			if err != nil {
				t.Errorf("%s failed: got unexpected error %v", test.name, err)
			}
			var b strings.Builder
			for _, tok := range d.Tokens {
				b.WriteString(dict.getWord(tok.ID))
				b.WriteString(" ")
			}
			actual := strings.TrimSpace(b.String())
			if actual != test.output {
				t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
			}
		})
	}
}