unplugged-system/external/licenseclassifier/v2/tokenizer_test.go

316 lines
6.9 KiB
Go
Raw Permalink Normal View History

// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"bytes"
"io"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
)
// TestCleanupToken verifies the token normalization rules: trailing
// punctuation is stripped, pure numbers and version-like strings
// (1.2.3, 12345-6789) survive intact, and mixed alphanumeric noise is
// reduced to its letters.
func TestCleanupToken(t *testing.T) {
	cases := []struct {
		in   string
		want string
	}{
		{in: "cleanup!", want: "cleanup"},
		{in: "12345", want: "12345"},
		{in: "r1@zx42-", want: "rzx"},
		{in: "12345,", want: "12345"},
		{in: "12345-6789", want: "12345-6789"},
		{in: "1(a)", want: "1"},
		{in: "1.2.3", want: "1.2.3"},
	}
	for _, tc := range cases {
		got := cleanupToken(0, tc.in, true)
		if got != tc.want {
			t.Errorf("%q: got %q want %q", tc.in, got, tc.want)
		}
	}
}
// TestTokenize checks full-document tokenization: token IDs and line
// numbers, hyphenation recovery across line breaks, copyright-notice
// match extraction, and the normalized text of the document.
func TestTokenize(t *testing.T) {
	tests := []struct {
		name   string
		input  string
		output *indexedDocument
	}{
		{
			name: "hyphenization recovery",
			input: `basket-
ball`,
			output: &indexedDocument{
				Tokens: []indexedToken{
					{ID: 1, Line: 1},
				},
				Norm: "basketball",
			},
		},
		{
			name: "basic scenario",
			input: `The AWESOME Project LICENSE
Modifi-
cations prohibited
Copyright 1996-2002, 2006 by A. Developer
Introduction
The AWESOME Project`,
			output: &indexedDocument{
				Tokens: []indexedToken{
					{ID: 1, Line: 1},
					{ID: 2, Line: 1},
					{ID: 3, Line: 1},
					{ID: 4, Line: 1},
					{ID: 5, Line: 3},
					{ID: 6, Line: 4},
					{ID: 7, Line: 8},
					{ID: 1, Line: 10},
					{ID: 2, Line: 10},
					{ID: 3, Line: 10},
				},
				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
				Norm:    "the awesome project license modifications prohibited introduction the awesome project",
			},
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
			if err != nil {
				// Fatal, not Error: d is unusable if tokenization failed,
				// and the diff below would dereference it.
				t.Fatalf("%s failed: got unexpected error %v", test.name, err)
			}
			// cmp.Diff(x, y) reports x as "-" and y as "+", so pass
			// (want, got) and label the diff accordingly. The original
			// call had the arguments and the label inverted.
			if diff := cmp.Diff(test.output, d, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
				t.Errorf("%s failed:\nDiff(-want +got): %s", test.name, diff)
			}
		})
	}
}
// mockReader is an io.Reader that serves runs of 'a' bytes whose
// lengths must follow a fixed schedule. It fails the test if a read
// arrives with an unexpected buffer size, or if any read occurs after
// the schedule has been exhausted and EOF already delivered.
type mockReader struct {
	t        *testing.T
	schedule []int // expected len(buf) for each successive Read call
	cur      int   // index of the next scheduled read
}

// Read fills buf entirely with 'a' and advances the schedule. Once the
// schedule is exhausted it reports io.EOF.
func (m *mockReader) Read(buf []byte) (int, error) {
	switch {
	case m.cur > len(m.schedule):
		m.t.Fatal("Unexpected read on mock")
	case m.cur == len(m.schedule):
		return 0, io.EOF
	case len(buf) != m.schedule[m.cur]:
		m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur])
	}
	m.cur++
	for i := 0; i < len(buf); i++ {
		buf[i] = 'a'
	}
	return len(buf), nil
}
// TestTokenizerBuffering drives tokenizeStream through a mock reader
// with a fixed read-size schedule, verifying the tokenizer's internal
// buffering: the reads must arrive with the expected buffer sizes, and
// the 3064 contiguous 'a' bytes must normalize to a single token.
func TestTokenizerBuffering(t *testing.T) {
	dict := newDictionary()
	mr := mockReader{
		t:        t,
		schedule: []int{1024, 1020, 1020},
	}
	d, err := tokenizeStream(&mr, true, dict, true)
	if err != nil {
		// Fatal, not Error: d is unusable (possibly nil) on error and
		// the assertions below dereference it.
		t.Fatalf("Read returned unexpected error: %v", err)
	}
	// Do a basic test to make sure the data returned is sound
	if len(d.Tokens) != 1 {
		t.Errorf("Got %d tokens, expected 1", len(d.Tokens))
	}
	if len(d.Norm) != 3064 {
		t.Errorf("Got %d bytes, expected 3064", len(d.Norm))
	}
}
// TestTokenizer exercises text normalization rules end to end,
// comparing only the extracted word stream (via the dictionary), not
// the rest of the indexed document: bullet/header/comment stripping,
// version-number preservation, punctuation removal, hyphenation
// recovery, and copyright-line clobbering.
func TestTokenizer(t *testing.T) {
	// This test focuses primarily on the textual content extracted and does not look
	// at the other parts of the document.
	tests := []struct {
		name   string
		input  string
		output string
	}{
		{
			name:   "Basic Tokens",
			input:  "Here are some words. ",
			output: "here are some words",
		},
		{
			name:   "skips bullet headers",
			input:  "* item the first\n· item the second",
			output: "item the first item the second",
		},
		{
			name:   "preserves version numbers but not header numbers",
			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
			output: "sample rules python 2.7.8 is a version of the language",
		},
		{
			name:   "preserves version numbers across line breaks",
			input:  "Python version\n2.7.8 is a version of the language.",
			output: "python version 2.7.8 is a version of the language",
		},
		{
			name:   "preserves punctuation",
			input:  "Bill, Larry, and Sergey agree precision is critical!",
			output: "bill larry and sergey agree precision is critical",
		},
		{
			name:   "ignores comment characters and bullet formatting",
			input:  "/* * item the first",
			output: "item the first",
		},
		{
			name:   "produces blank line as needed",
			input:  "/* *",
			output: "",
		},
		{
			name:   "clobbers header looking thing as appropriate",
			input:  " iv. this is a test",
			output: "this is a test",
		},
		{
			name:   "clobbers header looking thing as appropriate even in comment",
			input:  "/* 1.2.3. this is a test",
			output: "this is a test",
		},
		{
			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
			input:  "This is version 1.1.",
			output: "this is version 1.1",
		},
		{
			name:   "copyright inside a comment",
			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
			output: "",
		},
		{
			name: "FTL copyright text",
			input: `The FreeType Project LICENSE
2006-Jan-27
2006-01-27
Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg
Introduction
The FreeType Project`,
			output: "the freetype project license introduction the freetype project",
		},
		{
			name: "Separated text",
			input: `distribution and modifi
cation follow.`,
			output: "distribution and modification follow",
		},
		{
			name:   "preserve internal references, even on line break",
			input:  "(ii) should be preserved as (ii) is preserved",
			output: "ii should be preserved as ii is preserved",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			dict := newDictionary()
			d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
			if err != nil {
				// Fatal, not Error: ranging over d.Tokens below would
				// panic if d is nil on error.
				t.Fatalf("%s failed: got unexpected error %v", test.name, err)
			}
			// Reassemble the token stream into a space-separated string
			// of dictionary words for comparison.
			words := make([]string, 0, len(d.Tokens))
			for _, tok := range d.Tokens {
				words = append(words, dict.getWord(tok.ID))
			}
			actual := strings.Join(words, " ")
			if actual != test.output {
				t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
			}
		})
	}
}