// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"html"
	"io"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

var eol = "\n"

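// header reports whether the token looks like a section header marker: a
// known list marker followed by '.' or ':' (but not ')'), or a number such
// as "1." or "1.2.3:" ending in '.', ':', or ')'. For example, header("iv.")
// and header("2)") are true, while header("a)") is false. cleanupToken drops
// such a token when it leads a line.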
func header(in string) bool {
	if len(in) == 0 {
		return false
	}
	p, e := in[:len(in)-1], in[len(in)-1]
	switch e {
	case '.', ':', ')':
		if listMarker[p] {
			if e != ')' {
				return true
			}
		}
		// Check for patterns like 1.2.3
		for _, r := range p {
			if unicode.IsDigit(r) || r == '.' {
				continue
			}
			return false
		}
		return true
	}
	return false
}

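// listMarker is the set of alphabetic and Roman-numeral list markers that
// header recognizes.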
var listMarker = func() map[string]bool {
	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
	l := map[string]bool{}
	for _, marker := range strings.Split(allListMarkers, " ") {
		l[marker] = true
	}
	return l
}()

// ignorableTexts matches lines, such as copyright notices and dates, that
// can be ignored to get a cleaner match.
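// For example, "Copyright (c) 2020 Google Inc." matches the first pattern
// below, and a date such as "2020-Jan-02" matches the third.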
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}

// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream never returns an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, it is safe to assume that tokenizeStream will not return
// an error.
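//
// A typical call (a sketch; dictionaries come from this package's
// newDictionary) looks like:
//
//	doc, err := tokenizeStream(strings.NewReader(text), true, newDictionary(), true)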
func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
	const bufSize = 1024
	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
	// in the buffer to ensure we never run out of bytes trying to finish
	// constructing a rune. These leftover 4 bytes will be copied to the start of
	// the buffer before additional bytes are read.
	tgt := bufSize - 4

	rbuf := make([]byte, bufSize)
	obuf := make([]byte, 0)
	linebuf := make([]tokenID, 0)
	idx := 0
	line := 1 // 1-based line count
	deferredEOL := false
	deferredWord := false
	// The tokenizer uses a local dictionary to conserve memory while
	// analyzing the input doc, and to avoid polluting the global dictionary.
	ld := newDictionary()

	var doc indexedDocument

	isEOF := func(in error) bool {
		return in == io.EOF || in == io.ErrUnexpectedEOF
	}

	// Read out the stream in chunks
	for {
		// Fill up the buffer with bytes to extract runes from.
		// idx is the offset holding any bytes left over from previous reads.
		n, err := io.ReadFull(src, rbuf[idx:])
		if isEOF(err) {
			// There are no more bytes to read, so we must now consume all bytes in the
			// buffer.
			tgt = idx + n
		} else if err != nil {
			return nil, err
		}

		for idx = 0; idx < tgt; {
			r, n := utf8.DecodeRune(rbuf[idx:])
			idx += n

			if r == '\n' {
				// Handle the end of a line.

				// If we are in a word (len(obuf) > 0) and the last rune is a '-',
				// strike that rune and keep accumulating; the word continues on the
				// next line. Otherwise we treat the newline like a space and flush
				// the word.

				if len(obuf) > 0 {
					if obuf[len(obuf)-1] == '-' {
						obuf = obuf[0 : len(obuf)-1]
						deferredEOL = true
						continue
					}

					// Append the word fragment to the line buffer
					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
				}

				// If there is something in the line to process, do so now
				if len(linebuf) > 0 {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
					linebuf = nil
					obuf = nil
				}
				// When not normalizing, emit an explicit EOL token so the line
				// structure of the input is preserved.
				if !normalize {
					tokID := dict.getIndex(eol)
					if tokID == unknownIndex {
						tokID = dict.add(eol)
					}
					doc.Tokens = append(doc.Tokens, indexedToken{
						ID:   tokID,
						Line: line})
				}
				line++
				continue
			}

			if len(obuf) == 0 {
				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
					// A number or word character starts an interesting word.
					// Now we slurp up all non-space runes and aggregate them as
					// a single word.

					// Buffer the initial rune, normalizing it to lower case if needed.
					if normalize {
						r = unicode.ToLower(r)
					}
					obuf = utf8.AppendRune(obuf, r)
				}
				continue
			}

			// At this point, len(obuf) > 0 and we are accumulating more runes
			// to complete a word.
			if unicode.IsSpace(r) {
				// If we have a deferred EOL, we need to pick up a non-space character
				// to resume the hyphenated word, so we just consume spaces until that
				// happens.
				if deferredEOL {
					continue
				}

				// This is a space between word characters, so we assemble the word as a
				// token and flush it out. Rewind the index so the space itself is
				// re-examined (and skipped) on the next pass, once the buffer is empty.
				idx -= n

				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
				if deferredWord {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
					linebuf = nil
					deferredWord = false
					// The newline that split the hyphenated word was deferred; now that
					// the completed word has been credited to the line it started on,
					// account for that newline.
					line++
				}
				obuf = make([]byte, 0)
				continue
			}

			if deferredEOL {
				// The first non-space rune after a deferred EOL resumes the
				// hyphenated word; mark the word deferred so it is credited to
				// the line it started on.
				deferredEOL = false
				deferredWord = true
			}
			// Perform token mappings for punctuation to emulate
			// normalizePunctuation. This returns a string, and each of its runes
			// needs to be injected.
			if rep, found := punctuationMappings[r]; found {
				for _, t := range rep {
					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
				}
				continue
			}

			// If it's not punctuation, lowercase and buffer the rune.
			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
		}

		// Break out if we have consumed all read bytes
		if isEOF(err) {
			break
		}

		// Copy the unconsumed bytes at the end of the buffer to the start
		// of the buffer so the next read appends after them.
		n = copy(rbuf, rbuf[idx:])
		idx = n
	}

	// Process the remaining bytes in the buffer
	if len(obuf) > 0 {
		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
	}
	if len(linebuf) > 0 {
		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
	}

	doc.dict = dict
	doc.generateFrequencies()
	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
	doc.Norm = doc.normalized()
	return &doc, nil
}

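// appendToDoc stringifies the line buffer and appends the resulting tokens,
// or the Match it produced, to the document being assembled.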
func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
	if tokens != nil {
		doc.Tokens = append(doc.Tokens, tokens...)
	} else if m != nil {
		doc.Matches = append(doc.Matches, m)
	}
}

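// stringifyLineBuf reconstructs the text of a line from its local token IDs.
// A line matching one of the ignorableTexts is reported as a Match (flagged
// as Copyright) instead of tokens; otherwise each token is cleaned up and
// indexed into the shared dictionary dict.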
func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
	if len(in) == 0 {
		return nil, nil
	}
	var sb strings.Builder
	for i, r := range in {
		out := ld.getWord(r)
		if out == "" {
			continue
		}
		sb.WriteString(out)
		if i < len(in)-1 {
			sb.WriteByte(' ')
		}
	}

	out := sb.String()

	for _, re := range ignorableTexts {
		if re.MatchString(out) {
			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
		}
	}

	var tokens []indexedToken
	for i, r := range in {
		txt := cleanupToken(i, ld.getWord(r), normalize)
		if txt != "" {
			var tokID tokenID
			if updateDict {
				tokID = dict.add(txt)
			} else {
				tokID = dict.getIndex(txt)
			}
			tokens = append(tokens, indexedToken{
				Line: line,
				ID:   tokID,
			})
		}
	}

	return tokens, nil
}

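// normalizeToken rewrites a token so equivalent variants are stored
// identically; for example, "https://foo" becomes "http://foo".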
func normalizeToken(in string) string {
	// This performs some preprocessing on the token.
	// It differs from cleanupToken in that the fixups here do not
	// require an exact match on the whole token.
	// Normalizing URLs from https to http is an example of a fix applied
	// here.
	return strings.ReplaceAll(in, "https", "http")
}

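// flushBuf converts the accumulated rune buffer into a normalized token and
// registers it in the local dictionary ld, returning the token's ID.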
func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
	// clean up the contents of the rune buffer
	token := string(obuf)
	// escape sequences can occur anywhere in the string, not just the beginning
	// so always attempt to unescape the word's content.
	token = html.UnescapeString(token)

	clean := normalizeToken(token)

	return ld.add(clean)
}

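// cleanupToken reduces a token to the characters significant for matching.
// A header marker leading a line is dropped entirely, a token starting with
// a digit keeps only digits, periods, and dashes, and any other token keeps
// only its letters.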
func cleanupToken(pos int, in string, normalizeWord bool) string {
	r, _ := utf8.DecodeRuneInString(in)
	var out strings.Builder
	if pos == 0 && header(in) {
		return ""
	}

	if !unicode.IsLetter(r) {
		if unicode.IsDigit(r) {
			// Based on analysis of the license corpus, the characters that are
			// significant are numbers, periods, and dashes. Anything else can be
			// safely discarded, and helps avoid matching failures due to
			// inconsistent whitespace and formatting.
			for _, c := range in {
				if unicode.IsDigit(c) || c == '.' || c == '-' {
					out.WriteRune(c)
				}
			}

			// Numbers should not end in a '.' since that usually marks the end
			// of a line, not a version number.
			res := out.String()
			for strings.HasSuffix(res, ".") {
				res = res[0 : len(res)-1]
			}
			return res
		}
	}

	// Remove internal hyphenation or URL constructs to better normalize
	// strings for matching.
	for _, c := range in {
		if unicode.IsLetter(c) {
			out.WriteRune(c)
		}
	}

	tok := out.String()
	if !normalizeWord {
		return tok
	}

	if iw, ok := interchangeableWords[tok]; ok {
		return iw
	}
	return tok
}

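// interchangeableWords maps variant (mostly British) spellings to a single
// canonical form so texts that differ only in those spellings still match.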
var interchangeableWords = map[string]string{
	"analyse": "analyze",
	"artefact": "artifact",
	"authorisation": "authorization",
	"authorised": "authorized",
	"calibre": "caliber",
	"cancelled": "canceled",
	"capitalisations": "capitalizations",
	"catalogue": "catalog",
	"categorise": "categorize",
	"centre": "center",
	"emphasised": "emphasized",
	"favour": "favor",
	"favourite": "favorite",
	"fulfil": "fulfill",
	"fulfilment": "fulfillment",
	"https": "http",
	"initialise": "initialize",
	"judgment": "judgement",
	"labelling": "labeling",
	"labour": "labor",
	"licence": "license",
	"maximise": "maximize",
	"modelled": "modeled",
	"modelling": "modeling",
	"offence": "offense",
	"optimise": "optimize",
	"organisation": "organization",
	"organise": "organize",
	"practise": "practice",
	"programme": "program",
	"realise": "realize",
	"recognise": "recognize",
	"signalling": "signaling",
	"utilisation": "utilization",
	"whilst": "while",
	"wilful": "wilfull",
	// TODO: These three need tokenizer magic
	"non commercial": "noncommercial",
	"per cent": "percent",
	"sub license": "sublicense",
}

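// punctuationMappings folds punctuation variants into canonical ASCII forms:
// Unicode dashes become '-', '©' becomes "(c)", section-mark-like characters
// become "(s)", and bullet characters become spaces.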
var punctuationMappings = map[rune]string{
	'-': "-",
	'‒': "-",
	'–': "-",
	'—': "-",
	'‐': "-",
	'©': "(c)",
	'§': "(s)",
	'¤': "(s)",
	'·': " ",
	'*': " ",
}