// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"
	"strings"
)

// Match is the information about a single instance of a detected match.
type Match struct {
	Name            string
	Confidence      float64
	MatchType       string
	Variant         string
	StartLine       int
	EndLine         int
	StartTokenIndex int
	EndTokenIndex   int
}

// Results captures the summary information and matches detected by the
// classifier.
type Results struct {
	Matches         Matches
	TotalInputLines int
}

// Matches is a sortable slice of Match.
type Matches []*Match

// Swap swaps two elements of Matches.
func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }

// Len returns the number of matches.
func (d Matches) Len() int { return len(d) }

// Less reports whether match i sorts before match j.
func (d Matches) Less(i, j int) bool {
	di, dj := d[i], d[j]
	// Return matches ordered by descending confidence.
	if di.Confidence != dj.Confidence {
		return di.Confidence > dj.Confidence
	}
	// Matches of equal confidence are ordered by their appearance.
	if di.StartTokenIndex != dj.StartTokenIndex {
		return di.StartTokenIndex < dj.StartTokenIndex
	}
	// Should never get here, but tiebreak based on the larger license.
	return di.EndTokenIndex > dj.EndTokenIndex
}

// match reports instances of the supplied content in the corpus.
func (c *Classifier) match(in io.Reader) (Results, error) {
	id, err := tokenizeStream(in, true, c.dict, false)
	if err != nil {
		return Results{}, err
	}

	firstPass := make(map[string]*indexedDocument)
	for l, d := range c.docs {
		sim := id.tokenSimilarity(d)

		if c.tc.traceTokenize(l) {
			c.tc.trace("Token similarity for %s: %.2f", l, sim)
		}

		if sim >= c.threshold {
			firstPass[l] = d
		}
	}

	if len(firstPass) == 0 {
		return Results{
			Matches:         nil,
			TotalInputLines: 0,
		}, nil
	}

	// Perform the expensive work of generating a searchset to look for token runs.
	id.generateSearchSet(c.q)

	var candidates Matches
	candidates = append(candidates, id.Matches...)

	for l, d := range firstPass {
		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
		for _, m := range matches {
			startIndex := m.TargetStart
			endIndex := m.TargetEnd
			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
				candidates = append(candidates, &Match{
					Name:            LicenseName(l),
					Variant:         variantName(l),
					MatchType:       detectionType(l),
					Confidence:      conf,
					StartLine:       id.Tokens[startIndex+startOffset].Line,
					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
					StartTokenIndex: startIndex + startOffset,
					EndTokenIndex:   endIndex - endOffset - 1,
				})
			}
		}
	}
	sort.Sort(candidates)
	retain := make([]bool, len(candidates))
	for i, c := range candidates {
		// Filter out overlapping licenses based primarily on confidence. Since
		// the candidates slice is ordered by confidence, we look for overlaps
		// and decide if we retain the record c.

		// For each candidate, we add it to the report unless we have a
		// higher-quality hit that contains these lines. In the case of two
		// licenses having overlap, we consider 'token density' to break ties.
		// If a less confident match of a larger license has more matching
		// tokens than a perfect match of a smaller license, we want to keep
		// it. This handles licenses that include another license as a subtext;
		// NPL containing MPL is a concrete example.

		keep := true
		proposals := make(map[int]bool)
		for j, o := range candidates {
			if j == i {
				break
			}
			// Only check containment against licenses that are still in
			// consideration at this point.
			if contains(c, o) && retain[j] {
				// The license here can override a previous detection, but that
				// isn't sufficient for it to be kept on its own. Consider the
				// licenses Xnet, MPL-1.1, and NPL-1.1 in a file that contains
				// just MPL-1.1. The confidence rating on NPL-1.1 will cause
				// Xnet to not be retained, which is correct, but NPL-1.1 itself
				// shouldn't be retained if the token confidence for MPL-1.1 is
				// higher, since the NPL-specific bits are missing.

				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
				cconf := ctoks * c.Confidence
				oconf := otoks * o.Confidence

				// If the two licenses have exactly the same token confidence,
				// the detection is ambiguous and we retain both, so the caller
				// can see and resolve the situation.
				if cconf > oconf {
					proposals[j] = false
				} else if oconf > cconf {
					keep = false
				}
			} else if overlaps(c, o) && retain[j] {
				// If the ending and starting lines exactly overlap, it's OK to
				// keep both.
				if c.StartLine != o.EndLine {
					keep = false
				}
			}

			if !keep {
				break
			}
		}
		if keep {
			retain[i] = true
			for p, v := range proposals {
				retain[p] = v
			}
		}
	}

	var out Matches
	for i, keep := range retain {
		if keep {
			out = append(out, candidates[i])
		}
	}
	return Results{
		Matches:         out,
		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
	}, nil
}

// Classifier provides methods for identifying open source licenses in text
// content.
type Classifier struct {
	tc        *TraceConfiguration
	dict      *dictionary
	docs      map[string]*indexedDocument
	threshold float64
	q         int // The value of q for q-grams in this corpus
}

// NewClassifier creates a classifier with an empty corpus.
func NewClassifier(threshold float64) *Classifier {
	classifier := &Classifier{
		tc:        new(TraceConfiguration),
		dict:      newDictionary(),
		docs:      make(map[string]*indexedDocument),
		threshold: threshold,
		q:         computeQ(threshold),
	}
	return classifier
}

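// A minimal usage sketch (illustrative only; the 0.8 threshold and the
// "./licenses" corpus directory are assumed values chosen for the example):
//
//	c := NewClassifier(0.8)
//	if err := c.LoadLicenses("./licenses"); err != nil {
//		log.Fatal(err)
//	}
//	res := c.Match(contents)
//	for _, m := range res.Matches {
//		fmt.Printf("%s (%s/%s): %.2f, lines %d-%d\n",
//			m.Name, m.MatchType, m.Variant, m.Confidence, m.StartLine, m.EndLine)
//	}
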
// Normalize takes input content and applies the following transforms to aid in
// identifying license content. The return value of this function is
// line-separated text which is the basis for position values returned by the
// classifier.
//
// 1. Breaks up long lines of text. This helps with detecting licenses like in
// TODO(wcn):URL reference
//
// 2. Certain ignorable texts are removed to aid matching blocks of text.
// Introductory lines such as "The MIT License" are removed. Copyright notices
// are removed since the parties are variable and shouldn't impact matching.
//
// It is NOT necessary to call this function to simply identify licenses in a
// file. It should only be called to aid presenting this information to the
// user in context (for example, creating diffs of differences to canonical
// licenses).
//
// It is an invariant of the classifier that calling Match(Normalize(in)) will
// return the same results as Match(in).
func (c *Classifier) Normalize(in []byte) []byte {
	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
	if err != nil {
		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
	}

	var buf bytes.Buffer

	switch len(doc.Tokens) {
	case 0:
		return nil
	case 1:
		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
		return buf.Bytes()
	}

	prevLine := 1
	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
	for _, t := range doc.Tokens[1:] {
		// Only write out an EOL token that incremented the line.
		if t.Line == prevLine+1 {
			buf.WriteString(eol)
		}

		// Only write tokens that aren't EOL.
		txt := c.dict.getWord(t.ID)

		if txt != eol {
			// Only put a space between tokens if the previous token was on the
			// same line. This prevents spaces after an EOL.
			if t.Line == prevLine {
				buf.WriteString(" ")
			}
			buf.WriteString(txt)
		}

		prevLine = t.Line
	}
	return buf.Bytes()
}

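// An illustrative sketch of the invariant above (the sample content is
// assumed, not taken from a real corpus):
//
//	in := []byte("The MIT License\n\nCopyright (c) 2020 Example Corp.\nPermission is hereby granted, ...")
//	norm := c.Normalize(in)
//	// norm holds the normalized, line-separated text; the line positions
//	// reported by c.Match(norm) refer to this text, and the licenses detected
//	// are the same as for c.Match(in).
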
// LoadLicenses adds the contents of the supplied directory to the corpus of
// the classifier.
func (c *Classifier) LoadLicenses(dir string) error {
	var files []string
	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return nil
		}
		if !strings.HasSuffix(path, "txt") {
			return nil
		}
		files = append(files, path)
		return nil
	})
	if err != nil {
		return err
	}

	for _, f := range files {
		relativePath := strings.Replace(f, dir, "", 1)
		sep := fmt.Sprintf("%c", os.PathSeparator)
		segments := strings.Split(relativePath, sep)
		// relativePath starts with a separator, so segments[0] is empty and
		// the category/name/variant segments occupy indices 1 through 3.
		if len(segments) < 4 {
			c.tc.trace("Insufficient segment count for path: %s", relativePath)
			continue
		}
		category, name, variant := segments[1], segments[2], segments[3]
		b, err := ioutil.ReadFile(f)
		if err != nil {
			return err
		}

		c.AddContent(category, name, variant, b)
	}
	return nil
}

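// The corpus layout LoadLicenses expects, inferred from the segment handling
// above (directory and file names below are illustrative):
//
//	<dir>/
//	    license/              <- category (reported as MatchType)
//	        MIT/              <- name (reported via LicenseName)
//	            license.txt   <- variant (reported via variantName)
//	    header/
//	        Apache-2.0/
//	            header.txt
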
// SetTraceConfiguration installs a tracing configuration for the classifier.
func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
	c.tc = in
	c.tc.init()
}

// Match finds matches within an unknown text. This will not modify the
// contents of the supplied byte slice.
func (c *Classifier) Match(in []byte) Results {
	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
	// will never return an error, so it's okay to ignore the return value in
	// this case.
	res, _ := c.MatchFrom(bytes.NewReader(in))
	return res
}

// MatchFrom finds matches within the read content.
func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
	return c.match(in)
}

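// A sketch of matching directly from a file rather than a byte slice (the
// path is illustrative):
//
//	f, err := os.Open("/path/to/COPYING")
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	res, err := c.MatchFrom(f)
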
// detectionType returns the match category for a corpus key, which is the
// first segment of its path-structured name.
func detectionType(in string) string {
	splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
	return splits[0]
}

// variantName returns the variant for a corpus key, which is the third
// segment of its path-structured name.
func variantName(in string) string {
	splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
	return splits[2]
}

// LicenseName produces the output name for a license, removing the internal
// structure of the filename in use.
func LicenseName(in string) string {
	splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator))
	return splits[1]
}

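// For a corpus key of the form <category>/<name>/<variant> (an illustrative
// key, assuming '/' as the path separator):
//
//	detectionType("license/MIT/license.txt") // "license"
//	LicenseName("license/MIT/license.txt")   // "MIT"
//	variantName("license/MIT/license.txt")   // "license.txt"
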
// contains returns true iff b is completely inside a.
func contains(a, b *Match) bool {
	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
}

// between returns true iff b <= a <= c.
func between(a, b, c int) bool {
	return b <= a && a <= c
}

// overlaps returns true iff the line ranges covered by a and b overlap.
func overlaps(a, b *Match) bool {
	return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
}