229 lines
8.7 KiB
Go
229 lines
8.7 KiB
Go
// Copyright 2020 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package classifier
|
|
|
|
import (
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/davecgh/go-spew/spew"
|
|
"github.com/sergi/go-diff/diffmatchpatch"
|
|
)
|
|
|
|
// return values for the distance function that explain why a diff
|
|
// is not an acceptable match for the source document.
|
|
const (
|
|
versionChange = -1
|
|
introducedPhraseChange = -2
|
|
lesserGPLChange = -3
|
|
)
|
|
|
|
// score computes a metric of similarity between the known and unknown
|
|
// document, including the offsets into the unknown that yield the content
|
|
// generating the computed similarity.
|
|
func (c *Classifier) score(id string, unknown, known *indexedDocument, unknownStart, unknownEnd int) (float64, int, int) {
|
|
if c.tc.traceScoring(known.s.origin) {
|
|
c.tc.trace("Scoring %s: [%d-%d]", known.s.origin, unknownStart, unknownEnd)
|
|
}
|
|
|
|
knownLength := known.size()
|
|
diffs := docDiff(id, unknown, unknownStart, unknownEnd, known, 0, knownLength)
|
|
|
|
start, end := diffRange(known.Norm, diffs)
|
|
distance := scoreDiffs(id, diffs[start:end])
|
|
|
|
if c.tc.traceScoring(known.s.origin) {
|
|
c.tc.trace("Diffs against %s:\n%s", known.s.origin, spew.Sdump(diffs[start:end]))
|
|
}
|
|
|
|
if distance < 0 {
|
|
// If the distance is negative, this indicates an unacceptable diff so we return a zero-confidence match.
|
|
if c.tc.traceScoring(known.s.origin) {
|
|
c.tc.trace("Distance result %v, rejected match", distance)
|
|
}
|
|
return 0.0, 0, 0
|
|
}
|
|
|
|
// Applying the diffRange-generated offsets provides the run of text from the
|
|
// target most closely correlated to the source. This is the process for
|
|
// compensating for errors from the searchset. With the reduced text, we
|
|
// compute the final confidence score and exact text locations for the
|
|
// matched content.
|
|
// The diff slice consists of three regions: an optional deletion diff for
|
|
// target text before the source, n elements that represent the diff between
|
|
// the source and target, and an optional deletion diff for text after the
|
|
// target. The helper functions identify the portions of the slice
|
|
// corresponding to those regions. This results in a more accurate
|
|
// confidence score and better position detection of the source in the
|
|
// target.
|
|
conf, so, eo := confidencePercentage(knownLength, distance), textLength(diffs[:start]), textLength(diffs[end:])
|
|
|
|
if c.tc.traceScoring(known.s.origin) {
|
|
c.tc.trace("Score result: %v [%d-%d]", conf, so, eo)
|
|
}
|
|
return conf, so, eo
|
|
}
|
|
|
|
// confidencePercentage computes a confidence match score for the lengths,
|
|
// handling the cases where source and target lengths differ.
|
|
func confidencePercentage(klen, distance int) float64 {
|
|
// No text is matched at 100% confidence (avoid divide by zero).
|
|
if klen == 0 {
|
|
return 1.0
|
|
}
|
|
|
|
// Return a computed fractional match against the known text.
|
|
return 1.0 - float64(distance)/float64(klen)
|
|
}
|
|
|
|
// diffLevenshteinWord computes word-based Levenshtein count.
|
|
func diffLevenshteinWord(diffs []diffmatchpatch.Diff) int {
|
|
levenshtein := 0
|
|
insertions := 0
|
|
deletions := 0
|
|
|
|
for _, aDiff := range diffs {
|
|
switch aDiff.Type {
|
|
case diffmatchpatch.DiffInsert:
|
|
insertions += wordLen(aDiff.Text)
|
|
case diffmatchpatch.DiffDelete:
|
|
deletions += wordLen(aDiff.Text)
|
|
case diffmatchpatch.DiffEqual:
|
|
// A deletion and an insertion is one substitution.
|
|
levenshtein += max(insertions, deletions)
|
|
insertions = 0
|
|
deletions = 0
|
|
}
|
|
}
|
|
|
|
levenshtein += max(insertions, deletions)
|
|
return levenshtein
|
|
}
|
|
|
|
func isVersionNumber(in string) bool {
|
|
for _, r := range in {
|
|
if !unicode.IsDigit(r) && r != '.' {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// scoreDiffs returns a score rating the acceptability of these diffs. A
|
|
// negative value means that the changes represented by the diff are not an
|
|
// acceptable transformation since it would change the underlying license. A
|
|
// positive value indicates the Levenshtein word distance.
|
|
func scoreDiffs(id string, diffs []diffmatchpatch.Diff) int {
|
|
// We make a pass looking for unacceptable substitutions
|
|
// Delete diffs are always ordered before insert diffs. This is leveraged to
|
|
// analyze a change by checking an insert against the delete text that was
|
|
// previously cached.
|
|
prevText := ""
|
|
prevDelete := ""
|
|
for i, diff := range diffs {
|
|
text := diff.Text
|
|
switch diff.Type {
|
|
case diffmatchpatch.DiffInsert:
|
|
num := text
|
|
if i := strings.Index(num, " "); i != -1 {
|
|
num = num[0:i]
|
|
}
|
|
if isVersionNumber(num) && strings.HasSuffix(prevText, "version") {
|
|
if !strings.HasSuffix(prevText, "the standard version") && !strings.HasSuffix(prevText, "the contributor version") {
|
|
return versionChange
|
|
}
|
|
}
|
|
// There are certain phrases that can't be introduced to make a license
|
|
// hit. TODO: would like to generate this programmatically. Most of
|
|
// these are words or phrases that appear in a single/small number of
|
|
// licenses. Can we leverage frequency analysis to identify these
|
|
// interesting words/phrases and auto-extract them?
|
|
|
|
inducedPhrases := map[string][]string{
|
|
"AGPL": {"affero"},
|
|
"Atmel": {"atmel"},
|
|
"Apache": {"apache"},
|
|
"BSD": {"bsd"},
|
|
"BSD-3-Clause-Attribution": {"acknowledgment"},
|
|
"bzip2": {"seward"},
|
|
"GPL-2.0-with-GCC-exception": {"gcc linking exception"},
|
|
"GPL-2.0-with-autoconf-exception": {"autoconf exception"},
|
|
"GPL-2.0-with-bison-exception": {"bison exception"},
|
|
"GPL-2.0-with-classpath-exception": {"class path exception"},
|
|
"GPL-2.0-with-font-exception": {"font exception"},
|
|
"LGPL-2.0": {"library"},
|
|
"ImageMagick": {"imagemagick"},
|
|
"PHP": {"php"},
|
|
"SISSL": {"sun standards"},
|
|
"SGI-B": {"silicon graphics"},
|
|
"SunPro": {"sunpro"},
|
|
"X11": {"x consortium"},
|
|
}
|
|
|
|
for k, ps := range inducedPhrases {
|
|
if strings.HasPrefix(LicenseName(id), k) {
|
|
for _, p := range ps {
|
|
if strings.Index(text, p) != -1 {
|
|
// Check to make sure there isn't a corresponding diff for this
|
|
// insert that also contains the text. This prevents against diff
|
|
// blocks that are too big and force a false hit on this check,
|
|
// which usually happens with URLs since they are stored in one
|
|
// token but can happen in other cases as well. We don't look just
|
|
// for delete diffs because the subsequent text may reference the
|
|
// content in case a URL was truncated.
|
|
if i+1 < len(diffs) && strings.Index(diffs[i+1].Text, p) != -1 {
|
|
continue
|
|
}
|
|
return introducedPhraseChange
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Ignore changes between "library" and "lesser" in a GNU context as they
|
|
// changed the terms, but look for introductions of Lesser that would
|
|
// otherwise disqualify a match.
|
|
if text == "lesser" && strings.HasSuffix(prevText, "gnu") && prevDelete != "library" {
|
|
// The LGPL 3.0 doesn't have a standard header, so people tend to craft
|
|
// their own. As a result, sometimes the warranty clause refers to the
|
|
// GPL instead of the LGPL. This is fine from a licensing perspective,
|
|
// but we need to tweak matching to ignore that particular case. In
|
|
// other circumstances, inserting or removing the word Lesser in the
|
|
// GPL context is not an acceptable change. There is also a reference to
|
|
// it when suggesting to use the LGPL.
|
|
if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
|
|
return lesserGPLChange
|
|
}
|
|
}
|
|
case diffmatchpatch.DiffEqual:
|
|
prevText = text
|
|
prevDelete = ""
|
|
|
|
case diffmatchpatch.DiffDelete:
|
|
// Avoid substitution in most cases. The two exceptions are with usage
|
|
// statements that are talking about *another* license, and don't affect
|
|
// the detection of the current license.
|
|
if (text == "lesser" || text == "library") && strings.HasSuffix(prevText, "gnu") {
|
|
// Same as above to avoid matching GPL instead of LGPL here.
|
|
if !strings.Contains(prevText, "warranty") && !strings.Contains(prevText, "is covered by the gnu") {
|
|
return lesserGPLChange
|
|
}
|
|
}
|
|
prevDelete = text
|
|
}
|
|
}
|
|
return diffLevenshteinWord(diffs)
|
|
}
|