116 lines
3.7 KiB
Go
116 lines
3.7 KiB
Go
// Copyright 2020 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package classifier
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"github.com/sergi/go-diff/diffmatchpatch"
|
|
)
|
|
|
|
// This file contains word-diffing routines that build on top of the go-diff package.
|
|
// The algorithm implemented here is from the suggested word diffing technique in
|
|
// https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs
|
|
|
|
// diffRange returns the indices of the beginning and end locations of the diff
|
|
// that reconstruct (as best possible) the source value.
|
|
func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) {
|
|
var foundStart bool
|
|
var seen string
|
|
for end = 0; end < len(diffs); end++ {
|
|
if len(seen) > 1 && seen[:len(seen)-1] == known {
|
|
break
|
|
}
|
|
switch diffs[end].Type {
|
|
case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
|
|
if !foundStart {
|
|
start = end
|
|
foundStart = true
|
|
}
|
|
seen += diffs[end].Text + " "
|
|
}
|
|
}
|
|
return start, end
|
|
}
|
|
|
|
func docDiff(id string, doc1 *indexedDocument, doc1Start, doc1End int, doc2 *indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff {
|
|
chars1 := doc1.runes[doc1Start:doc1End]
|
|
chars2 := doc2.runes[doc2Start:doc2End]
|
|
|
|
dmp := diffmatchpatch.New()
|
|
diffs := dmp.DiffMainRunes(chars1, chars2, false)
|
|
|
|
// Recover the words from the previous rune encoding and return the textual diffs.
|
|
diffs = diffRunesToWords(diffs, doc1.dict)
|
|
return diffs
|
|
}
|
|
|
|
func diffWordsToRunes(doc *indexedDocument, start, end int) []rune {
|
|
// Creates a slice of runes using the indexed values as a basis for runes.
|
|
// The go-diff code basically does exactly this using ephemeral dictionaries
|
|
// for each input string. We leverage the fact we have a persistent dictionary
|
|
// to make this operation cheaper.
|
|
// TODO: perhaps we should cache these in the corpus?
|
|
runes := make([]rune, 0, end-start)
|
|
|
|
for _, t := range doc.Tokens[start:end] {
|
|
runes = append(runes, rune(t.ID))
|
|
}
|
|
return runes
|
|
}
|
|
|
|
// diffRunesToWords rehydrates the text in a diff from a string of word hashes to real words of text.
|
|
func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff {
|
|
hydrated := make([]diffmatchpatch.Diff, 0, len(diffs))
|
|
for _, aDiff := range diffs {
|
|
chars := []rune(aDiff.Text)
|
|
var sb strings.Builder
|
|
|
|
for i, r := range chars {
|
|
sb.WriteString(dict.getWord(tokenID(r)))
|
|
if (i + 1) < len(chars) {
|
|
sb.WriteByte(' ')
|
|
}
|
|
}
|
|
|
|
aDiff.Text = sb.String()
|
|
hydrated = append(hydrated, aDiff)
|
|
}
|
|
return hydrated
|
|
}
|
|
|
|
// wordLen reports how many words text contains; it is used by the scoring and
// distance functions.
//
// The count is the number of space characters plus one, so it is only
// meaningful for tokenizer output, where words are separated by exactly one
// space and the string has no leading or trailing whitespace. The empty string
// has zero words.
func wordLen(text string) int {
	if len(text) == 0 {
		return 0
	}
	separators := strings.Count(text, " ")
	return separators + 1
}
|
|
|
|
// textLength returns the number of tokens in the diff. This value is used to
|
|
// adjust the offset for detection, since this is the number of tokens
|
|
// discarded while matching a diff. By virtue of how it's called, there won't
|
|
// be "change" diffs (a paired insert/delete) so we can simplify the scan to
|
|
// just count up everything.
|
|
func textLength(diffs []diffmatchpatch.Diff) int {
|
|
l := 0
|
|
for _, d := range diffs {
|
|
l += wordLen(d.Text)
|
|
}
|
|
return l
|
|
}
|