// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package commentparser does a basic parse over a source file and returns all
// of the comments from the code. This is useful for when you want to analyze
// text written in comments (like copyright notices) but not in the code
// itself.
package commentparser
|
|
|
|
import (
|
|
"bytes"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"github.com/google/licenseclassifier/commentparser/language"
|
|
)
|
|
|
|
const (
	// Diagnostic format strings for unexpected end-of-input; the leading
	// %d is the 1-based line number at which EOF was encountered.
	// NOTE(review): none of these are referenced in this file as shown;
	// confirm they are used elsewhere before removing.
	eofInString            = "%d:EOF in string"
	eofInSingleLineComment = "%d:EOF in single line comment"
	eofInMultilineComment  = "%d:EOF in multiline comment"
)
|
|
|
|
// Parse parses the input data and returns the comments.
|
|
func Parse(contents []byte, lang language.Language) Comments {
|
|
if len(contents) == 0 {
|
|
return nil
|
|
}
|
|
|
|
c := string(contents)
|
|
if !strings.HasSuffix(c, "\n") {
|
|
// Force a terminating newline if one isn't present.
|
|
c += "\n"
|
|
}
|
|
i := &input{
|
|
s: c,
|
|
lang: lang,
|
|
offset: 0,
|
|
pos: position{line: 1, lineRune: []int{0}},
|
|
}
|
|
i.lex()
|
|
return i.comments
|
|
}
|
|
|
|
// Comment is either a single line or multiline comment in a source code file.
// A single line comment has StartLine equal to EndLine. The lines are 1-based.
type Comment struct {
	StartLine int    // 1-based line on which the comment begins.
	EndLine   int    // 1-based line on which the comment ends; equal to StartLine for single line comments.
	Text      string // The comment's text, without begin/end markers.
}

// Comments allows us to treat a slice of comments as a unit.
type Comments []*Comment
|
|
|
|
// ChunkIterator returns a read-only channel and generates the comments in a
// goroutine, then closes the channel.
//
// Comments are grouped into "chunks": runs of comments whose StartLines are
// on the same or consecutive lines. A gap of more than one line starts a new
// chunk.
func (c Comments) ChunkIterator() <-chan Comments {
	ch := make(chan Comments)
	go func() {
		// Sender closes the channel so receivers can range over it.
		defer close(ch)

		if len(c) == 0 {
			return
		}

		prevChunk := c[0]
		for index := 0; index < len(c); index++ {
			var chunk Comments
			// Accumulate comments until the next one starts more
			// than one line after the previous one.
			for ; index < len(c); index++ {
				if c[index].StartLine > prevChunk.StartLine+1 {
					break
				}
				// NOTE(review): this branch is unreachable — any
				// StartLine equal to prevChunk.StartLine+2 already
				// triggered the break above (+2 > +1). Possibly the
				// first comparison was meant to use prevChunk.EndLine;
				// confirm the intent before changing either check.
				if c[index].StartLine == prevChunk.StartLine+2 {
					if c[index].StartLine != c[index].EndLine || prevChunk.StartLine != prevChunk.EndLine {
						break
					}
				}
				chunk = append(chunk, c[index])
				prevChunk = c[index]
			}
			if len(chunk) == 0 {
				// Defensive: the inner loop appends at least one
				// comment whenever input remains.
				break
			}

			ch <- chunk
			if index >= len(c) {
				break
			}

			// The comment that terminated this chunk begins the next
			// one; index-- cancels the outer loop's increment so it is
			// revisited.
			prevChunk = c[index]
			index--
		}
	}()
	return ch
}
|
|
|
|
// StartLine is the line number (1-based) the first part of the comment block
|
|
// starts on.
|
|
func (c Comments) StartLine() int {
|
|
if len(c) == 0 {
|
|
return 0
|
|
}
|
|
return c[0].StartLine
|
|
}
|
|
|
|
// String creates a string out of the text of the comments. Comment begin and
|
|
// end markers are removed.
|
|
func (c Comments) String() string {
|
|
var s []string
|
|
for _, cmt := range c {
|
|
s = append(s, cmt.Text)
|
|
}
|
|
return strings.Join(s, "\n")
|
|
}
|
|
|
|
// position records the location of a lexeme.
type position struct {
	line int // Line number of input: 1-based
	// Rune offset from beginning of line: 0-based. Kept as a stack with
	// one entry per line consumed so that unreadRune can restore the
	// previous line's column when a '\n' is pushed back.
	lineRune []int
}

// input holds the current state of the lexer.
type input struct {
	s        string            // Entire input.
	lang     language.Language // Source code language.
	offset   int               // Offset into input.
	pos      position          // Current position in the input.
	comments Comments          // Comments in the source file.
}
|
|
|
|
// lex is called to obtain the comments.
//
// It scans the input rune by rune, skipping over quoted strings (which could
// contain comment start/end sequences) and appending multiline comments,
// single line comments, and — for Python — module-level docstrings to
// i.comments.
func (i *input) lex() {
	for {
		c, ok := i.peekRune()
		if !ok {
			break
		}

		switch c {
		case '"', '\'', '`': // String
			// Ignore strings because they could contain comment
			// start or end sequences which we need to ignore.
			if i.lang == language.HTML {
				// Quotes in HTML-like files aren't meaningful,
				// because it's basically plain text
				break
			}

			ok, hasEscape := i.lang.QuoteCharacter(c)
			if !ok {
				// c is not a quote character in this language.
				break
			}

			var content bytes.Buffer
			isDocString := false
			quote := string(c)
			if i.lang == language.Python {
				if c == '\'' && i.match("'''") {
					quote = "'''"
					// Assume module-level docstrings start at the
					// beginning of a line. Function docstrings not
					// supported. (A column of 3 means only the three
					// quote characters have been read on this line.)
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else if c == '"' && i.match(`"""`) {
					quote = `"""`
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else {
					i.readRune() // Eat quote.
				}
			} else {
				i.readRune() // Eat quote.
			}

			startLine := i.pos.line
			// Consume until the matching close quote, honoring
			// backslash escapes when the language supports them.
			for {
				c, ok = i.peekRune()
				if !ok {
					return
				}
				if hasEscape && c == '\\' {
					i.readRune() // Eat escape.
				} else if i.match(quote) {
					break
				} else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
					// JavaScript and Perl allow you to
					// specify regexes without quotes, but
					// which contain quotes. So treat the
					// newline as terminating the string.
					break
				}
				c := i.readRune()
				if isDocString {
					content.WriteRune(c)
				}
				if i.eof() {
					return
				}
			}
			if isDocString {
				// Python docstrings are reported as comments.
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      content.String(),
				})
			}
		default:
			startLine := i.pos.line
			var comment bytes.Buffer
			if ok, start, end := i.multiLineComment(); ok { // Multiline comment
				nesting := 0
				// NOTE(review): shadows the outer startLine; redundant
				// unless the start marker itself can span lines.
				startLine := i.pos.line
				for {
					if i.eof() {
						return
					}
					c := i.readRune()
					comment.WriteRune(c)
					if i.lang.NestedComments() && i.match(start) {
						// Allows nested comments.
						comment.WriteString(start)
						nesting++
					}
					if i.match(end) {
						if nesting > 0 {
							// End of a nested comment: keep the marker
							// in the text and continue scanning.
							comment.WriteString(end)
							nesting--
						} else {
							break
						}
					}
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			} else if i.singleLineComment() { // Single line comment
				// Collect text to end of line; the newline itself is
				// pushed back here and consumed by the readRune below.
				for {
					if i.eof() {
						return
					}
					c = i.readRune()
					if c == '\n' {
						i.unreadRune(c)
						break
					}
					comment.WriteRune(c)
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			}
		}

		i.readRune() // Ignore non-comments.
	}
}
|
|
|
|
// singleLineComment returns 'true' if we've run across a single line comment
|
|
// in the given language.
|
|
func (i *input) singleLineComment() bool {
|
|
if i.match(i.lang.SingleLineCommentStart()) {
|
|
return true
|
|
}
|
|
|
|
if i.lang == language.SQL {
|
|
return i.match(language.MySQL.SingleLineCommentStart())
|
|
} else if i.lang == language.ObjectiveC {
|
|
return i.match(language.Matlab.SingleLineCommentStart())
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// multiLineComment returns 'true' if we've run across a multiline comment in
|
|
// the given language.
|
|
func (i *input) multiLineComment() (bool, string, string) {
|
|
if s := i.lang.MultilineCommentStart(); i.match(s) {
|
|
return true, s, i.lang.MultilineCommentEnd()
|
|
}
|
|
|
|
if i.lang == language.SQL {
|
|
if s := language.MySQL.MultilineCommentStart(); i.match(s) {
|
|
return true, s, language.MySQL.MultilineCommentEnd()
|
|
}
|
|
} else if i.lang == language.ObjectiveC {
|
|
if s := language.Matlab.MultilineCommentStart(); i.match(s) {
|
|
return true, s, language.Matlab.MultilineCommentEnd()
|
|
}
|
|
}
|
|
|
|
return false, "", ""
|
|
}
|
|
|
|
// match returns 'true' if the next tokens in the stream match the given
|
|
// string.
|
|
func (i *input) match(s string) bool {
|
|
if s == "" {
|
|
return false
|
|
}
|
|
saved := s
|
|
var read []rune
|
|
for len(s) > 0 && !i.eof() {
|
|
r, size := utf8.DecodeRuneInString(s)
|
|
if c, ok := i.peekRune(); ok && c == r {
|
|
read = append(read, c)
|
|
} else {
|
|
// No match. Push the tokens we read back onto the stack.
|
|
for idx := len(read) - 1; idx >= 0; idx-- {
|
|
i.unreadRune(read[idx])
|
|
}
|
|
return false
|
|
}
|
|
s = s[size:]
|
|
i.readRune() // Eat token.
|
|
}
|
|
return string(read) == saved
|
|
}
|
|
|
|
// eof reports whether the input has reached the end of the file.
|
|
func (i *input) eof() bool {
|
|
return len(i.s) <= i.offset
|
|
}
|
|
|
|
// peekRune returns the next rune in the input without consuming it.
|
|
func (i *input) peekRune() (rune, bool) {
|
|
if i.eof() {
|
|
return rune(0), false
|
|
}
|
|
r, _ := utf8.DecodeRuneInString(i.s[i.offset:])
|
|
return r, true
|
|
}
|
|
|
|
// readRune consumes and returns the next rune in the input.
|
|
func (i *input) readRune() rune {
|
|
r, size := utf8.DecodeRuneInString(i.s[i.offset:])
|
|
if r == '\n' {
|
|
i.pos.line++
|
|
i.pos.lineRune = append(i.pos.lineRune, 0)
|
|
} else {
|
|
i.pos.lineRune[len(i.pos.lineRune)-1]++
|
|
}
|
|
i.offset += size
|
|
return r
|
|
}
|
|
|
|
// unreadRune winds the lexer's state back to before the rune was read.
|
|
func (i *input) unreadRune(c rune) {
|
|
p := make([]byte, utf8.UTFMax)
|
|
size := utf8.EncodeRune(p, c)
|
|
i.offset -= size
|
|
if c == '\n' {
|
|
i.pos.line--
|
|
if len(i.pos.lineRune) > 1 {
|
|
i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1]
|
|
} else {
|
|
i.pos.lineRune[len(i.pos.lineRune)-1] = 0
|
|
}
|
|
} else {
|
|
i.pos.lineRune[len(i.pos.lineRune)-1]--
|
|
}
|
|
}
|