Refactor CountStats state machine

dbaggerman · dbaggerman · commit 9770aba7cd9d · 2021-03-28T21:10:25.000+11:00
diff --git a/examples/issue246.py b/examples/issue246.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+"""
+Docstrings containing an apostrophe (') are handled incorrectly
+The line above is counted as code despite being in the middle of a docstring.
+The end of docstring flag seems to be changed to an apostrophe,
+which means the next line will not exit the docstring.
+"""
+# Code containing single quotes will exit the docstring,
+# but presuming the quotes are balanced the second
+# quote will put us in string scanning mode.
+if __name__ == '__main__':
+    print('Hello, World!')
+# Not counted as a comment
+
+# ^ Not counted as a blank line
+# Break out of string scanner with unbalanced single quote: '
+    exit(0)
diff --git a/processor/file.go b/processor/file.go
@@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
 			Extension:         extension,
 			PossibleLanguages: language,
 			Bytes:             fileInfo.Size(),
+			EndPoint:          int(fileInfo.Size() - 1),
 		}
 	} else if Verbose {
 		printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))
diff --git a/processor/state_blank.go b/processor/state_blank.go
@@ -0,0 +1,60 @@
+package processor
+
+// StateBlank is a field-less State implementation that labels itself "blank".
+type StateBlank struct {}
+
+// String returns the fixed label "blank" for this state.
+func (state *StateBlank) String() string {
+	return "blank"
+}
+
+// Process matches the token starting at job.Content[index] and picks the state
+// that should handle what follows.
+//
+// It returns the index to resume from, the line type recorded so far, and the
+// next state:
+//   - multi-line comment opener: advances to the last byte of the opener and
+//     enters StateCommentMulti; a LINE_BLANK lineType becomes LINE_COMMENT,
+//     any other lineType is kept.
+//   - single-line comment opener: enters StateCommentSingle with the same
+//     LINE_BLANK -> LINE_COMMENT promotion.
+//   - string opener: verifyIgnoreEscape supplies the resume index and the
+//     docstring / skip-escape flags; a docstring enters StateDocString (with
+//     the comment promotion), anything else enters StateString as LINE_CODE.
+//   - complexity token: counted when it sits at offset 0 or directly after
+//     whitespace, then handled as code.
+//   - anything else: LINE_CODE and StateCode.
+func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
+	switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType {
+	case TMlcomment:
+		commentType := lineType
+		if commentType == LINE_BLANK {
+			commentType = LINE_COMMENT
+		}
+
+		// Land on the final byte of the opening token.
+		index += offsetJump - 1
+		return index, commentType, NewStateCommentMulti(endString)
+
+	case TSlcomment:
+		commentType := lineType
+		if commentType == LINE_BLANK {
+			commentType = LINE_COMMENT
+		}
+		return index, commentType, &StateCommentSingle{}
+
+	case TString:
+		index, docString, skipEsc := verifyIgnoreEscape(lang, job, index)
+
+		if docString {
+			commentType := lineType
+			if commentType == LINE_BLANK {
+				commentType = LINE_COMMENT
+			}
+
+			return index, commentType, &StateDocString{
+				End:     endString,
+				SkipEsc: skipEsc,
+			}
+		}
+
+		return index, LINE_CODE, &StateString{
+			End:     endString,
+			SkipEsc: skipEsc,
+		}
+
+	case TComplexity:
+		if index == 0 || isWhitespace(job.Content[index-1]) {
+			job.Complexity++
+		}
+		// A complexity token is code. Classify it like the default branch
+		// instead of reporting LINE_BLANK and staying in the blank state,
+		// which threw away the incoming lineType.
+		return index, LINE_CODE, &StateCode{}
+
+	default:
+		return index, LINE_CODE, &StateCode{}
+	}
+}
+
+// Reset returns LINE_BLANK together with the receiver itself.
+// NOTE(review): presumably called by the driver loop at a line boundary to
+// obtain the starting line type and state for the next line — confirm.
+func (state *StateBlank) Reset() (LineType, State) {
+	return LINE_BLANK, state
+}
diff --git a/processor/state_code.go b/processor/state_code.go
@@ -0,0 +1,92 @@
+package processor
+
+// StateCode is a field-less State implementation that labels itself "code".
+type StateCode struct {}
+
+// String returns the fixed label "code" for this state.
+func (state *StateCode) String() string {
+	return "code"
+}
+
+// Process scans job.Content from index while in code, stopping at the first
+// event that ends the line or changes state.
+//
+// It returns the index it stopped at, the line type, and the next state:
+//   - newline, binary byte, or end of the scan range: LINE_CODE, same state
+//     (a binary byte also sets job.Binary).
+//   - string opener not preceded by a backslash: StateDocString (LINE_BLANK is
+//     promoted to LINE_COMMENT) or StateString (LINE_CODE), as decided by
+//     verifyIgnoreEscape.
+//   - single-line / multi-line comment opener: LINE_CODE with
+//     StateCommentSingle / StateCommentMulti, positioned on the last byte of
+//     the opener.
+//   - complexity token at offset 0 or after whitespace: increments
+//     job.Complexity and keeps scanning.
+//
+// When Duplicates is set, every byte that passes shouldProcess is also written
+// to job.Hash.
+func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
+	// Hacky fix to https://github.com/boyter/scc/issues/181
+	endPoint := job.EndPoint
+	if endPoint > len(job.Content) {
+		endPoint--
+	}
+
+	var i int
+	for i = index; i < endPoint; i++ {
+		curByte := job.Content[i]
+
+		if curByte == '\n' {
+			return i, LINE_CODE, state
+		}
+
+		if isBinary(i, curByte) {
+			job.Binary = true
+			return i, LINE_CODE, state
+		}
+
+		if !shouldProcess(curByte, lang.ProcessMask) {
+			continue
+		}
+
+		if Duplicates {
+			// Technically this is wrong because we skip bytes so this is not a true
+			// hash of the file contents, but for duplicate files it shouldn't matter
+			// as both will skip the same way.
+			// Hash the byte under the cursor (i), not the fixed start offset
+			// (index); slicing Content avoids allocating a one-byte slice per byte.
+			job.Hash.Write(job.Content[i : i+1])
+		}
+
+		switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
+		case TString:
+			// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string
+
+			// It is safe to -1 here as to enter the code state we need to have
+			// transitioned from blank to here hence i should always be >= 1
+			// This check is to ensure we aren't in a character declaration
+			// TODO this should use language features
+			if job.Content[i-1] == '\\' {
+				break // from switch, not from the loop
+			}
+
+			// Use a distinct name so the loop cursor is not shadowed.
+			next, docString, skipEsc := verifyIgnoreEscape(lang, job, i)
+
+			if docString {
+				commentType := lineType
+				if commentType == LINE_BLANK {
+					commentType = LINE_COMMENT
+				}
+
+				return next, commentType, &StateDocString{
+					End:     endString,
+					SkipEsc: skipEsc,
+				}
+			}
+
+			return next, LINE_CODE, &StateString{
+				End:     endString,
+				SkipEsc: skipEsc,
+			}
+
+		case TSlcomment:
+			i += offsetJump - 1
+			return i, LINE_CODE, &StateCommentSingle{}
+
+		case TMlcomment:
+			i += offsetJump - 1
+
+			return i, LINE_CODE, NewStateCommentMulti(endString)
+
+		case TComplexity:
+			if i == 0 || isWhitespace(job.Content[i-1]) {
+				job.Complexity++
+			}
+		}
+	}
+
+	return i, LINE_CODE, state
+}
+
+// Reset returns LINE_BLANK and a fresh StateBlank, so the code state is not
+// carried over by the caller.
+// NOTE(review): presumably called by the driver loop at a line boundary — confirm.
+func (state *StateCode) Reset() (LineType, State) {
+	return LINE_BLANK, &StateBlank{}
+}
diff --git a/processor/state_comment_multi.go b/processor/state_comment_multi.go
@@ -0,0 +1,70 @@
+package processor
+
+// StateCommentMulti is the State used while inside a multi-line comment.
+type StateCommentMulti struct {
+	// Stack holds the closing tokens of the currently open comments; the last
+	// element is the one Process looks for next.
+	Stack     [][]byte
+}
+
+// String returns the fixed label "multiline-comment" for this state.
+func (state *StateCommentMulti) String() string {
+	return "multiline-comment"
+}
+
+// NewStateCommentMulti returns a StateCommentMulti whose Stack contains only
+// token, the closing token of the comment that was just opened.
+func NewStateCommentMulti(token []byte) *StateCommentMulti {
+	return &StateCommentMulti{
+		Stack: [][]byte{token},
+	}
+}
+
+// Process walks job.Content from index while inside a multi-line comment.
+//
+// It stops at a newline or at job.EndPoint and returns the position with the
+// unchanged lineType and state. When the closing token on top of the stack is
+// found it moves to that token's last byte and either pops one nesting level
+// (staying in this state) or, for the outermost comment, hands over to
+// StateBlank. When lang.Nested is set and another comment opener matches, its
+// closing token is pushed and the position moves to the opener's last byte.
+func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
+	pos := index
+	for ; pos < job.EndPoint; pos++ {
+		b := job.Content[pos]
+		if b == '\n' {
+			return pos, lineType, state
+		}
+
+		closer := state.peek()
+		if checkForMatchSingle(b, pos, job.EndPoint, closer, job) {
+			// Land on the final byte of the closing token.
+			pos += len(closer) - 1
+
+			if len(state.Stack) > 1 {
+				state.pop()
+				return pos, lineType, state
+			}
+			return pos, lineType, &StateBlank{}
+		}
+
+		// Nested openers are only looked for after the closer check, which
+		// keeps the common path cheap.
+		if !lang.Nested {
+			continue
+		}
+		if matched, jump, closing := lang.MultiLineComments.Match(job.Content[pos:]); matched != 0 {
+			pos += jump - 1
+			state.push(closing)
+			return pos, lineType, state
+		}
+	}
+
+	return pos, lineType, state
+}
+
+// Reset returns LINE_COMMENT together with the receiver itself, so the
+// multi-line comment state (including its Stack) is kept.
+// NOTE(review): presumably called by the driver loop at a line boundary — confirm.
+func (state *StateCommentMulti) Reset() (LineType, State) {
+	return LINE_COMMENT, state
+}
+
+// peek returns the closing token on top of the stack without removing it.
+// It indexes the last element, so the stack must not be empty.
+func (state *StateCommentMulti) peek() []byte {
+	return state.Stack[len(state.Stack)-1]
+}
+
+// push places token on top of the stack.
+func (state *StateCommentMulti) push(token []byte) {
+	state.Stack = append(state.Stack, token)
+}
+
+// pop drops the top token from the stack; the stack must not be empty.
+func (state *StateCommentMulti) pop() {
+	state.Stack = state.Stack[:len(state.Stack)-1]
+}
diff --git a/processor/state_comment_single.go b/processor/state_comment_single.go
@@ -0,0 +1,24 @@
+package processor
+
+// StateCommentSingle is a field-less State implementation that labels itself
+// "comment".
+type StateCommentSingle struct {}
+
+// String returns the fixed label "comment" for this state.
+func (state *StateCommentSingle) String() string {
+	return "comment"
+}
+
+// Process skips the remainder of a single-line comment: it advances from index
+// until it reaches a newline or job.EndPoint and returns that position with the
+// lineType and state unchanged.
+func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
+	pos := index
+	for pos < job.EndPoint && job.Content[pos] != '\n' {
+		pos++
+	}
+
+	return pos, lineType, state
+}
+
+// Reset returns LINE_BLANK and a fresh StateBlank, so the single-line comment
+// state is not carried over by the caller.
+// NOTE(review): presumably called by the driver loop at a line boundary — confirm.
+func (state *StateCommentSingle) Reset() (LineType, State) {
+	return LINE_BLANK, &StateBlank{}
+}
diff --git a/processor/state_docstring.go b/processor/state_docstring.go
@@ -0,0 +1,53 @@
+package processor
+
+import (
+	"fmt"
+)
+
+// StateDocString is the State used while inside a docstring.
+type StateDocString struct {
+	// End is the token that terminates the docstring.
+	End       []byte
+	// SkipEsc is set by the states that create a StateDocString; this type's
+	// Process method does not read it.
+	SkipEsc   bool
+}
+
+// String returns the fixed label "docstring" for this state.
+func (state *StateDocString) String() string {
+	return "docstring"
+}
+
+// Process scans job.Content from index while inside a docstring.
+//
+// A newline, or running out of input, returns the position with lineType and
+// state unchanged. When state.End is matched at a byte that is not directly
+// preceded by a backslash, the bytes after the end token decide the result:
+// reaching a newline first yields LINE_COMMENT, reaching a non-whitespace byte
+// first yields LINE_CODE; both hand over to StateBlank at that byte. If neither
+// is found up to job.EndPoint, scanning of the docstring continues.
+func (state *StateDocString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
+	var pos int
+	for pos = index; pos < job.EndPoint; pos++ {
+		cur := job.Content[pos]
+		if cur == '\n' {
+			return pos, lineType, state
+		}
+
+		// An end token directly after a backslash does not close the docstring.
+		if job.Content[pos-1] == '\\' {
+			continue
+		}
+		if !checkForMatchSingle(cur, pos, job.EndPoint, state.End, job) {
+			continue
+		}
+
+		// The docstring has ended; start looking just past the end token to
+		// see whether only whitespace remains before the next newline.
+		for tail := pos + len(state.End); tail <= job.EndPoint; tail++ {
+			next := job.Content[tail]
+
+			if next == '\n' {
+				if Debug {
+					printDebug("Found newline so docstring is comment")
+				}
+				return tail, LINE_COMMENT, &StateBlank{}
+			}
+
+			if !isWhitespace(next) {
+				if Debug {
+					printDebug(fmt.Sprintf("Found something not whitespace so is code: %s", string(next)))
+				}
+				return tail, LINE_CODE, &StateBlank{}
+			}
+		}
+	}
+
+	return pos, lineType, state
+}
+
+// Reset returns LINE_COMMENT together with the receiver itself, so the
+// docstring state is kept.
+// NOTE(review): presumably called by the driver loop at a line boundary — confirm.
+func (state *StateDocString) Reset() (LineType, State) {
+	return LINE_COMMENT, state
+}
diff --git a/processor/state_string.go b/processor/state_string.go
@@ -0,0 +1,37 @@
+package processor
+
+import "fmt"
+
+// StateString is the State used while inside a string literal.
+type StateString struct {
+	// End is the token that terminates the string.
+	End     []byte
+	// SkipEsc, when true, makes Process ignore a preceding backslash when
+	// looking for End.
+	SkipEsc bool
+}
+
+// String returns a label of the form "string[end=...,skipesc=...]" built from
+// the End and SkipEsc fields.
+func (state *StateString) String() string {
+	return fmt.Sprintf("string[end=%s,skipesc=%v]", state.End, state.SkipEsc)
+}
+
+// Process scans job.Content from index while inside a string literal and always
+// reports LINE_CODE.
+//
+// A newline, or running out of input, returns the position with the same state
+// so scanning resumes inside the string on the next call. When state.End
+// matches — and, unless state.SkipEsc is set, the previous byte is not a
+// backslash — it returns the position with a fresh StateCode.
+func (state *StateString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
+	var pos int
+	for pos = index; pos < job.EndPoint; pos++ {
+		cur := job.Content[pos]
+
+		// Stop at the newline but stay in this state, so the line is counted
+		// and the caller comes back here for the rest of the string.
+		if cur == '\n' {
+			return pos, LINE_CODE, state
+		}
+
+		// Outside literal strings, a byte right after a backslash cannot
+		// terminate the string.
+		if !state.SkipEsc && job.Content[pos-1] == '\\' {
+			continue
+		}
+
+		if checkForMatchSingle(cur, pos, job.EndPoint, state.End, job) {
+			return pos, LINE_CODE, &StateCode{}
+		}
+	}
+
+	return pos, LINE_CODE, state
+}
+
+// Reset returns LINE_CODE together with the receiver itself, so the string
+// state is kept.
+// NOTE(review): presumably called by the driver loop at a line boundary — confirm.
+func (state *StateString) Reset() (LineType, State) {
+	return LINE_CODE, state
+}
diff --git a/processor/states.go b/processor/states.go
@@ -0,0 +1,6 @@
+package processor
+
+// State is one state of the line-counting state machine. The implementations
+// in this package are StateBlank, StateCode, StateCommentSingle,
+// StateCommentMulti, StateDocString and StateString.
+type State interface {
+	// Process receives the file job, the language features, the current index
+	// into the job's content and the current line type. It returns the index
+	// it stopped at, the resulting line type, and the state to use next.
+	Process(*FileJob, *LanguageFeature, int, LineType) (int, LineType, State)
+	// Reset returns a line type and a state.
+	// NOTE(review): presumably the values to start the next line with — confirm
+	// against the driver loop.
+	Reset() (LineType, State)
+}
diff --git a/processor/structs.go b/processor/structs.go
@@ -5,6 +5,7 @@ package processor
 import (
 	"bytes"
 	"sync"
+	"hash"
 )
 
 // Used by trie structure to store the types
@@ -76,11 +77,12 @@ type FileJob struct {
 	Blank              int64
 	Complexity         int64
 	WeightedComplexity float64
-	Hash               []byte
+	Hash               hash.Hash
 	Callback           FileJobCallback
 	Binary             bool
 	Minified           bool
 	Generated          bool
+	EndPoint           int
 }
 
 // LanguageSummary is used to hold summarised results for a single language
diff --git a/processor/workers.go b/processor/workers.go

Original file line number	Diff line number	Diff line change
`@@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {`
`259`	`259`	`Extension: extension,`
`260`	`260`	`PossibleLanguages: language,`
`261`	`261`	`Bytes: fileInfo.Size(),`
	`262`	`+ EndPoint: int(fileInfo.Size() - 1),`
`262`	`263`	`}`
`263`	`264`	`} else if Verbose {`
`264`	`265`	`printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))`