Skip to content

Commit 9770aba

Browse files
committed
Refactor CountStats state machine
1 parent d98709e commit 9770aba

File tree

11 files changed

+419
-344
lines changed

11 files changed

+419
-344
lines changed

examples/issue246.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Docstrings containing an apostrophe (') are handled incorrectly
5+
The line above is counted as code despite being in the middle of a docstring.
6+
The end of docstring flag seems to be changed to an apostrophe,
7+
which means the next line will not exit the docstring.
8+
"""
9+
# Code containing single quotes will exit the docstring,
10+
# but presuming the quotes are balanced the second
11+
# quote will put us in string scanning mode.
12+
if __name__ == '__main__':
13+
print('Hello, World!')
14+
# Not counted as a comment
15+
16+
# ^ Not counted as a blank line
17+
# Break out of string scanner with unbalanced single quote: '
18+
exit(0)

processor/file.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
259259
Extension: extension,
260260
PossibleLanguages: language,
261261
Bytes: fileInfo.Size(),
262+
EndPoint: int(fileInfo.Size() - 1),
262263
}
263264
} else if Verbose {
264265
printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))

processor/state_blank.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package processor
2+
3+
type StateBlank struct {}
4+
5+
func (state *StateBlank) String() string {
6+
return "blank"
7+
}
8+
9+
func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
10+
switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType {
11+
case TMlcomment:
12+
commentType := lineType
13+
if commentType == LINE_BLANK {
14+
commentType = LINE_COMMENT
15+
}
16+
17+
index += offsetJump - 1
18+
return index, commentType, NewStateCommentMulti(endString)
19+
20+
case TSlcomment:
21+
commentType := lineType
22+
if commentType == LINE_BLANK {
23+
commentType = LINE_COMMENT
24+
}
25+
return index, commentType, &StateCommentSingle{}
26+
27+
case TString:
28+
index, docString, skipEsc := verifyIgnoreEscape(lang, job, index)
29+
30+
if docString {
31+
commentType := lineType
32+
if commentType == LINE_BLANK {
33+
commentType = LINE_COMMENT
34+
}
35+
36+
return index, commentType, &StateDocString{
37+
End: endString,
38+
SkipEsc: skipEsc,
39+
}
40+
}
41+
42+
return index, LINE_CODE, &StateString{
43+
End: endString,
44+
SkipEsc: skipEsc,
45+
}
46+
47+
case TComplexity:
48+
if index == 0 || isWhitespace(job.Content[index-1]) {
49+
job.Complexity++
50+
}
51+
return index, LINE_BLANK, state
52+
53+
default:
54+
return index, LINE_CODE, &StateCode{}
55+
}
56+
}
57+
58+
func (state *StateBlank) Reset() (LineType, State) {
59+
return LINE_BLANK, state
60+
}

processor/state_code.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package processor
2+
3+
type StateCode struct {}
4+
5+
func (state *StateCode) String() string {
6+
return "code"
7+
}
8+
9+
func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
10+
// Hacky fix to https://github.com/boyter/scc/issues/181
11+
endPoint := job.EndPoint
12+
if endPoint > len(job.Content) {
13+
endPoint--
14+
}
15+
16+
var i int
17+
for i = index; i < endPoint; i++ {
18+
curByte := job.Content[i]
19+
20+
if curByte == '\n' {
21+
return i, LINE_CODE, state
22+
}
23+
24+
if isBinary(i, curByte) {
25+
job.Binary = true
26+
return i, LINE_CODE, state
27+
}
28+
29+
if shouldProcess(curByte, lang.ProcessMask) {
30+
if Duplicates {
31+
// Technically this is wrong because we skip bytes so this is not a true
32+
// hash of the file contents, but for duplicate files it shouldn't matter
33+
// as both will skip the same way
34+
digestible := []byte{job.Content[index]}
35+
job.Hash.Write(digestible)
36+
}
37+
38+
switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
39+
case TString:
40+
// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string
41+
42+
// It is safe to -1 here as to enter the code state we need to have
43+
// transitioned from blank to here hence i should always be >= 1
44+
// This check is to ensure we aren't in a character declaration
45+
// TODO this should use language features
46+
if job.Content[i-1] == '\\' {
47+
break // from switch, not from the loop
48+
}
49+
50+
i, docString, skipEsc := verifyIgnoreEscape(lang, job, i)
51+
52+
if docString {
53+
commentType := lineType
54+
if commentType == LINE_BLANK {
55+
commentType = LINE_COMMENT
56+
}
57+
58+
return i, commentType, &StateDocString{
59+
End: endString,
60+
SkipEsc: skipEsc,
61+
}
62+
}
63+
64+
// i += offsetJump - 1
65+
return i, LINE_CODE, &StateString{
66+
End: endString,
67+
SkipEsc: skipEsc,
68+
}
69+
70+
case TSlcomment:
71+
i += offsetJump - 1
72+
return i, LINE_CODE, &StateCommentSingle{}
73+
74+
case TMlcomment:
75+
i += offsetJump - 1
76+
77+
return i, LINE_CODE, NewStateCommentMulti(endString)
78+
79+
case TComplexity:
80+
if i == 0 || isWhitespace(job.Content[i-1]) {
81+
job.Complexity++
82+
}
83+
}
84+
}
85+
}
86+
87+
return i, LINE_CODE, state
88+
}
89+
90+
func (state *StateCode) Reset() (LineType, State) {
91+
return LINE_BLANK, &StateBlank{}
92+
}

processor/state_comment_multi.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
package processor
2+
3+
type StateCommentMulti struct {
4+
Stack [][]byte
5+
}
6+
7+
func (state *StateCommentMulti) String() string {
8+
return "multiline-comment"
9+
}
10+
11+
func NewStateCommentMulti(token []byte) *StateCommentMulti {
12+
return &StateCommentMulti{
13+
Stack: [][]byte{token},
14+
}
15+
}
16+
17+
func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
18+
var i int
19+
for i = index; i < job.EndPoint; i++ {
20+
curByte := job.Content[i]
21+
22+
if curByte == '\n' {
23+
break
24+
}
25+
26+
endToken := state.peek()
27+
if checkForMatchSingle(curByte, i, job.EndPoint, endToken, job) {
28+
// set offset jump here
29+
i += len(endToken) - 1
30+
31+
if len(state.Stack) == 1 {
32+
return i, lineType, &StateBlank{}
33+
} else {
34+
state.pop()
35+
return i, lineType, state
36+
}
37+
}
38+
39+
// Check if we are entering another multiline comment
40+
// This should come below check for match single as it speeds up processing
41+
if lang.Nested {
42+
if ok, offsetJump, endString := lang.MultiLineComments.Match(job.Content[i:]); ok != 0 {
43+
i += offsetJump - 1
44+
state.push(endString)
45+
return i, lineType, state
46+
}
47+
}
48+
}
49+
50+
return i, lineType, state
51+
}
52+
53+
func (state *StateCommentMulti) Reset() (LineType, State) {
54+
return LINE_COMMENT, state
55+
}
56+
57+
func (state *StateCommentMulti) peek() []byte {
58+
i := len(state.Stack) - 1
59+
return state.Stack[i]
60+
}
61+
62+
func (state *StateCommentMulti) push(token []byte) {
63+
state.Stack = append(state.Stack, token)
64+
}
65+
66+
func (state *StateCommentMulti) pop() {
67+
i := len(state.Stack) - 1
68+
69+
state.Stack = state.Stack[:i]
70+
}

processor/state_comment_single.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package processor
2+
3+
type StateCommentSingle struct {}
4+
5+
func (state *StateCommentSingle) String() string {
6+
return "comment"
7+
}
8+
9+
func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
10+
var i int
11+
for i = index; i < job.EndPoint; i++ {
12+
curByte := job.Content[i]
13+
14+
if curByte == '\n' {
15+
break
16+
}
17+
}
18+
19+
return i, lineType, state
20+
}
21+
22+
func (state *StateCommentSingle) Reset() (LineType, State) {
23+
return LINE_BLANK, &StateBlank{}
24+
}

processor/state_docstring.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package processor
2+
3+
import (
4+
"fmt"
5+
)
6+
7+
type StateDocString struct {
8+
End []byte
9+
SkipEsc bool
10+
}
11+
12+
func (state *StateDocString) String() string {
13+
return "docstring"
14+
}
15+
16+
func (state *StateDocString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
17+
var i int
18+
for i = index; i < job.EndPoint; i++ {
19+
if job.Content[i] == '\n' {
20+
return i, lineType, state
21+
}
22+
23+
if job.Content[i-1] != '\\' {
24+
if checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) {
25+
// So we have hit end of docstring at this point in which case check if only whitespace characters till the next
26+
// newline and if so we change to a comment otherwise to code
27+
// need to start the loop after ending definition of docstring, therefore adding the length of the string to
28+
// the index
29+
for j := i + len(state.End); j <= job.EndPoint; j++ {
30+
if job.Content[j] == '\n' {
31+
if Debug {
32+
printDebug("Found newline so docstring is comment")
33+
}
34+
return j, LINE_COMMENT, &StateBlank{}
35+
}
36+
37+
if !isWhitespace(job.Content[j]) {
38+
if Debug {
39+
printDebug(fmt.Sprintf("Found something not whitespace so is code: %s", string(job.Content[j])))
40+
}
41+
return j, LINE_CODE, &StateBlank{}
42+
}
43+
}
44+
}
45+
}
46+
}
47+
48+
return i, lineType, state
49+
}
50+
51+
func (state *StateDocString) Reset() (LineType, State) {
52+
return LINE_COMMENT, state
53+
}

processor/state_string.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package processor
2+
3+
import "fmt"
4+
5+
type StateString struct {
6+
End []byte
7+
SkipEsc bool
8+
}
9+
10+
func (state *StateString) String() string {
11+
return fmt.Sprintf("string[end=%s,skipesc=%v]", state.End, state.SkipEsc)
12+
}
13+
14+
func (state *StateString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
15+
var i int
16+
for i = index; i < job.EndPoint; i++ {
17+
// If we hit a newline, return because we want to count the stats but keep
18+
// the current state so we end up back in this loop when the outer
19+
// one calls again
20+
if job.Content[i] == '\n' {
21+
return i, LINE_CODE, state
22+
}
23+
24+
// If we are in a literal string we want to ignore the \ check OR we aren't checking for special ones
25+
if state.SkipEsc || job.Content[i-1] != '\\' {
26+
if checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) {
27+
return i, LINE_CODE, &StateCode{}
28+
}
29+
}
30+
}
31+
32+
return i, LINE_CODE, state
33+
}
34+
35+
func (state *StateString) Reset() (LineType, State) {
36+
return LINE_CODE, state
37+
}

processor/states.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package processor
2+
3+
type State interface {
4+
Process(*FileJob, *LanguageFeature, int, LineType) (int, LineType, State)
5+
Reset() (LineType, State)
6+
}

processor/structs.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package processor
55
import (
66
"bytes"
77
"sync"
8+
"hash"
89
)
910

1011
// Used by trie structure to store the types
@@ -76,11 +77,12 @@ type FileJob struct {
7677
Blank int64
7778
Complexity int64
7879
WeightedComplexity float64
79-
Hash []byte
80+
Hash hash.Hash
8081
Callback FileJobCallback
8182
Binary bool
8283
Minified bool
8384
Generated bool
85+
EndPoint int
8486
}
8587

8688
// LanguageSummary is used to hold summarised results for a single language

0 commit comments

Comments
 (0)