Skip to content

Commit a663333

Browse files
authored
Merge pull request #1 from boyter/master
merge updated code
2 parents c916c20 + e8a6442 commit a663333

15 files changed

+486
-363
lines changed

examples/issue246.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Docstrings containing an apostrophe (') are handled incorrectly
5+
The line above is counted as code despite being in the middle of a docstring.
6+
The end of docstring flag seems to be changed to an apostrophe,
7+
which means the next line will not exit the docstring.
8+
"""
9+
# Code containing single quotes will exit the docstring,
10+
# but presuming the quotes are balanced the second
11+
# quote will put us in string scanning mode.
12+
if __name__ == '__main__':
13+
print('Hello, World!')
14+
# Not counted as a comment
15+
16+
# ^ Not counted as a blank line
17+
# Break out of string scanner with unbalanced single quote: '
18+
exit(0)

languages.json

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6645,10 +6645,20 @@
66456645
"== "
66466646
],
66476647
"extensions": [
6648-
"vim"
6648+
"vim",
6649+
"vimrc",
6650+
"gvimrc"
6651+
],
6652+
"filenames": [
6653+
"_vimrc",
6654+
".vimrc",
6655+
"_gvimrc",
6656+
".gvimrc",
6657+
"vimrc",
6658+
"gvimrc"
66496659
],
66506660
"line_comment": [
6651-
"\\\""
6661+
"\"", "#"
66526662
],
66536663
"multi_line": [],
66546664
"quotes": [

processor/bloom.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package processor
2+
3+
import "math/rand"
4+
5+
var BloomTable [256]uint64
6+
7+
func init() {
8+
for i := range BloomTable {
9+
BloomTable[i] = BloomHash(byte(i))
10+
}
11+
}
12+
13+
// BloomHash derives a 64-bit bloom-filter signature for a single byte. The
// result has at most three bits set (fewer when the selected positions
// collide).
//
// The input bytes are mostly ASCII (and mostly lower case), so the raw values
// are poorly spread over the 0-255 range. Seeding math/rand with the byte and
// drawing one value gives a better distributed 64-bit number to pick bit
// positions from.
func BloomHash(b byte) uint64 {
	const (
		// Three bits seems to give the best results: fewer makes the hash
		// not unique enough, more overcrowds the bloom filter.
		bitsPerHash = 3
		// Keeps the low six bits, i.e. a bit position between 0 and 63.
		positionMask = 0x3f
	)

	source := rand.NewSource(int64(b))
	k := rand.New(source).Uint64()

	// Take one 0-63 position out of each of the lowest three bytes of k and
	// set the corresponding bit in the signature.
	var signature uint64
	for shift := uint(0); shift < bitsPerHash*8; shift += 8 {
		position := (k >> shift) & positionMask
		signature |= uint64(1) << position
	}

	return signature
}

processor/constants.go

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

processor/file.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
259259
Extension: extension,
260260
PossibleLanguages: language,
261261
Bytes: fileInfo.Size(),
262+
EndPoint: int(fileInfo.Size() - 1),
262263
}
263264
} else if Verbose {
264265
printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))

processor/processor.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -350,14 +350,14 @@ func processLanguageFeature(name string, value Language) {
350350
stringTrie := &Trie{}
351351
tokenTrie := &Trie{}
352352

353-
complexityMask := byte(0)
354-
singleLineCommentMask := byte(0)
355-
multiLineCommentMask := byte(0)
356-
stringMask := byte(0)
357-
processMask := byte(0)
353+
var complexityMask uint64
354+
var singleLineCommentMask uint64
355+
var multiLineCommentMask uint64
356+
var stringMask uint64
357+
var processMask uint64
358358

359359
for _, v := range value.ComplexityChecks {
360-
complexityMask |= v[0]
360+
complexityMask |= BloomTable[v[0]]
361361
complexityTrie.Insert(TComplexity, []byte(v))
362362
if !Complexity {
363363
tokenTrie.Insert(TComplexity, []byte(v))
@@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
368368
}
369369

370370
for _, v := range value.LineComment {
371-
singleLineCommentMask |= v[0]
371+
singleLineCommentMask |= BloomTable[v[0]]
372372
slCommentTrie.Insert(TSlcomment, []byte(v))
373373
tokenTrie.Insert(TSlcomment, []byte(v))
374374
}
375375
processMask |= singleLineCommentMask
376376

377377
for _, v := range value.MultiLine {
378-
multiLineCommentMask |= v[0][0]
378+
multiLineCommentMask |= BloomTable[v[0][0]]
379379
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
380380
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
381381
}
382382
processMask |= multiLineCommentMask
383383

384384
for _, v := range value.Quotes {
385-
stringMask |= v.Start[0]
385+
stringMask |= BloomTable[v.Start[0]]
386386
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
387387
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
388388
}

processor/state_blank.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package processor
2+
3+
// StateBlank is the scanner state used while nothing on the current line has
// been classified yet. It carries no data.
type StateBlank struct{}

// String returns the human-readable name of this state.
func (state *StateBlank) String() string {
	const label = "blank"
	return label
}
8+
9+
func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
10+
switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType {
11+
case TMlcomment:
12+
commentType := lineType
13+
if commentType == LINE_BLANK {
14+
commentType = LINE_COMMENT
15+
}
16+
17+
index += offsetJump - 1
18+
return index, commentType, NewStateCommentMulti(endString)
19+
20+
case TSlcomment:
21+
commentType := lineType
22+
if commentType == LINE_BLANK {
23+
commentType = LINE_COMMENT
24+
}
25+
return index, commentType, &StateCommentSingle{}
26+
27+
case TString:
28+
index, docString, skipEsc := verifyIgnoreEscape(lang, job, index)
29+
30+
if docString {
31+
commentType := lineType
32+
if commentType == LINE_BLANK {
33+
commentType = LINE_COMMENT
34+
}
35+
36+
return index, commentType, &StateDocString{
37+
End: endString,
38+
SkipEsc: skipEsc,
39+
}
40+
}
41+
42+
return index, LINE_CODE, &StateString{
43+
End: endString,
44+
SkipEsc: skipEsc,
45+
}
46+
47+
case TComplexity:
48+
if index == 0 || isWhitespace(job.Content[index-1]) {
49+
job.Complexity++
50+
}
51+
return index, LINE_BLANK, state
52+
53+
default:
54+
return index, LINE_CODE, &StateCode{}
55+
}
56+
}
57+
58+
func (state *StateBlank) Reset() (LineType, State) {
59+
return LINE_BLANK, state
60+
}

processor/state_code.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package processor
2+
3+
// StateCode is the scanner state used while the current line is being read
// as code. It carries no data.
type StateCode struct{}

// String returns the human-readable name of this state.
func (state *StateCode) String() string {
	const label = "code"
	return label
}
8+
9+
func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
10+
// Hacky fix to https://github.com/boyter/scc/issues/181
11+
endPoint := job.EndPoint
12+
if endPoint > len(job.Content) {
13+
endPoint--
14+
}
15+
16+
var i int
17+
for i = index; i < endPoint; i++ {
18+
curByte := job.Content[i]
19+
20+
if curByte == '\n' {
21+
return i, LINE_CODE, state
22+
}
23+
24+
if isBinary(i, curByte) {
25+
job.Binary = true
26+
return i, LINE_CODE, state
27+
}
28+
29+
if shouldProcess(curByte, lang.ProcessMask) {
30+
if Duplicates {
31+
// Technically this is wrong because we skip bytes so this is not a true
32+
// hash of the file contents, but for duplicate files it shouldn't matter
33+
// as both will skip the same way
34+
digestible := []byte{job.Content[index]}
35+
job.Hash.Write(digestible)
36+
}
37+
38+
switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
39+
case TString:
40+
// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string
41+
42+
// It is safe to -1 here as to enter the code state we need to have
43+
// transitioned from blank to here hence i should always be >= 1
44+
// This check is to ensure we aren't in a character declaration
45+
// TODO this should use language features
46+
if job.Content[i-1] == '\\' {
47+
break // from switch, not from the loop
48+
}
49+
50+
i, docString, skipEsc := verifyIgnoreEscape(lang, job, i)
51+
52+
if docString {
53+
commentType := lineType
54+
if commentType == LINE_BLANK {
55+
commentType = LINE_COMMENT
56+
}
57+
58+
return i, commentType, &StateDocString{
59+
End: endString,
60+
SkipEsc: skipEsc,
61+
}
62+
}
63+
64+
// i += offsetJump - 1
65+
return i, LINE_CODE, &StateString{
66+
End: endString,
67+
SkipEsc: skipEsc,
68+
}
69+
70+
case TSlcomment:
71+
i += offsetJump - 1
72+
return i, LINE_CODE, &StateCommentSingle{}
73+
74+
case TMlcomment:
75+
i += offsetJump - 1
76+
77+
return i, LINE_CODE, NewStateCommentMulti(endString)
78+
79+
case TComplexity:
80+
if i == 0 || isWhitespace(job.Content[i-1]) {
81+
job.Complexity++
82+
}
83+
}
84+
}
85+
}
86+
87+
return i, LINE_CODE, state
88+
}
89+
90+
func (state *StateCode) Reset() (LineType, State) {
91+
return LINE_BLANK, &StateBlank{}
92+
}

processor/state_comment_multi.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
package processor
2+
3+
// StateCommentMulti is the scanner state used inside a multi-line comment.
type StateCommentMulti struct {
	// Stack holds the closing tokens of the comments currently open; the
	// last element belongs to the innermost one.
	Stack [][]byte
}

// String returns the human-readable name of this state.
func (state *StateCommentMulti) String() string {
	const label = "multiline-comment"
	return label
}

// NewStateCommentMulti returns a multi-line comment state whose stack
// contains only the given closing token.
func NewStateCommentMulti(token []byte) *StateCommentMulti {
	created := new(StateCommentMulti)
	created.Stack = [][]byte{token}
	return created
}
16+
17+
func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
18+
var i int
19+
for i = index; i < job.EndPoint; i++ {
20+
curByte := job.Content[i]
21+
22+
if curByte == '\n' {
23+
break
24+
}
25+
26+
endToken := state.peek()
27+
if checkForMatchSingle(curByte, i, job.EndPoint, endToken, job) {
28+
// set offset jump here
29+
i += len(endToken) - 1
30+
31+
if len(state.Stack) == 1 {
32+
return i, lineType, &StateBlank{}
33+
} else {
34+
state.pop()
35+
return i, lineType, state
36+
}
37+
}
38+
39+
// Check if we are entering another multiline comment
40+
// This should come below check for match single as it speeds up processing
41+
if lang.Nested {
42+
if ok, offsetJump, endString := lang.MultiLineComments.Match(job.Content[i:]); ok != 0 {
43+
i += offsetJump - 1
44+
state.push(endString)
45+
return i, lineType, state
46+
}
47+
}
48+
}
49+
50+
return i, lineType, state
51+
}
52+
53+
func (state *StateCommentMulti) Reset() (LineType, State) {
54+
return LINE_COMMENT, state
55+
}
56+
57+
func (state *StateCommentMulti) peek() []byte {
58+
i := len(state.Stack) - 1
59+
return state.Stack[i]
60+
}
61+
62+
func (state *StateCommentMulti) push(token []byte) {
63+
state.Stack = append(state.Stack, token)
64+
}
65+
66+
func (state *StateCommentMulti) pop() {
67+
i := len(state.Stack) - 1
68+
69+
state.Stack = state.Stack[:i]
70+
}

processor/state_comment_single.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package processor
2+
3+
// StateCommentSingle is the scanner state used inside a single-line comment.
// It carries no data.
type StateCommentSingle struct{}

// String returns the human-readable name of this state.
func (state *StateCommentSingle) String() string {
	const label = "comment"
	return label
}
8+
9+
func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
10+
var i int
11+
for i = index; i < job.EndPoint; i++ {
12+
curByte := job.Content[i]
13+
14+
if curByte == '\n' {
15+
break
16+
}
17+
}
18+
19+
return i, lineType, state
20+
}
21+
22+
func (state *StateCommentSingle) Reset() (LineType, State) {
23+
return LINE_BLANK, &StateBlank{}
24+
}

0 commit comments

Comments
 (0)