Skip to content

Commit b24b909

Browse files
committed
Clean up new bloom filter implementation
1 parent 95d766e commit b24b909

File tree

2 files changed

+25
-12
lines changed

2 files changed

+25
-12
lines changed

processor/bloom.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package processor
22

3-
// Prime number less than 256
4-
const BloomPrime = 251
3+
import "math/rand"
54

65
var BloomTable [256]uint64
76

@@ -12,13 +11,27 @@ func init() {
1211
}
1312

1413
func BloomHash(b byte) uint64 {
15-
i := uint64(b)
14+
// Since our input is based on ASCII characters (and majority lower case
15+
// characters) the values are not well distributed through the 0-255 byte
16+
// range. math/rand gives us a way to generate a value with more well
17+
// distributed randomness.
18+
k := rand.New(rand.NewSource(int64(b))).Uint64()
1619

17-
k := (i^BloomPrime) * i
20+
// Mask to slice out a 0-63 value
21+
var mask64 uint64 = 0b00111111
1822

19-
k1 := k & 0x3f
20-
k2 := k >> 1 & 0x3f
21-
k3 := k >> 2 & 0x3f
23+
// For a bloom filter we only want a few bits set, but distributed
24+
// through the 64 bit space.
25+
// The logic here is to slice a value between 0 and 63 from k, and set a
26+
// single bit in the output hash based on that.
27+
// Setting three bits this way seems to give the best results. Fewer bits
28+
// makes the hash not unique enough, more leads to overcrowding the bloom
29+
// filter.
30+
var hash uint64
31+
for i := uint64(0); i < 3; i++ {
32+
n := k >> (i*8) & mask64
33+
hash |= 1 << n
34+
}
2235

23-
return (1 << k1) | (1 << k2) | (1 << k3)
36+
return hash
2437
}

processor/processor.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ func processLanguageFeature(name string, value Language) {
357357
var processMask uint64
358358

359359
for _, v := range value.ComplexityChecks {
360-
complexityMask |= BloomHash(v[0])
360+
complexityMask |= BloomTable[v[0]]
361361
complexityTrie.Insert(TComplexity, []byte(v))
362362
if !Complexity {
363363
tokenTrie.Insert(TComplexity, []byte(v))
@@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
368368
}
369369

370370
for _, v := range value.LineComment {
371-
singleLineCommentMask |= BloomHash(v[0])
371+
singleLineCommentMask |= BloomTable[v[0]]
372372
slCommentTrie.Insert(TSlcomment, []byte(v))
373373
tokenTrie.Insert(TSlcomment, []byte(v))
374374
}
375375
processMask |= singleLineCommentMask
376376

377377
for _, v := range value.MultiLine {
378-
multiLineCommentMask |= BloomHash(v[0][0])
378+
multiLineCommentMask |= BloomTable[v[0][0]]
379379
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
380380
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
381381
}
382382
processMask |= multiLineCommentMask
383383

384384
for _, v := range value.Quotes {
385-
stringMask |= BloomHash(v.Start[0])
385+
stringMask |= BloomTable[v.Start[0]]
386386
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
387387
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
388388
}

0 commit comments

Comments
 (0)