Skip to content
This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit a92e15e

Browse files
committed
Add BenchmarkCacheFalseSharing
It seems as though the existing benchmarks cannot be used to measure the effect of false sharing, so I added a new benchmark.
1 parent e7745aa commit a92e15e

File tree

2 files changed

+67
-1
lines changed

2 files changed

+67
-1
lines changed

pkg/clockcache/cache.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ type element struct {
4444
used uint32
4545
size uint64
4646

47-
// pad Elements out to be cache aligned
47+
// pad Elements out to be cache-aligned, see BenchmarkCacheFalseSharing
4848
_ [16]byte
4949
}
5050

pkg/clockcache/cache_bench_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package clockcache
33
import (
44
"fmt"
55
"math/rand"
6+
"sync"
67
"testing"
78
)
89

@@ -236,6 +237,71 @@ func BenchmarkInsertConcurrent(b *testing.B) {
236237
}
237238
}
238239

240+
// BenchmarkCacheFalseSharing is a benchmark to measure the effect of the false
241+
// sharing of CPU cache lines. In the clockcache.element struct, we introduce
242+
// padding to ensure that only one clockcache.element fits in a CPU cache line,
243+
// avoiding false sharing.
244+
//
245+
// The principle behind this benchmark is simple: construct a cache with two
246+
// entries, and start two goroutines which each clobber one of the cache values
247+
// over and over again. If there is false sharing, it should be measurable by
248+
// toggling the padding on and off, and measuring the difference in output of
249+
// this benchmark.
250+
//
251+
// At the time of writing, this code was tested on an M1 MacBook Pro, where the
252+
// advantage obtained by introducing padding is approximately 16%:
253+
// go test -bench=BenchmarkCacheFalseSharing -cpu=2 -count=10 > no-padding.txt
254+
// go test -bench=BenchmarkCacheFalseSharing -cpu=2 -count=10 > padding.txt
255+
// benchstat no-padding.txt padding.txt
256+
// name old time/op new time/op delta
257+
// CacheFalseSharing-2 230ns ± 6% 193ns ±19% -16.09% (p=0.001 n=10+9)
258+
//
259+
// Note: This benchmark _must_ be run with the `-cpu=2` argument, to ensure
260+
// that each goroutine ends up on a different CPU, possibly causing contention
261+
// for the same cache line.
262+
func BenchmarkCacheFalseSharing(b *testing.B) {
263+
cache := WithMax(2)
264+
b.ReportAllocs()
265+
266+
// define waitgroup so that we can coordinate the start of the stressors
267+
startWg := &sync.WaitGroup{}
268+
startWg.Add(2)
269+
270+
// define waitgroup, so we can wait until concurrent stressors are finished
271+
endWg := &sync.WaitGroup{}
272+
endWg.Add(2)
273+
274+
key1 := 0
275+
key2 := 1
276+
times := b.N
277+
278+
// stressor is a function to be run in a goroutine which continually writes
279+
// and reads to/from a specific key in the cache
280+
stressor := func(key, count int) {
281+
var val interface{}
282+
283+
// Coordinate the start of the two stressors
284+
startWg.Done()
285+
startWg.Wait()
286+
287+
// Reset the timer immediately before doing the real work
288+
b.ResetTimer()
289+
for i := 0; i < count; i++ {
290+
cache.Insert(key, i, 16)
291+
val, _ = cache.Get(key)
292+
}
293+
294+
bval = val
295+
endWg.Done()
296+
}
297+
298+
// run two contending goroutines
299+
go stressor(key1, times)
300+
go stressor(key2, times)
301+
302+
// wait for tasks to complete
303+
endWg.Wait()
304+
}
239305
func BenchmarkMemoryEmptyCache(b *testing.B) {
240306
b.ReportAllocs()
241307
WithMax(uint64(b.N))

0 commit comments

Comments
 (0)