Add BenchmarkCacheFalseSharing

JamesGuthrie · JamesGuthrie · commit a92e15e07702 · 2022-07-22T12:13:35.000+02:00
It seems as though the existing benchmarks cannot be used to measure the
effect of false sharing, so I added a new benchmark.
diff --git a/pkg/clockcache/cache.go b/pkg/clockcache/cache.go
@@ -44,7 +44,7 @@ type element struct {
 	used uint32
 	size uint64
 
-	// pad Elements out to be cache aligned
+	// pad Elements out to be cache-aligned, see BenchmarkCacheFalseSharing
 	_ [16]byte
 }
 
diff --git a/pkg/clockcache/cache_bench_test.go b/pkg/clockcache/cache_bench_test.go
@@ -3,6 +3,7 @@ package clockcache
 import (
 	"fmt"
 	"math/rand"
+	"sync"
 	"testing"
 )
 
@@ -236,6 +237,71 @@ func BenchmarkInsertConcurrent(b *testing.B) {
 	}
 }
 
+// BenchmarkCacheFalseSharing measures the effect of false sharing of CPU
+// cache lines. In the clockcache.element struct, we introduce padding so
+// that only one clockcache.element fits in a CPU cache line, avoiding false
+// sharing.
+//
+// The principle behind this benchmark is simple: construct a cache with two
+// entries, and start two goroutines which each clobber one of the cache values
+// over and over again. If there is false sharing, it should be measurable by
+// toggling the padding on and off, and comparing the output of this benchmark
+// between the two builds.
+//
+// At the time of writing, this was tested on an M1 MacBook Pro, where padding
+// gave an advantage of roughly 16% (first run without padding, second with):
+//     go test -bench=BenchmarkCacheFalseSharing -cpu=2 -count=10 > no-padding.txt
+//     go test -bench=BenchmarkCacheFalseSharing -cpu=2 -count=10 > padding.txt
+//     benchstat no-padding.txt padding.txt
+//       name                 old time/op    new time/op    delta
+//       CacheFalseSharing-2     230ns ± 6%     193ns ±19%  -16.09%  (p=0.001 n=10+9)
+//
+// Note: This benchmark _must_ be run with the `-cpu=2` argument so that the
+// two goroutines can run on different CPUs at the same time, which is what
+// makes contention for the same cache line possible.
+func BenchmarkCacheFalseSharing(b *testing.B) {
+	cache := WithMax(2)
+	b.ReportAllocs()
+
+	// startWg acts as a start barrier: both stressors check in before either loops
+	startWg := &sync.WaitGroup{}
+	startWg.Add(2)
+
+	// endWg lets the benchmark goroutine block until both stressors have finished
+	endWg := &sync.WaitGroup{}
+	endWg.Add(2)
+
+	key1 := 0
+	key2 := 1
+	times := b.N // each stressor runs b.N insert+get iterations on its own key
+
+	// stressor is run in its own goroutine; it inserts into and then reads back
+	// a single cache key, count times in a row
+	stressor := func(key, count int) {
+		var val interface{}
+
+		// Signal readiness, then block until the other stressor is ready too
+		startWg.Done()
+		startWg.Wait()
+
+		// Keep the setup and the barrier wait out of the measured time
+		b.ResetTimer() // NOTE(review): called by both stressors, possibly concurrently - confirm this is safe
+		for i := 0; i < count; i++ {
+			cache.Insert(key, i, 16)
+			val, _ = cache.Get(key)
+		}
+
+		bval = val // bval is declared outside this hunk, presumably a result sink; NOTE(review): both goroutines write it
+		endWg.Done()
+	}
+
+	// run one stressor per key so the two goroutines touch different entries
+	go stressor(key1, times)
+	go stressor(key2, times)
+
+	// block until both stressors have called endWg.Done
+	endWg.Wait()
+}
 func BenchmarkMemoryEmptyCache(b *testing.B) {
 	b.ReportAllocs()
 	WithMax(uint64(b.N))

Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ type element struct {`
`44`	`44`	`used uint32`
`45`	`45`	`size uint64`
`46`	`46`
`47`		`- // pad Elements out to be cache aligned`
	`47`	`+ // pad Elements out to be cache-aligned, see BenchmarkCacheFalseSharing`
`48`	`48`	`_ [16]byte`
`49`	`49`	`}`
`50`	`50`