Skip to content

Commit c37bef4

Browse files
paulcacheuxlmb
authored andcommitted
ringbuf: fix corrupt samples on arm64
Reading the len field of the event header needs to be atomic to match the release memory barrier from https://github.com/torvalds/linux/blob/eb26cbb1a754ccde5d4d74527dad5ba051808fad/kernel/bpf/ringbuf.c#L487 We also don't care about the PgOff field which is only used by ringbuf internals. [ Additional context by Lorenz ] Based on a suggestion by Daniel Borkmann I wrote a litmus test against the Linux Kernel Memory Model: C ringbuf {} P0(int *hdr_len, int *prod_pos, int *data) // Producer { // __bpf_ringbuf_reserve *hdr_len = 1; // BUSY smp_store_release(prod_pos, 1); // store some data. *data = 1; // bpf_ringbuf_commit int r0 = xchg(hdr_len, 2); // COMMIT } P1(int *hdr_len, int *prod_pos, int *data) // Consumer { int p = smp_load_acquire(prod_pos); if (p > 0) { int l = *hdr_len; if (l == 2) { int d = *data; } } } exists (prod_pos=1 /\ 1:l=2 /\ 1:d=0) Note that hdr_len is read via a plain read. Executing the litmus test shows that we can indeed observe a COMMIT (l=2) before data is visible: $ herd7 -conf linux-kernel.cfg litmus-tests/ringbuf.litmus Test ringbuf Allowed States 4 1:d=0; 1:l=0; [prod_pos]=1; 1:d=0; 1:l=1; [prod_pos]=1; 1:d=0; 1:l=2; [prod_pos]=1; 1:d=1; 1:l=2; [prod_pos]=1; Ok Witnesses Positive: 1 Negative: 3 Flag data-race Condition exists ([prod_pos]=1 /\ 1:l=2 /\ 1:d=0) Observation ringbuf Sometimes 1 3 Time ringbuf 0.01 Hash=7aed947102aa203443a5b9b5884f956f This is fixed by using an smp_load_acquire or equivalent to read hdr_len. Signed-off-by: Paul Cacheux <[email protected]> Co-developed-by: Lorenz Bauer <[email protected]>
1 parent ebfeccc commit c37bef4

File tree

2 files changed

+26
-15
lines changed

2 files changed

+26
-15
lines changed

ringbuf/reader.go

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ var (
2323

2424
// ringbufHeader from 'struct bpf_ringbuf_hdr' in kernel/bpf/ringbuf.c
2525
type ringbufHeader struct {
26-
Len uint32
27-
PgOff uint32
26+
Len uint32
27+
_ uint32 // pg_off, only used by kernel internals
2828
}
2929

3030
func (rh *ringbufHeader) isBusy() bool {
@@ -47,23 +47,16 @@ type Record struct {
4747
}
4848

4949
// Read a record from an event ring.
50-
//
51-
// buf must be at least BPF_RINGBUF_HDR_SZ bytes long.
52-
func readRecord(rd *ringbufEventRing, rec *Record, buf []byte) error {
50+
func readRecord(rd *ringbufEventRing, rec *Record) error {
5351
rd.loadConsumer()
5452

55-
buf = buf[:unix.BPF_RINGBUF_HDR_SZ]
56-
if _, err := io.ReadFull(rd, buf); err == io.EOF {
53+
header, err := rd.readHeader()
54+
if err == io.EOF {
5755
return errEOR
5856
} else if err != nil {
5957
return fmt.Errorf("read event header: %w", err)
6058
}
6159

62-
header := ringbufHeader{
63-
internal.NativeEndian.Uint32(buf[0:4]),
64-
internal.NativeEndian.Uint32(buf[4:8]),
65-
}
66-
6760
if header.isBusy() {
6861
// the next sample in the ring is not committed yet so we
6962
// exit without storing the reader/consumer position
@@ -111,7 +104,6 @@ type Reader struct {
111104
mu sync.Mutex
112105
ring *ringbufEventRing
113106
epollEvents []unix.EpollEvent
114-
header []byte
115107
haveData bool
116108
deadline time.Time
117109
bufferSize int
@@ -148,7 +140,6 @@ func NewReader(ringbufMap *ebpf.Map) (*Reader, error) {
148140
poller: poller,
149141
ring: ring,
150142
epollEvents: make([]unix.EpollEvent, 1),
151-
header: make([]byte, unix.BPF_RINGBUF_HDR_SZ),
152143
bufferSize: ring.size(),
153144
}, nil
154145
}
@@ -220,7 +211,7 @@ func (r *Reader) ReadInto(rec *Record) error {
220211
}
221212

222213
for {
223-
err := readRecord(r.ring, rec, r.header)
214+
err := readRecord(r.ring, rec)
224215
// Not using errors.Is which is quite a bit slower
225216
// For a tight loop it might make a difference
226217
if err == errBusy || err == errDiscard {

ringbuf/ring.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,26 @@ func (rr *ringReader) remaining() int {
102102
return int((prod - cons) & rr.mask)
103103
}
104104

105+
func (rr *ringReader) readHeader() (ringbufHeader, error) {
106+
prod := atomic.LoadUint64(rr.prod_pos)
107+
108+
if remaining := prod - rr.cons; remaining == 0 {
109+
return ringbufHeader{}, io.EOF
110+
} else if remaining < unix.BPF_RINGBUF_HDR_SZ {
111+
return ringbufHeader{}, io.ErrUnexpectedEOF
112+
}
113+
114+
// read the len field of the header atomically to ensure a happens before
115+
// relationship with the xchg in the kernel. Without this we may see len
116+
// without BPF_RINGBUF_BUSY_BIT before the written data is visible.
117+
// See https://github.com/torvalds/linux/blob/v6.8/kernel/bpf/ringbuf.c#L484
118+
len := atomic.LoadUint32((*uint32)((unsafe.Pointer)(&rr.ring[rr.cons&rr.mask])))
119+
header := ringbufHeader{Len: len}
120+
121+
rr.cons += unix.BPF_RINGBUF_HDR_SZ
122+
return header, nil
123+
}
124+
105125
func (rr *ringReader) Read(p []byte) (int, error) {
106126
prod := atomic.LoadUint64(rr.prod_pos)
107127
n := min(prod-rr.cons, uint64(len(p)))

0 commit comments

Comments
 (0)