-
Notifications
You must be signed in to change notification settings - Fork 848
ringbuf: add zero-copy consumer APIs #1915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the reason for the change on this file?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CI told me to run There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. poking here -
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not really, we tend to be quite anal about reproducible builds and tests. This file is pulled from an image in cilium/ci-kernels, which saw some changes recently. A rebase should fix this. |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -5,9 +5,12 @@ import ( | |||||
| "fmt" | ||||||
| "os" | ||||||
| "sync" | ||||||
| "sync/atomic" | ||||||
| "time" | ||||||
| "unsafe" | ||||||
|
|
||||||
| "iter" | ||||||
|
|
||||||
| "github.com/cilium/ebpf" | ||||||
| "github.com/cilium/ebpf/internal/platform" | ||||||
| "github.com/cilium/ebpf/internal/sys" | ||||||
|
|
@@ -40,6 +43,11 @@ func (rh *ringbufHeader) dataLen() int { | |||||
| } | ||||||
|
|
||||||
| type Record struct { | ||||||
| // RawSample contains the raw bytes of a ringbuf record. | ||||||
| // | ||||||
| // When obtained via [Reader.Records], RawSample is a zero-copy view into the | ||||||
| // underlying mmap. It is only valid until the iterator yields the next | ||||||
| // record or terminates. Callers must copy the data if they need to retain it. | ||||||
| RawSample []byte | ||||||
|
|
||||||
| // The minimum number of bytes remaining in the ring buffer after this Record has been read. | ||||||
|
|
@@ -144,6 +152,59 @@ func (r *Reader) Read() (Record, error) { | |||||
|
|
||||||
| // ReadInto is like Read except that it allows reusing Record and associated buffers. | ||||||
| func (r *Reader) ReadInto(rec *Record) error { | ||||||
| return r.readLocked(func() error { | ||||||
| return r.ring.readRecord(rec) | ||||||
| }) | ||||||
| } | ||||||
|
|
||||||
| // Records iterates over records in the reader until [Reader.Close] is called. | ||||||
| // | ||||||
| // Record.RawSample is only valid until the next call to the iterator. Callers | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Even more explicit: both Record and Record.RawSample are only valid until the next call. |
||||||
| // must copy the data if they need it to outlive the current iteration. | ||||||
| // | ||||||
| // This convenience wrapper allocates a single Record once. To fully avoid | ||||||
| // allocations, use [Reader.RecordsInto] and pass in a reusable Record. | ||||||
| func (r *Reader) Records() iter.Seq2[*Record, error] { | ||||||
| rec := Record{} | ||||||
| return r.RecordsInto(&rec) | ||||||
| } | ||||||
|
|
||||||
| // RecordsInto is like Records but allows reusing the Record and associated buffers. | ||||||
| func (r *Reader) RecordsInto(rec *Record) iter.Seq2[*Record, error] { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If |
||||||
| return func(yield func(*Record, error) bool) { | ||||||
| var ( | ||||||
| sample []byte | ||||||
| remaining int | ||||||
| nextCons uintptr | ||||||
| ) | ||||||
|
|
||||||
| for { | ||||||
| err := r.readLocked(func() error { | ||||||
| var err error | ||||||
| sample, remaining, nextCons, err = r.ring.readSample() | ||||||
| return err | ||||||
| }) | ||||||
| if err != nil { | ||||||
| yield(nil, err) | ||||||
| return | ||||||
| } | ||||||
|
|
||||||
| // Limit cap to len so append can't write past the record and corrupt the ring. | ||||||
| rec.RawSample = sample[:len(sample):len(sample)] | ||||||
| rec.Remaining = remaining | ||||||
|
|
||||||
| if !yield(rec, nil) { | ||||||
| atomic.StoreUintptr(r.ring.cons_pos, nextCons) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this is safe to do. Consider two concurrent callers of Records():
I think you need to hold the lock across the yield. Maybe it's as simple as moving it all into readLocked?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am aware of this concurrent read issue; my first version held the lock for the duration of the iterator. However, that leads to a deadlock when calling reader.SetDeadline: for rec, err := range reader.Records() {
reader.SetDeadline() // deadlock
}reader.Close() also tries to acquire reader.mu, so it would become impossible to call Close() during iteration 😬
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I drafted a PR on my fork for your preview: jschwinger233#5
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not being able to call SetDeadline is a problem; that should work. This might mean you need to split the lock somehow. Take a look at the perf reader, which has more complex locking due to pauseMu. IIRC it also drops the lock before waiting, and then reacquires it afterwards. Maybe you can use a similar strategy here (which could resolve Close() during Wait()).
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Let me make this clear: we are talking about something like for rec, err := range reader.Records() {
// reader.mu is released
reader.SetDeadline() // no deadlock
}This strategy doesn't fix the two concurrent callers of Records(), right?
Consider 1st reader: for rec, err := range reader.Records() {
// read.mu is released
time.Sleep() // for some reason, let's sleep
handle_rec()
}2nd reader: for rec, err := range reader.Records() {
// read.mu is released, so no blocking when 1st reader is sleeping
handle_rec()
}The 2nd reader will advance the consumer_pointer, leaving the slower 1st reader's rec invalid. |
||||||
| return | ||||||
| } | ||||||
|
|
||||||
| atomic.StoreUintptr(r.ring.cons_pos, nextCons) | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // readLocked drives the polling / data-availability loop shared by Record reads. | ||||||
| func (r *Reader) readLocked(read func() error) error { | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. having read be a closure over Reader state is a bit black magic. How about
Suggested change
readLocked would be responsible for calling readSample and advancing the consumer position. Probably also easier for the compiler to turn into good code since handle doesn't have to be a closure. |
||||||
| r.mu.Lock() | ||||||
| defer r.mu.Unlock() | ||||||
|
|
||||||
|
|
@@ -171,9 +232,7 @@ func (r *Reader) ReadInto(rec *Record) error { | |||||
| } | ||||||
|
|
||||||
| for { | ||||||
| err := r.ring.readRecord(rec) | ||||||
| // Not using errors.Is which is quite a bit slower | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: please keep the comment. |
||||||
| // For a tight loop it might make a difference | ||||||
| err := read() | ||||||
| if err == errBusy { | ||||||
| continue | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -384,3 +384,34 @@ func BenchmarkReadInto(b *testing.B) { | |
| } | ||
| } | ||
| } | ||
|
|
||
| func TestRecordsIterator(t *testing.T) { | ||
| testutils.SkipOnOldKernel(t, "5.8", "BPF ring buffer") | ||
|
|
||
| prog, events := mustOutputSamplesProg(t, | ||
| sampleMessage{size: 5, flags: 0}, | ||
| sampleMessage{size: 7, flags: 0}, | ||
| ) | ||
| mustRun(t, prog) | ||
|
|
||
| rd, err := NewReader(events) | ||
| if err != nil { | ||
| t.Fatal(err) | ||
| } | ||
| defer rd.Close() | ||
|
|
||
| var seen [][]byte | ||
| for rec, err := range rd.Records() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This needs a test that Reader.Close() stops the iterator without yielding io.ErrClosed. Using iter.Pull might be the easiest way to achieve that. |
||
| if err != nil { | ||
| t.Fatalf("iteration error: %v", err) | ||
| } | ||
| seen = append(seen, append([]byte(nil), rec.RawSample...)) | ||
| if len(seen) == 2 { | ||
| break | ||
| } | ||
| } | ||
|
|
||
| qt.Assert(t, qt.Equals(len(seen), 2)) | ||
| qt.Assert(t, qt.DeepEquals(seen[0], []byte{1, 2, 3, 4, 4})) | ||
| qt.Assert(t, qt.DeepEquals(seen[1], []byte{1, 2, 3, 4, 4, 3, 2})) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,16 +41,17 @@ func (rr *ringReader) AvailableBytes() uint64 { | |
| return uint64(prod - cons) | ||
| } | ||
|
|
||
| // Read a record from an event ring. | ||
| func (rr *ringReader) readRecord(rec *Record) error { | ||
| // readSample returns a zero-copy view into the next sample, together with the | ||
| // consumer position that should be stored to release the data. | ||
| func (rr *ringReader) readSample() (sample []byte, remaining int, nextCons uintptr, err error) { | ||
| prod := atomic.LoadUintptr(rr.prod_pos) | ||
| cons := atomic.LoadUintptr(rr.cons_pos) | ||
|
|
||
| for { | ||
| if remaining := prod - cons; remaining == 0 { | ||
| return errEOR | ||
| return nil, 0, 0, errEOR | ||
| } else if remaining < sys.BPF_RINGBUF_HDR_SZ { | ||
| return fmt.Errorf("read record header: %w", io.ErrUnexpectedEOF) | ||
| return nil, 0, 0, fmt.Errorf("read record header: %w", io.ErrUnexpectedEOF) | ||
| } | ||
|
|
||
| // read the len field of the header atomically to ensure a happens before | ||
|
|
@@ -65,15 +66,15 @@ func (rr *ringReader) readRecord(rec *Record) error { | |
| // the next sample in the ring is not committed yet so we | ||
| // exit without storing the reader/consumer position | ||
| // and start again from the same position. | ||
| return errBusy | ||
| return nil, 0, 0, errBusy | ||
| } | ||
|
|
||
| cons += sys.BPF_RINGBUF_HDR_SZ | ||
|
|
||
| // Data is always padded to 8 byte alignment. | ||
| dataLenAligned := uintptr(internal.Align(header.dataLen(), 8)) | ||
| if remaining := prod - cons; remaining < dataLenAligned { | ||
| return fmt.Errorf("read sample data: %w", io.ErrUnexpectedEOF) | ||
| return nil, 0, 0, fmt.Errorf("read sample data: %w", io.ErrUnexpectedEOF) | ||
| } | ||
|
|
||
| start = cons & rr.mask | ||
|
|
@@ -87,15 +88,26 @@ func (rr *ringReader) readRecord(rec *Record) error { | |
| continue | ||
| } | ||
|
|
||
| if n := header.dataLen(); cap(rec.RawSample) < n { | ||
| rec.RawSample = make([]byte, n) | ||
| } else { | ||
| rec.RawSample = rec.RawSample[:n] | ||
| } | ||
| end := int(start) + header.dataLen() | ||
| return rr.ring[start:end], int(prod - cons), cons, nil | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reslicing to reduce cap could already happen here. |
||
| } | ||
| } | ||
|
|
||
| copy(rec.RawSample, rr.ring[start:]) | ||
| rec.Remaining = int(prod - cons) | ||
| atomic.StoreUintptr(rr.cons_pos, cons) | ||
| return nil | ||
| // Read a record from an event ring, copying the sample into the provided Record. | ||
| func (rr *ringReader) readRecord(rec *Record) error { | ||
| sample, remaining, nextCons, err := rr.readSample() | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if n := len(sample); cap(rec.RawSample) < n { | ||
| rec.RawSample = make([]byte, n) | ||
| } else { | ||
| rec.RawSample = rec.RawSample[:n] | ||
| } | ||
|
|
||
| copy(rec.RawSample, sample) | ||
| rec.Remaining = remaining | ||
| atomic.StoreUintptr(rr.cons_pos, nextCons) | ||
| return nil | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.