Skip to content

Commit

Permalink
Merge pull request #63 from mariomac/capacity-limiter
Browse files Browse the repository at this point in the history
NETOBSERV-613: drop messages when they accumulate in the exporter
  • Loading branch information
jotak authored Oct 19, 2022
2 parents f63d104 + 1f06575 commit 689b24d
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 9 deletions.
11 changes: 6 additions & 5 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ flowchart TD
E(ebpf.FlowFetcher) --> |"pushes via<br/>RingBuffer"| RB(flow.RingBufTracer)
E <--> |"polls<br/>PerCPUHashMap"| M(flow.MapTracer)
RB --> |*flow.Record| ACC(flow.Accounter)
ACC --> |"[]*flow.Record"| DD(flow.Deduper)
M --> |"[]*flow.Record"| DD
RB --> |chan *flow.Record| ACC(flow.Accounter)
ACC --> |"chan []*flow.Record"| DD(flow.Deduper)
M --> |"chan []*flow.Record"| DD
subgraph Optional
DD
end
DD --> |"[]*flow.Record"| EX("export.GRPCProto<br/>or<br/>export.KafkaProto")
DD --> |"chan []*flow.Record"| CL(flow.CapacityLimiter)
CL --> |"chan []*flow.Record"| EX("export.GRPCProto<br/>or<br/>export.KafkaProto")
```
4 changes: 4 additions & 0 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ so no user should need to change them.

* `BUFFERS_LENGTH` (default: `50`). Length of the internal communication channels between the different
processing stages.
* `EXPORTER_BUFFER_LENGTH` (default: value of `BUFFERS_LENGTH`) establishes the length of the buffer
of flow batches (not individual flows) that can be accumulated before the Kafka or GRPC exporter.
When this buffer is full (e.g. because the Kafka or GRPC endpoint is slow), incoming flow batches
will be dropped. If unset, it defaults to the value of the `BUFFERS_LENGTH` property.
* `KAFKA_ASYNC` (default: `true`). If `true`, the message writing process will never block. It also
means that errors are ignored since the caller will not receive the returned value.
* `LISTEN_INTERFACES` (default: `watch`). Mechanism used by the agent to listen for added or removed
Expand Down
17 changes: 13 additions & 4 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,21 +265,30 @@ func (f *Flows) buildAndStartPipeline(ctx context.Context) (*node.Terminal, erro
accounter := node.AsMiddle(f.accounter.Account,
node.ChannelBufferLen(f.cfg.BuffersLength))

export := node.AsTerminal(f.exporter,
limiter := node.AsMiddle((&flow.CapacityLimiter{}).Limit,
node.ChannelBufferLen(f.cfg.BuffersLength))

ebl := f.cfg.ExporterBufferLength
if ebl == 0 {
ebl = f.cfg.BuffersLength
}

export := node.AsTerminal(f.exporter,
node.ChannelBufferLen(ebl))

rbTracer.SendsTo(accounter)

if f.cfg.Deduper == DeduperFirstCome {
deduper := node.AsMiddle(flow.Dedupe(f.cfg.DeduperFCExpiry),
node.ChannelBufferLen(f.cfg.BuffersLength))
mapTracer.SendsTo(deduper)
accounter.SendsTo(deduper)
deduper.SendsTo(export)
deduper.SendsTo(limiter)
} else {
mapTracer.SendsTo(export)
accounter.SendsTo(export)
mapTracer.SendsTo(limiter)
accounter.SendsTo(limiter)
}
limiter.SendsTo(export)
alog.Debug("starting graph")
mapTracer.Start()
rbTracer.Start()
Expand Down
5 changes: 5 additions & 0 deletions pkg/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ type Config struct {
// BuffersLength establishes the length of communication channels between the different processing
// stages
BuffersLength int `env:"BUFFERS_LENGTH" envDefault:"50"`
// ExporterBufferLength establishes the length of the buffer of flow batches (not individual flows)
// that can be accumulated before the Kafka or GRPC exporter. When this buffer is full (e.g.
// because the Kafka or GRPC endpoint is slow), incoming flow batches will be dropped. If unset,
// its value is the same as the BUFFERS_LENGTH property.
ExporterBufferLength int `env:"EXPORTER_BUFFER_LENGTH"`
// CacheMaxFlows specifies how many flows can be accumulated in the accounting cache before
// being flushed for its later export
CacheMaxFlows int `env:"CACHE_MAX_FLOWS" envDefault:"5000"`
Expand Down
59 changes: 59 additions & 0 deletions pkg/flow/limiter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package flow

import (
"time"

"github.com/sirupsen/logrus"
)

// initialLogPeriod is the starting interval between "dropped flows" warning messages.
const initialLogPeriod = time.Minute

// maxLogPeriod caps the exponential backoff applied to the warning interval.
const maxLogPeriod = time.Hour

// cllog is the logger used by the CapacityLimiter component.
var cllog = logrus.WithField("component", "capacity.Limiter")

// CapacityLimiter forwards the flows between two nodes but checks the status of the destination
// node's buffered channel. If it is already full, it drops the incoming flow and periodically will
// log a message about the number of lost flows.
type CapacityLimiter struct {
	// droppedFlows counts the flows discarded since the last report. It is
	// incremented by Limit and read/reset by logDroppedFlows without
	// synchronization: the race is deliberately tolerated because the value
	// is only used for logging.
	droppedFlows int
}

// Limit forwards each batch of flows from in to out, dropping whole batches
// (and accounting the number of discarded flows) whenever the destination
// channel cannot immediately accept them, so a slow consumer cannot
// back-pressure the rest of the pipeline. It also spawns the goroutine that
// periodically logs the accumulated drops.
func (c *CapacityLimiter) Limit(in <-chan []*Record, out chan<- []*Record) {
	go c.logDroppedFlows()
	for batch := range in {
		// Non-blocking send: unlike a len(out) < cap(out) check followed by a
		// send, this is free of the check-then-act race and also works for an
		// unbuffered out channel (where cap(out) == 0 would otherwise cause
		// every batch to be dropped even with a ready receiver).
		select {
		case out <- batch:
			// forwarded: the destination had room or a receiver was waiting
		default:
			// destination full: drop the batch and account the lost flows
			c.droppedFlows += len(batch)
		}
	}
}

// logDroppedFlows periodically emits a warning with the number of flows that
// were discarded since the previous report. Unless debug logging is enabled,
// the reporting interval doubles after each warning (capped at maxLogPeriod)
// to avoid flooding the logs; it resets to initialLogPeriod whenever a period
// elapses with no drops.
func (c *CapacityLimiter) logDroppedFlows() {
	debugging := logrus.IsLevelEnabled(logrus.DebugLevel)
	period := initialLogPeriod
	for {
		time.Sleep(period)

		// reading and resetting the counter races with Limit, but precision
		// is not required: the value is only used for logging purposes
		dropped := c.droppedFlows
		if dropped == 0 {
			period = initialLogPeriod
			continue
		}
		c.droppedFlows = 0
		cllog.Warnf("%d flows were dropped during the last %s because the agent is forwarding "+
			"more flows than the remote ingestor is able to process. You might "+
			"want to increase the CACHE_MAX_FLOWS and CACHE_ACTIVE_TIMEOUT property",
			dropped, period)

		// when not debugging, back off exponentially so repeated warnings
		// do not flood the log
		if !debugging && period < maxLogPeriod {
			period *= 2
			if period > maxLogPeriod {
				period = maxLogPeriod
			}
		}
	}
}
87 changes: 87 additions & 0 deletions pkg/flow/limiter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package flow

import (
"strconv"
"testing"

"github.com/netobserv/gopipes/pkg/node"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

const limiterLen = 50

// TestCapacityLimiter_NoDrop verifies that, while the downstream buffer has
// spare capacity, every submitted batch traverses the limiter untouched and
// in submission order.
func TestCapacityLimiter_NoDrop(t *testing.T) {
	// GIVEN a limiter-enabled pipeline
	input, output := capacityLimiterPipe()

	// WHEN it buffers fewer elements than its maximum capacity
	const sent = 33
	for n := 0; n < sent; n++ {
		input <- []*Record{{Interface: strconv.Itoa(n)}}
	}

	// THEN all the buffered elements can be retrieved, in order
	for n := 0; n < sent; n++ {
		batch := <-output
		require.Len(t, batch, 1)
		assert.Equal(t, strconv.Itoa(n), batch[0].Interface)
	}

	// AND not a single extra element
	select {
	case batch := <-output:
		assert.Failf(t, "unexpected element", "%#v", batch)
	default:
		// ok!
	}
}

// TestCapacityLimiter_Drop verifies that, once the downstream buffer is full,
// extra batches are silently discarded while the already-buffered elements are
// still delivered in order.
func TestCapacityLimiter_Drop(t *testing.T) {
	// GIVEN a limiter-enabled pipeline
	input, output := capacityLimiterPipe()

	// WHEN more elements than its maximum capacity are submitted
	// (submission never blocks)
	for n := 0; n < limiterLen*2; n++ {
		input <- []*Record{{Interface: strconv.Itoa(n)}}
	}

	// THEN only the nth first buffered elements can be retrieved
	// (plus the single element already held in the output channel)
	for n := 0; n < limiterLen+1; n++ {
		batch := <-output
		require.Len(t, batch, 1)
		assert.Equal(t, strconv.Itoa(n), batch[0].Interface)
	}

	// BUT not a single extra element
	select {
	case batch := <-output:
		assert.Failf(t, "unexpected element", "%#v", batch)
	default:
		// ok!
	}
}

// capacityLimiterPipe assembles a three-stage pipeline (forwarder -> limiter
// -> buffered sink) and returns the channels to feed and drain it. The sink's
// input channel is buffered with limiterLen entries, which is the capacity
// that the limiter watches.
func capacityLimiterPipe() (in chan<- []*Record, out <-chan []*Record) {
	input := make(chan []*Record)
	output := make(chan []*Record)

	start := node.AsInit(func(sendTo chan<- []*Record) {
		for batch := range input {
			sendTo <- batch
		}
	})
	limiter := node.AsMiddle((&CapacityLimiter{}).Limit)
	sink := node.AsTerminal(func(receiveFrom <-chan []*Record) {
		for batch := range receiveFrom {
			output <- batch
		}
	}, node.ChannelBufferLen(limiterLen))

	start.SendsTo(limiter)
	limiter.SendsTo(sink)

	start.Start()

	return input, output
}

0 comments on commit 689b24d

Please sign in to comment.