From 70686e3df1b786e799fc91b7fdef3705d9f098b5 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Tue, 30 Jun 2020 12:09:03 -0400 Subject: [PATCH 1/3] add oversize msg metric --- metrics/metrics.go | 8 ++++++++ snapshot/snapshot.go | 18 +++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 0abdfee..5dabcd2 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -152,6 +152,14 @@ var ( Help: "Number of snapshots taken.", }, ) + + // LargeNetlinkMsgTotal counts the total number of snapshots collected across all connections. + LargeNetlinkMsgTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "large_netlink_total", + Help: "Number of oversize netlink messages.", + }, []string{"type"}, + ) ) // init() prints a log message to let the user know that the package has been diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go index 31d460c..ff33f33 100644 --- a/snapshot/snapshot.go +++ b/snapshot/snapshot.go @@ -11,6 +11,7 @@ import ( "github.com/m-lab/go/logx" "github.com/m-lab/tcp-info/inetdiag" + "github.com/m-lab/tcp-info/metrics" "github.com/m-lab/tcp-info/netlink" "github.com/m-lab/tcp-info/tcp" ) @@ -102,20 +103,23 @@ type RouteAttrValue []byte // maybeCopy checks whether the src is the full size of the intended struct size. // If so, it just returns the pointer, otherwise it copies the content to an // appropriately sized new byte slice, and returns pointer to that. -func maybeCopy(src []byte, size int) (unsafe.Pointer, bool) { +func maybeCopy(src []byte, size int, msgType string) (unsafe.Pointer, bool) { if len(src) < size { data := make([]byte, size) copy(data, src) return unsafe.Pointer(&data[0]), true } // TODO Check for larger than expected, and increment a metric with appropriate label. + if len(src) > size { + metrics.LargeNetlinkMsgTotal.WithLabelValues(msgType).Inc() + } return unsafe.Pointer(&src[0]), len(src) == size } // toMemInfo maps the raw RouteAttrValue onto a MemInfo. func (raw RouteAttrValue) toMemInfo() (*inetdiag.MemInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.MemInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "MemInfo") if !ok { oneSecondLog.Println("memInfo data is larger than struct") } @@ -126,7 +130,7 @@ func (raw RouteAttrValue) toMemInfo() (*inetdiag.MemInfo, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toLinuxTCPInfo() (*tcp.LinuxTCPInfo, bool) { structSize := (int)(unsafe.Sizeof(tcp.LinuxTCPInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "TCPInfo") if !ok { oneSecondLog.Println("tcpinfo data is larger than struct") } @@ -137,7 +141,7 @@ func (raw RouteAttrValue) toLinuxTCPInfo() (*tcp.LinuxTCPInfo, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toVegasInfo() (*inetdiag.VegasInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.VegasInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "VegasInfo") return (*inetdiag.VegasInfo)(data), ok } @@ -174,7 +178,7 @@ func (raw RouteAttrValue) toClassID() (uint8, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toSockMemInfo() (*inetdiag.SocketMemInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.SocketMemInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "SockMemInfo") return (*inetdiag.SocketMemInfo)(data), ok } @@ -186,7 +190,7 @@ func (raw RouteAttrValue) toShutdown() (uint8, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toDCTCPInfo() (*inetdiag.DCTCPInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.DCTCPInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "DCTCPInfo") return (*inetdiag.DCTCPInfo)(data), ok } @@ -206,7 +210,7 @@ func (raw RouteAttrValue) toMark() (uint32, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toBBRInfo() (*inetdiag.BBRInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.BBRInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "BBRInfo") return (*inetdiag.BBRInfo)(data), ok } From ebee1bd4dd6a1ce989cd5ffd83f27dc1ee7ad929 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Tue, 30 Jun 2020 16:35:24 -0400 Subject: [PATCH 2/3] add metric for ignored messages --- metrics/metrics.go | 8 ++++++++ snapshot/snapshot.go | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 5dabcd2..d0cc18f 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -160,6 +160,14 @@ var ( Help: "Number of oversize netlink messages.", }, []string{"type"}, ) + + // LargeNetlinkMsgTotal counts the total number of snapshots collected across all connections. + NetlinkNotDecoded = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "netlink_skipped_total", + Help: "Number of skipped netlink messages.", + }, []string{"type"}, + ) ) // init() prints a log message to let the user know that the package has been diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go index ff33f33..d5c098c 100644 --- a/snapshot/snapshot.go +++ b/snapshot/snapshot.go @@ -4,6 +4,7 @@ package snapshot import ( "errors" + "fmt" "io" "log" "time" @@ -19,6 +20,8 @@ import ( // ErrEmptyRecord is returned if an ArchivalRecord is empty. var ErrEmptyRecord = errors.New("Message should contain Metadata or RawIDM") +var missingDecodeLog = logx.NewLogEvery(nil, time.Second) + // Decode decodes a netlink.ArchivalRecord into a single Snapshot // Initial ArchivalRecord may have just a Snapshot, just Metadata, or both. func Decode(ar *netlink.ArchivalRecord) (*netlink.Metadata, *Snapshot, error) { @@ -63,13 +66,17 @@ func Decode(ar *netlink.ArchivalRecord) (*netlink.Metadata, *Snapshot, error) { case inetdiag.INET_DIAG_PROTOCOL: result.Protocol, ok = rta.toProtocol() case inetdiag.INET_DIAG_SKV6ONLY: - log.Println("SKV6ONLY not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_SKV6ONLY").Inc() + missingDecodeLog.Println("SKV6ONLY not handled", len(rta)) case inetdiag.INET_DIAG_LOCALS: - log.Println("LOCAL not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_LOCALS").Inc() + missingDecodeLog.Println("LOCAL not handled", len(rta)) case inetdiag.INET_DIAG_PEERS: - log.Println("PEERS not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_PEERS").Inc() + missingDecodeLog.Println("PEERS not handled", len(rta)) case inetdiag.INET_DIAG_PAD: - log.Println("PAD not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_PAD").Inc() + missingDecodeLog.Println("PAD not handled", len(rta)) case inetdiag.INET_DIAG_MARK: result.Mark, ok = rta.toMark() case inetdiag.INET_DIAG_BBRINFO: @@ -77,10 +84,10 @@ func Decode(ar *netlink.ArchivalRecord) (*netlink.Metadata, *Snapshot, error) { case inetdiag.INET_DIAG_CLASS_ID: result.ClassID, ok = rta.toClassID() case inetdiag.INET_DIAG_MD5SIG: - log.Println("MD5SIGnot handled", len(rta)) + missingDecodeLog.Println("MD5SIGnot handled", len(rta)) default: - // TODO metric so we can alert. - log.Println("unhandled attribute type:", t) + metrics.NetlinkNotDecoded.WithLabelValues(fmt.Sprint(t)).Inc() + missingDecodeLog.Println("unhandled attribute type:", t) } bit := uint32(1) << uint8(t-1) result.Observed |= bit From 279a15fa69d91e5b46d1965c6191da517eadc6fa Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Tue, 30 Jun 2020 16:53:30 -0400 Subject: [PATCH 3/3] fix docstring --- metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index d0cc18f..67a7700 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -161,7 +161,7 @@ var ( }, []string{"type"}, ) - // LargeNetlinkMsgTotal counts the total number of snapshots collected across all connections. + // NetlinkNotDecoded counts the total number of snapshots collected across all connections. NetlinkNotDecoded = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "netlink_skipped_total",