diff --git a/metrics/metrics.go b/metrics/metrics.go index 0abdfee..67a7700 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -152,6 +152,22 @@ var ( Help: "Number of snapshots taken.", }, ) + + // LargeNetlinkMsgTotal counts the total number of snapshots collected across all connections. + LargeNetlinkMsgTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "large_netlink_total", + Help: "Number of oversize netlink messages.", + }, []string{"type"}, + ) + + // NetlinkNotDecoded counts the total number of snapshots collected across all connections. + NetlinkNotDecoded = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "netlink_skipped_total", + Help: "Number of skipped netlink messages.", + }, []string{"type"}, + ) ) // init() prints a log message to let the user know that the package has been diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go index 31d460c..d5c098c 100644 --- a/snapshot/snapshot.go +++ b/snapshot/snapshot.go @@ -4,6 +4,7 @@ package snapshot import ( "errors" + "fmt" "io" "log" "time" @@ -11,6 +12,7 @@ import ( "github.com/m-lab/go/logx" "github.com/m-lab/tcp-info/inetdiag" + "github.com/m-lab/tcp-info/metrics" "github.com/m-lab/tcp-info/netlink" "github.com/m-lab/tcp-info/tcp" ) @@ -18,6 +20,8 @@ import ( // ErrEmptyRecord is returned if an ArchivalRecord is empty. var ErrEmptyRecord = errors.New("Message should contain Metadata or RawIDM") +var missingDecodeLog = logx.NewLogEvery(nil, time.Second) + // Decode decodes a netlink.ArchivalRecord into a single Snapshot // Initial ArchivalRecord may have just a Snapshot, just Metadata, or both. func Decode(ar *netlink.ArchivalRecord) (*netlink.Metadata, *Snapshot, error) { @@ -62,13 +66,17 @@ func Decode(ar *netlink.ArchivalRecord) (*netlink.Metadata, *Snapshot, error) { case inetdiag.INET_DIAG_PROTOCOL: result.Protocol, ok = rta.toProtocol() case inetdiag.INET_DIAG_SKV6ONLY: - log.Println("SKV6ONLY not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_SKV6ONLY").Inc() + missingDecodeLog.Println("SKV6ONLY not handled", len(rta)) case inetdiag.INET_DIAG_LOCALS: - log.Println("LOCAL not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_LOCALS").Inc() + missingDecodeLog.Println("LOCAL not handled", len(rta)) case inetdiag.INET_DIAG_PEERS: - log.Println("PEERS not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_PEERS").Inc() + missingDecodeLog.Println("PEERS not handled", len(rta)) case inetdiag.INET_DIAG_PAD: - log.Println("PAD not handled", len(rta)) + metrics.NetlinkNotDecoded.WithLabelValues("INET_DIAG_PAD").Inc() + missingDecodeLog.Println("PAD not handled", len(rta)) case inetdiag.INET_DIAG_MARK: result.Mark, ok = rta.toMark() case inetdiag.INET_DIAG_BBRINFO: @@ -76,10 +84,10 @@ func Decode(ar *netlink.ArchivalRecord) (*netlink.Metadata, *Snapshot, error) { case inetdiag.INET_DIAG_CLASS_ID: result.ClassID, ok = rta.toClassID() case inetdiag.INET_DIAG_MD5SIG: - log.Println("MD5SIGnot handled", len(rta)) + missingDecodeLog.Println("MD5SIGnot handled", len(rta)) default: - // TODO metric so we can alert. - log.Println("unhandled attribute type:", t) + metrics.NetlinkNotDecoded.WithLabelValues(fmt.Sprint(t)).Inc() + missingDecodeLog.Println("unhandled attribute type:", t) } bit := uint32(1) << uint8(t-1) result.Observed |= bit @@ -102,20 +110,23 @@ type RouteAttrValue []byte // maybeCopy checks whether the src is the full size of the intended struct size. // If so, it just returns the pointer, otherwise it copies the content to an // appropriately sized new byte slice, and returns pointer to that. -func maybeCopy(src []byte, size int) (unsafe.Pointer, bool) { +func maybeCopy(src []byte, size int, msgType string) (unsafe.Pointer, bool) { if len(src) < size { data := make([]byte, size) copy(data, src) return unsafe.Pointer(&data[0]), true } // TODO Check for larger than expected, and increment a metric with appropriate label. + if len(src) > size { + metrics.LargeNetlinkMsgTotal.WithLabelValues(msgType).Inc() + } return unsafe.Pointer(&src[0]), len(src) == size } // toMemInfo maps the raw RouteAttrValue onto a MemInfo. func (raw RouteAttrValue) toMemInfo() (*inetdiag.MemInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.MemInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "MemInfo") if !ok { oneSecondLog.Println("memInfo data is larger than struct") } @@ -126,7 +137,7 @@ func (raw RouteAttrValue) toMemInfo() (*inetdiag.MemInfo, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toLinuxTCPInfo() (*tcp.LinuxTCPInfo, bool) { structSize := (int)(unsafe.Sizeof(tcp.LinuxTCPInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "TCPInfo") if !ok { oneSecondLog.Println("tcpinfo data is larger than struct") } @@ -137,7 +148,7 @@ func (raw RouteAttrValue) toLinuxTCPInfo() (*tcp.LinuxTCPInfo, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toVegasInfo() (*inetdiag.VegasInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.VegasInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "VegasInfo") return (*inetdiag.VegasInfo)(data), ok } @@ -174,7 +185,7 @@ func (raw RouteAttrValue) toClassID() (uint8, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toSockMemInfo() (*inetdiag.SocketMemInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.SocketMemInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "SockMemInfo") return (*inetdiag.SocketMemInfo)(data), ok } @@ -186,7 +197,7 @@ func (raw RouteAttrValue) toShutdown() (uint8, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toDCTCPInfo() (*inetdiag.DCTCPInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.DCTCPInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "DCTCPInfo") return (*inetdiag.DCTCPInfo)(data), ok } @@ -206,7 +217,7 @@ func (raw RouteAttrValue) toMark() (uint32, bool) { // For older data, it may have to copy the bytes. func (raw RouteAttrValue) toBBRInfo() (*inetdiag.BBRInfo, bool) { structSize := (int)(unsafe.Sizeof(inetdiag.BBRInfo{})) - data, ok := maybeCopy(raw, structSize) + data, ok := maybeCopy(raw, structSize, "BBRInfo") return (*inetdiag.BBRInfo)(data), ok }