From 21d8899c70bcd091397c6afb92f1d43c6d6cb2f3 Mon Sep 17 00:00:00 2001 From: Adrian Lopez Date: Tue, 26 Jan 2021 18:37:39 +0100 Subject: [PATCH 1/4] Procstat: collect connections and listener for each proc Procstat modified to add a new metric "procstat_tcp" with to values "conn" and "listen". It also adds number of connections for each TCP state. Those metrics will be added only if it they are activated specifically. Those values will contain a comma separated value of the endpoints where the proc is connecting to or listening (IPv4 and IPv6). Only for linux. Local, virtual and docker interfaces are ignored. If some proc is listening on 0.0.0.0, an endpoint for each of the "public" (those not ignored as internal) IPv4 IPs is created. If it is listening on :: (IPv6) an endpoint is created for each IPv4 and IPv6. For programs with one parent and several children, all listening in some endpoint, only the parent process is taken into account. Child endpoints are ignored. Connections made to this cost (local port is one of the listening ports) are ignored. To avoid having servers with thousands of connections. We prefer to collect that info in the clients. It is also added connection info (number of connections in each of the TCP states) for each proc. Improving PR #5402 To gather tcp connections netlink is used, to avoid the cost of parsing /proc/net/tcp(6), but /proc should be readed to get the mapping between inodes and pids. --- docs/LICENSE_OF_DEPENDENCIES.md | 1 + go.mod | 1 + go.sum | 3 + plugins/inputs/procstat/README.md | 44 ++ plugins/inputs/procstat/connections.go | 38 ++ .../inputs/procstat/connections_fallback.go | 35 + plugins/inputs/procstat/connections_linux.go | 227 +++++++ plugins/inputs/procstat/process.go | 10 + plugins/inputs/procstat/procstat.go | 131 +++- plugins/inputs/procstat/procstat_fallback.go | 16 + plugins/inputs/procstat/procstat_linux.go | 109 +++ .../inputs/procstat/procstat_linux_test.go | 625 ++++++++++++++++++ plugins/inputs/procstat/procstat_test.go | 6 + 13 files changed, 1229 insertions(+), 17 deletions(-) create mode 100644 plugins/inputs/procstat/connections.go create mode 100644 plugins/inputs/procstat/connections_fallback.go create mode 100644 plugins/inputs/procstat/connections_linux.go create mode 100644 plugins/inputs/procstat/procstat_fallback.go create mode 100644 plugins/inputs/procstat/procstat_linux.go create mode 100644 plugins/inputs/procstat/procstat_linux_test.go diff --git a/docs/LICENSE_OF_DEPENDENCIES.md b/docs/LICENSE_OF_DEPENDENCIES.md index 14c46448c3b4a..e6b9540b969b9 100644 --- a/docs/LICENSE_OF_DEPENDENCIES.md +++ b/docs/LICENSE_OF_DEPENDENCIES.md @@ -49,6 +49,7 @@ following works: - github.com/eapache/go-xerial-snappy [MIT License](https://github.com/eapache/go-xerial-snappy/blob/master/LICENSE) - github.com/eapache/queue [MIT License](https://github.com/eapache/queue/blob/master/LICENSE) - github.com/eclipse/paho.mqtt.golang [Eclipse Public License - v 1.0](https://github.com/eclipse/paho.mqtt.golang/blob/master/LICENSE) +- github.com/elastic/gosigar [Apache License 2.0](https://github.com/elastic/gosigar/blob/master/LICENSE) - github.com/ericchiang/k8s [Apache License 2.0](https://github.com/ericchiang/k8s/blob/master/LICENSE) - github.com/ghodss/yaml [MIT License](https://github.com/ghodss/yaml/blob/master/LICENSE) - github.com/go-logfmt/logfmt [MIT License](https://github.com/go-logfmt/logfmt/blob/master/LICENSE) diff --git a/go.mod b/go.mod index 45a9a48ba618e..a4e8a3ef2368a 100644 --- a/go.mod +++ b/go.mod @@ -47,6 +47,7 @@ require ( github.com/docker/go-units v0.3.3 // indirect github.com/docker/libnetwork v0.8.0-dev.2.0.20181012153825-d7b61745d166 github.com/eclipse/paho.mqtt.golang v1.2.0 + github.com/elastic/gosigar v0.13.0 github.com/ericchiang/k8s v1.2.0 github.com/ghodss/yaml v1.0.1-0.20190212211648-25d852aebe32 github.com/go-logfmt/logfmt v0.4.0 diff --git a/go.sum b/go.sum index 18fc73ab9df13..8807b3c0b9c0e 100644 --- a/go.sum +++ b/go.sum @@ -188,6 +188,8 @@ github.com/eapache/queue v1.1.0 h1:YOEu7KNc61ntiQlcEeUIoDTJ2o8mQznoNvUhiigpIqc= github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I= github.com/eclipse/paho.mqtt.golang v1.2.0 h1:1F8mhG9+aO5/xpdtFkW4SxOJB67ukuDC3t2y2qayIX0= github.com/eclipse/paho.mqtt.golang v1.2.0/go.mod h1:H9keYFcgq3Qr5OUJm/JZI/i6U7joQ8SYLhZwfeOo6Ts= +github.com/elastic/gosigar v0.13.0 h1:EIeuQcLPKia759s6mlVztlxUyKiKYHo6y6kOODOLO7A= +github.com/elastic/gosigar v0.13.0/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= github.com/elazarl/goproxy v0.0.0-20170405201442-c4fc26588b6e/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= @@ -729,6 +731,7 @@ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a h1:WXEvlFVvvGxCJLG6REjsT03iWnKLEWinaScsxF2Vm2o= golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20170830134202-bb24a47a89ea/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180810173357-98c5dad5d1a0/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= diff --git a/plugins/inputs/procstat/README.md b/plugins/inputs/procstat/README.md index 8d43d86eaf568..ebc8fc568fd67 100644 --- a/plugins/inputs/procstat/README.md +++ b/plugins/inputs/procstat/README.md @@ -60,6 +60,12 @@ Processes can be selected for monitoring using one of several methods: ## the native finder performs the search directly in a manor dependent on the ## platform. Default is 'pgrep' # pid_finder = "pgrep" + + ## Select wich extra metrics should be added: + ## - "connections_stats": tcp_* and upd_socket metrics + ## - "connections_endpoints": new metric procstat_tcp with connections and listeners endpoints + ## Default is empty list. + # metrics_include = ["connections_stats", "connections_endpoints"] ``` #### Windows support @@ -157,6 +163,42 @@ the `win_perf_counters` input plugin as a more mature alternative. - running (int) - result_code (int, success = 0, lookup_error = 1) +If ``connections_stats`` enabled, added fields: +- procstat + - fields: + - tcp_close (int) + - tcp_close_wait (int) + - tcp_closing (int) + - tcp_established (int) + - tcp_fin_wait1 (int) + - tcp_fin_wait2 (int) + - tcp_last_ack (int) + - tcp_listen (int) + - tcp_none (int) + - tcp_syn_recv (int) + - tcp_syn_sent (int) + +If ``connections_endpoints`` enabled, added fields: +- procstat_tcp + - tags: + - pid (when `pid_tag` is true) + - cmdline (when 'cmdline_tag' is true) + - process_name + - pidfile (when defined) + - exe (when defined) + - pattern (when defined) + - user (when selected) + - systemd_unit (when defined) + - cgroup (when defined) + - fields: + - conn (string) + - listen (string) + +To gather connection info, if Telegraf is not run as root, it needs the following capabilities +``` +sudo setcap "CAP_DAC_READ_SEARCH,CAP_SYS_PTRACE+ep" telegraf +``` + *NOTE: Resource limit > 2147483647 will be reported as 2147483647.* ### Example Output: @@ -164,4 +206,6 @@ the `win_perf_counters` input plugin as a more mature alternative. ``` procstat_lookup,host=prash-laptop,pattern=influxd,pid_finder=pgrep,result=success pid_count=1i,running=1i,result_code=0i 1582089700000000000 procstat,host=prash-laptop,pattern=influxd,process_name=influxd,user=root involuntary_context_switches=151496i,child_minor_faults=1061i,child_major_faults=8i,cpu_time_user=2564.81,cpu_time_idle=0,cpu_time_irq=0,cpu_time_guest=0,pid=32025i,major_faults=8609i,created_at=1580107536000000000i,voluntary_context_switches=1058996i,cpu_time_system=616.98,cpu_time_steal=0,cpu_time_guest_nice=0,memory_swap=0i,memory_locked=0i,memory_usage=1.7797634601593018,num_threads=18i,cpu_time_nice=0,cpu_time_iowait=0,cpu_time_soft_irq=0,memory_rss=148643840i,memory_vms=1435688960i,memory_data=0i,memory_stack=0i,minor_faults=1856550i 1582089700000000000 +procstat,host=laptop,pattern=httpd,process_name=httpd,user=root child_major_faults=0i,child_minor_faults=70i,cpu_time=0i,cpu_time_guest=0,cpu_time_guest_nice=0,cpu_time_idle=0,cpu_time_iowait=0,cpu_time_irq=0,cpu_time_nice=0,cpu_time_soft_irq=0,cpu_time_steal=0,cpu_time_system=0.01,cpu_time_user=0.02,cpu_usage=0,created_at=1611738400000000000i,involuntary_context_switches=15i,listen=1i,major_faults=0i,memory_data=999424i,memory_locked=0i,memory_rss=4677632i,memory_stack=135168i,memory_swap=0i,memory_usage=0.013990458101034164,memory_vms=6078464i,minor_faults=1636i,nice_priority=20i,num_fds=8i,num_threads=1i,pid=1738811i,read_bytes=0i,read_count=4397i,realtime_priority=0i,rlimit_cpu_time_hard=2147483647i,rlimit_cpu_time_soft=2147483647i,rlimit_file_locks_hard=2147483647i,rlimit_file_locks_soft=2147483647i,rlimit_memory_data_hard=2147483647i,rlimit_memory_data_soft=2147483647i,rlimit_memory_locked_hard=65536i,rlimit_memory_locked_soft=65536i,rlimit_memory_rss_hard=2147483647i,rlimit_memory_rss_soft=2147483647i,rlimit_memory_stack_hard=2147483647i,rlimit_memory_stack_soft=8388608i,rlimit_memory_vms_hard=2147483647i,rlimit_memory_vms_soft=2147483647i,rlimit_nice_priority_hard=0i,rlimit_nice_priority_soft=0i,rlimit_num_fds_hard=1048576i,rlimit_num_fds_soft=1048576i,rlimit_realtime_priority_hard=0i,rlimit_realtime_priority_soft=0i,rlimit_signals_pending_hard=127473i,rlimit_signals_pending_soft=127473i,signals_pending=0i,tcp_close=0i,tcp_close_wait=0i,tcp_closing=0i,tcp_established=0i,tcp_fin_wait1=0i,tcp_fin_wait2=0i,tcp_last_ack=0i,tcp_listen=1i,tcp_syn_recv=0i,tcp_syn_sent=0i,voluntary_context_switches=169i,write_bytes=53248i,write_count=10i 1611738522000000000 +procstat_tcp,host=laptop,pattern=httpd,process_name=httpd,user=root conn="",listen="192.168.1.35:80,192.168.1.48:80,[da01:beef:234:3830:aeda:f001:a00c:0091]:80,[aa01:beef:234:3830:e8e:0000:000a:6b0f]:80" 1611738522000000000 ``` diff --git a/plugins/inputs/procstat/connections.go b/plugins/inputs/procstat/connections.go new file mode 100644 index 0000000000000..a6962657b7ca5 --- /dev/null +++ b/plugins/inputs/procstat/connections.go @@ -0,0 +1,38 @@ +package procstat + +import ( + "fmt" + "net" +) + +const ( + // DockerMACPrefix https://macaddress.io/faq/how-to-recognise-a-docker-container-by-its-mac-address + DockerMACPrefix = "02:42" + // VirtualBoxMACPrefix https://github.com/mdaniel/virtualbox-org-svn-vbox-trunk/blob/2d259f948bc352ee400f9fd41c4a08710cd9138a/src/VBox/HostDrivers/VBoxNetAdp/VBoxNetAdp.c#L93 + VirtualBoxMACPrefix = "0a:00:27" + // HardwareAddrLength is the number of bytes of a MAC address + HardwareAddrLength = 6 +) + +var ( + // ErrorPIDNotFound is the error generated when the pid does not have network info + ErrorPIDNotFound = fmt.Errorf("pid not found") +) + +// InodeInfo represents information of a proc associated with an inode +type InodeInfo struct { + pid uint32 + ppid uint32 +} + +// NetworkInfo implements NetworkInfo using the netlink calls and parsing /proc to map sockets to PIDs +type NetworkInfo struct { + // tcp contains the connection info for each pid + tcp map[uint32][]ConnInfo + // listenPorts is a map with the listen ports in the host, used to ignore inbound connections + listenPorts map[uint32]interface{} + // publicIPs list of IPs considered "public" (used to connect to other hosts) + publicIPs []net.IP + // privateIPs list of IPs considered "private" (loopback, virtual interfaces, point2point, etc) + privateIPs []net.IP +} diff --git a/plugins/inputs/procstat/connections_fallback.go b/plugins/inputs/procstat/connections_fallback.go new file mode 100644 index 0000000000000..8bc2968fe97ff --- /dev/null +++ b/plugins/inputs/procstat/connections_fallback.go @@ -0,0 +1,35 @@ +// +build !linux + +package procstat + +import ( + "fmt" + "net" +) + +type ConnInfo struct { +} + +func (n *NetworkInfo) IsAListenPort(port uint32) bool { + return false +} + +func (n *NetworkInfo) Fetch() error { + return nil +} + +func (n *NetworkInfo) GetConnectionsByPid(pid uint32) (conn []ConnInfo, err error) { + return conn, fmt.Errorf("platform not supported") +} + +func (n *NetworkInfo) GetPublicIPs() []net.IP { + return []net.IP{} +} + +func (n *NetworkInfo) GetPrivateIPs() []net.IP { + return []net.IP{} +} + +func (n *NetworkInfo) IsPidListeningInAddr(pid uint32, ip net.IP, port uint32) bool { + return false +} diff --git a/plugins/inputs/procstat/connections_linux.go b/plugins/inputs/procstat/connections_linux.go new file mode 100644 index 0000000000000..cb0868e068da2 --- /dev/null +++ b/plugins/inputs/procstat/connections_linux.go @@ -0,0 +1,227 @@ +// +build linux + +package procstat + +import ( + "fmt" + "io" + "net" + "os" + "strconv" + "strings" + + "github.com/elastic/gosigar/sys/linux" +) + +// ConnInfo represents a single proc's connection and the parent pid (for practical purpouses) +type ConnInfo struct { + state linux.TCPState + srcIP net.IP + srcPort uint32 + dstIP net.IP + dstPort uint32 +} + +// IsAListenPort returns true if the port param is associated with a listener found in the host connections +func (n *NetworkInfo) IsAListenPort(port uint32) bool { + _, ok := n.listenPorts[port] + return ok +} + +// Fetch fetches network info: TCP connections and hosts' IPs. +// Parameter getConnections is the function that will be used to obtain TCP connections +// Parameter getLocalIPs is the function that will be used to get IPs. +// It is passed as a parameter to facilitate testing +func (n *NetworkInfo) Fetch() error { + var err error + n.tcp, n.listenPorts, err = getTCPProcInfo() + if err != nil { + return fmt.Errorf("gathering host TCP info: %v", err) + } + + // Get IPs, to be able to resolve procs listening in 0.0.0.0 or :: + ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("getting network interfaces: %v", err) + } + + n.publicIPs, n.privateIPs, err = getLocalIPs(ifaces) + if err != nil { + return fmt.Errorf("E! Error: procstat getting local IPs: %v", err) + } + + return nil +} + +// GetConnectionsByPid return connection info for a particular PID +func (n *NetworkInfo) GetConnectionsByPid(pid uint32) (conn []ConnInfo, err error) { + conn, ok := n.tcp[pid] + if !ok { + return conn, ErrorPIDNotFound + } + return conn, nil +} + +// GetPublicIPs return the list of public IPs (used to connect to others hosts) +func (n *NetworkInfo) GetPublicIPs() []net.IP { + return n.publicIPs +} + +// GetPrivateIPs return the list of private IPs (loopback devices, virtual, point2point) +func (n *NetworkInfo) GetPrivateIPs() []net.IP { + return n.privateIPs +} + +// IsPidListeningInAddr returns true if the pid has a listener in that ip and port +// Return false is pid=0 +func (n *NetworkInfo) IsPidListeningInAddr(pid uint32, ip net.IP, port uint32) bool { + if pid == 0 { + return false + } + + for _, c := range n.tcp[pid] { + if c.srcIP.Equal(ip) && c.srcPort == port { + return true + } + } + + return false +} + +// getLocalIPs return the IPv4/v6 addresses active in the current host divided in two groups: +// "publicIPs" contains addresses to connect with other external hosts. +// "privateIPs" contains loopback addreses, virtual interfaces, etc. +// This division is a best effort and probably does not contains all the possibilities. +// It should extract the information from a list of interfaces passed as a parameter. +func getLocalIPs(ifaces []net.Interface) (publicIPs, privateIPs []net.IP, err error) { + for _, i := range ifaces { + // Ignore down interfaces + if i.Flags&net.FlagUp == 0 { + continue + } + + addresses, err := i.Addrs() + if err != nil { + return nil, nil, fmt.Errorf("getting addresses from interfaces: %v", err) + } + + ips, err := extractIPs(addresses) + if err != nil { + return nil, nil, fmt.Errorf("getting IPs from interface addresses: %v", err) + } + + // TODO allow passing a list of MAC headers to ignore + // TODO allow passing exclude list of regexp to match interface names + if i.Flags&net.FlagLoopback != 0 || // Ignore loopback interfaces + i.Flags&net.FlagPointToPoint != 0 || // ignore VPN interfaces + len(i.HardwareAddr) != HardwareAddrLength || // ignore interfaces without a MAC address + strings.HasPrefix(i.HardwareAddr.String(), DockerMACPrefix) || // ignore docker virtual interfaces + strings.HasPrefix(i.HardwareAddr.String(), VirtualBoxMACPrefix) { // ignore VirtualBox virtual interfaces + privateIPs = append(privateIPs, ips...) + } else { + for _, i := range ips { + if i.IsLinkLocalUnicast() { + // Do not add link-local IPs: 169.254.0.0/16 or fe80::/10 + continue + } + publicIPs = append(publicIPs, i) + } + } + } + + return publicIPs, privateIPs, nil +} + +// getTCPProcInfo return the connections grouped by pid and a map of listening ports. +// Both results are for IPv4 and IPv6 +func getTCPProcInfo() (connectionsByPid map[uint32][]ConnInfo, listeners map[uint32]interface{}, err error) { + req := linux.NewInetDiagReq() + var diagWriter io.Writer + msgs, err := linux.NetlinkInetDiagWithBuf(req, nil, diagWriter) + if err != nil { + return nil, nil, fmt.Errorf("calling netlink to get sockets: %v", err) + } + + listeners = map[uint32]interface{}{} + connectionsByPid = map[uint32][]ConnInfo{} + + inodeToPid := mapInodesToPid() + + for _, diag := range msgs { + inodeInfo := inodeToPid[diag.Inode] + + for _, proc := range inodeInfo { + if linux.TCPState(diag.State) == linux.TCP_LISTEN { + listeners[uint32(diag.SrcPort())] = nil + } + + connectionsByPid[proc.pid] = append(connectionsByPid[proc.pid], ConnInfo{ + state: linux.TCPState(diag.State), + srcIP: diag.SrcIP(), + srcPort: uint32(diag.SrcPort()), + dstIP: diag.DstIP(), + dstPort: uint32(diag.DstPort()), + }) + } + } + + return connectionsByPid, listeners, nil +} + +// mapInodesToPid return a map with the procs associated to each inode. +func mapInodesToPid() (ret map[uint32][]InodeInfo) { + ret = map[uint32][]InodeInfo{} + + fd, err := os.Open("/proc") + if err != nil { + fmt.Printf("Error opening /proc: %v", err) + } + defer fd.Close() + + dirContents, err := fd.Readdirnames(0) + if err != nil { + fmt.Printf("Error reading files in /proc: %v", err) + } + + for _, pidStr := range dirContents { + pid, err := strconv.ParseUint(pidStr, 10, 32) + if err != nil { + // exclude files with a not numeric name. We only want to access pid directories + continue + } + + pidDir, err := os.Open("/proc/" + pidStr + "/fd/") + if err != nil { + // ignore errors: + // - missing directory, pid has already finished + // - permission denied + continue + } + + fds, err := pidDir.Readdirnames(0) + if err != nil { + continue + } + + for _, fd := range fds { + link, err := os.Readlink("/proc/" + pidStr + "/fd/" + fd) + if err != nil { + continue + } + + var inode uint32 + + _, err = fmt.Sscanf(link, "socket:[%d]", &inode) + if err != nil { + // this inode is not a socket + continue + } + + ret[inode] = append(ret[inode], InodeInfo{ + pid: uint32(pid), + }) + } + } + + return ret +} diff --git a/plugins/inputs/procstat/process.go b/plugins/inputs/procstat/process.go index 042929f0864cf..11a64daa13977 100644 --- a/plugins/inputs/procstat/process.go +++ b/plugins/inputs/procstat/process.go @@ -10,6 +10,7 @@ import ( type Process interface { PID() PID + Ppid() (PID, error) Tags() map[string]string PageFaults() (*process.PageFaultsStat, error) @@ -59,6 +60,15 @@ func (p *Proc) Tags() map[string]string { return p.tags } +func (p *Proc) Ppid() (PID, error) { + pid, err := p.Process.Ppid() + if err != nil { + return 0, err + } + + return PID(pid), nil +} + func (p *Proc) PID() PID { return PID(p.Process.Pid) } diff --git a/plugins/inputs/procstat/procstat.go b/plugins/inputs/procstat/procstat.go index 35f60342270dd..d020b54befe4e 100644 --- a/plugins/inputs/procstat/procstat.go +++ b/plugins/inputs/procstat/procstat.go @@ -2,8 +2,10 @@ package procstat import ( "bytes" + "errors" "fmt" "io/ioutil" + "net" "os/exec" "path/filepath" "runtime" @@ -21,26 +23,39 @@ var ( defaultProcess = NewProc ) +const ( + // MetricsConnectionsStats tag to enable connections_stats metrics + MetricsConnectionsStats = "connections_stats" + // MetricsConnectionsEndpoints tag to enable connections_endpoints metrics + MetricsConnectionsEndpoints = "connections_endpoints" + // MetricNameTCPConnections is the measurement name for TCP connections metrics + MetricNameTCPConnections = "procstat_tcp" + // TCPConnectionKey is the metric value to put all the listen endpoints + TCPConnectionKey = "conn" + // TCPListenKey is the metric value to put all the connection endpoints + TCPListenKey = "listen" +) + type PID int32 type Procstat struct { - PidFinder string `toml:"pid_finder"` - PidFile string `toml:"pid_file"` - Exe string - Pattern string - Prefix string - CmdLineTag bool `toml:"cmdline_tag"` - ProcessName string - User string - SystemdUnit string - CGroup string `toml:"cgroup"` - PidTag bool - WinService string `toml:"win_service"` - Mode string + PidFinder string `toml:"pid_finder"` + PidFile string `toml:"pid_file"` + Exe string + Pattern string + Prefix string + CmdLineTag bool `toml:"cmdline_tag"` + ProcessName string + User string + SystemdUnit string + CGroup string `toml:"cgroup"` + PidTag bool + WinService string `toml:"win_service"` + Mode string + MetricsInclude []string `toml:"metrics_include"` solarisMode bool - - finder PIDFinder + finder PIDFinder createPIDFinder func() (PIDFinder, error) procs map[PID]Process @@ -90,6 +105,12 @@ var sampleConfig = ` ## the native finder performs the search directly in a manor dependent on the ## platform. Default is 'pgrep' # pid_finder = "pgrep" + + ## Select wich extra metrics should be added: + ## - "connections_stats": tcp_* and upd_socket metrics + ## - "connections_endpoints": new metric procstat_tcp with connections and listeners endpoints + ## Default is empty list. + # metrics_include = ["connections_stats", "connections_endpoints"] ` func (_ *Procstat) SampleConfig() string { @@ -141,8 +162,16 @@ func (p *Procstat) Gather(acc telegraf.Accumulator) error { } p.procs = procs + // Initialize the conn object. Gather info about all TCP connections organized per PID + // Avoid repeating this task for each proc + netInfo := NetworkInfo{} + err = netInfo.Fetch() + if err != nil { + acc.AddError(fmt.Errorf("E! [inputs.procstat] getting TCP network info: %v", err)) + } + for _, proc := range p.procs { - p.addMetric(proc, acc, now) + p.addMetric(proc, acc, now, netInfo) } fields := map[string]interface{}{ @@ -158,7 +187,7 @@ func (p *Procstat) Gather(acc telegraf.Accumulator) error { } // Add metrics a single Process -func (p *Procstat) addMetric(proc Process, acc telegraf.Accumulator, t time.Time) { +func (p *Procstat) addMetric(proc Process, acc telegraf.Accumulator, t time.Time, netInfo NetworkInfo) { var prefix string if p.Prefix != "" { prefix = p.Prefix + "_" @@ -311,7 +340,40 @@ func (p *Procstat) addMetric(proc Process, acc telegraf.Accumulator, t time.Time } } + if p.metricEnabled(MetricsConnectionsStats) { + // Add values with the number of connections in each TCP state + pidConnections, err := netInfo.GetConnectionsByPid(uint32(proc.PID())) + if err == nil { + addConnectionStats(pidConnections, fields, prefix) + } else { + // Ignore errors because pid was not found. It is normal to have procs without connections + if !errors.Is(err, ErrorPIDNotFound) { + acc.AddError(fmt.Errorf("D! [inputs.procstat] not able to get connections for pid=%v: %v", proc.PID(), err)) + } + } + } + acc.AddFields("procstat", fields, proc.Tags(), t) + + if p.metricEnabled(MetricsConnectionsEndpoints) { + // add measurement procstat_tcp with tcp listeners and connections for each proccess + err = addConnectionEnpoints(acc, proc, netInfo) + if err != nil { + acc.AddError(fmt.Errorf("D! [inputs.procstat] not able to generate network metrics for pid=%v: %v", proc.PID(), err)) + } + } +} + +// appendIPs extract and return IPs from addresses +func extractIPs(addreses []net.Addr) (ret []net.IP, err error) { + for _, a := range addreses { + ip, _, err := net.ParseCIDR(a.String()) + if err != nil { + return nil, fmt.Errorf("parsing interface address: %v", err) + } + ret = append(ret, ip) + } + return ret, nil } // Update monitored Processes @@ -483,6 +545,41 @@ func (p *Procstat) Init() error { return nil } +func containsIP(a []net.IP, x net.IP) bool { + for _, n := range a { + if x.Equal(n) { + return true + } + } + return false +} + +func isIPV4(ip net.IP) bool { + return ip.To4() != nil +} + +func isIPV6(ip net.IP) bool { + return ip.To4() == nil +} + +// endpointString return the correct representation of ip and port for IPv4 and IPv6 +func endpointString(ip net.IP, port uint32) string { + if isIPV6(ip) { + return fmt.Sprintf("[%s]:%d", ip, port) + } + return fmt.Sprintf("%s:%d", ip, port) +} + +// metricEnabled check is some group of metrics are enabled in the config file +func (p *Procstat) metricEnabled(m string) bool { + for _, n := range p.MetricsInclude { + if m == n { + return true + } + } + return false +} + func init() { inputs.Add("procstat", func() telegraf.Input { return &Procstat{} diff --git a/plugins/inputs/procstat/procstat_fallback.go b/plugins/inputs/procstat/procstat_fallback.go new file mode 100644 index 0000000000000..8f6134d60e53f --- /dev/null +++ b/plugins/inputs/procstat/procstat_fallback.go @@ -0,0 +1,16 @@ +// +build !linux + +package procstat + +import ( + "fmt" + + "github.com/influxdata/telegraf" +) + +func addConnectionStats(pidConnections []ConnInfo, fields map[string]interface{}, prefix string) { +} + +func addConnectionEnpoints(acc telegraf.Accumulator, proc Process, netInfo NetworkInfo) error { + return fmt.Errorf("platform not supported") +} diff --git a/plugins/inputs/procstat/procstat_linux.go b/plugins/inputs/procstat/procstat_linux.go new file mode 100644 index 0000000000000..0e96c271555d7 --- /dev/null +++ b/plugins/inputs/procstat/procstat_linux.go @@ -0,0 +1,109 @@ +// +build linux + +package procstat + +import ( + "errors" + "fmt" + "sort" + "strings" + + "github.com/elastic/gosigar/sys/linux" + "github.com/influxdata/telegraf" +) + +// addConnectionStats count the number of connections in each TCP state and add those values to the metric +func addConnectionStats(pidConnections []ConnInfo, fields map[string]interface{}, prefix string) { + counts := make(map[linux.TCPState]int) + for _, netcon := range pidConnections { + counts[netcon.state]++ + } + + fields[prefix+"tcp_established"] = counts[linux.TCP_ESTABLISHED] + fields[prefix+"tcp_syn_sent"] = counts[linux.TCP_SYN_SENT] + fields[prefix+"tcp_syn_recv"] = counts[linux.TCP_SYN_RECV] + fields[prefix+"tcp_fin_wait1"] = counts[linux.TCP_FIN_WAIT1] + fields[prefix+"tcp_fin_wait2"] = counts[linux.TCP_FIN_WAIT2] + //fields[prefix+"tcp_time_wait"] = counts[linux.TCP_TIME_WAIT] // TIME-WAIT connections does not have a pid associated + fields[prefix+"tcp_close"] = counts[linux.TCP_CLOSE] + fields[prefix+"tcp_close_wait"] = counts[linux.TCP_CLOSE_WAIT] + fields[prefix+"tcp_last_ack"] = counts[linux.TCP_LAST_ACK] + fields[prefix+"tcp_listen"] = counts[linux.TCP_LISTEN] + fields[prefix+"tcp_closing"] = counts[linux.TCP_CLOSING] +} + +// addConnectionEnpoints add listen and connection endpoints to the procstat_tcp metric. +// If listen is 0.0.0.0 or ::, it will be added one value for each of the IP addresses of the host. +// Listeners in private IPs are ignored (maybe a flag could be added, but now the reasoning is matching connections between hosts). +// Connections made to this server are ignored (the local port is one of the listening ports). +func addConnectionEnpoints(acc telegraf.Accumulator, proc Process, netInfo NetworkInfo) error { + TCPListen := map[string]interface{}{} + TCPConn := map[string]interface{}{} + + pidConnections, err := netInfo.GetConnectionsByPid(uint32(proc.PID())) + if err != nil { + if errors.Is(err, ErrorPIDNotFound) { + return nil + } + + return fmt.Errorf("W! [inputs.procstat] not able to get connections for pid=%v: %v", proc.PID(), err) + } + + // In case of error, ppid=0 and will be ignored in IsPidListeningInPort + ppid, _ := proc.Ppid() + + for _, c := range pidConnections { + // Ignore listeners or connections in/to localhost or private IPs + if c.srcIP.IsLoopback() || containsIP(netInfo.GetPrivateIPs(), c.srcIP) { + continue + } + + if c.state == linux.TCP_LISTEN { + if netInfo.IsPidListeningInAddr(uint32(ppid), c.srcIP, c.srcPort) { + continue + } + + if c.srcIP.IsUnspecified() { + // 0.0.0.0 listen in all IPv4 addresses + // :: listen in all IPv4 + IPv6 addresses + for _, ip := range netInfo.GetPublicIPs() { + if isIPV4(ip) || isIPV6(c.srcIP) { + TCPListen[endpointString(ip, c.srcPort)] = nil + } + } + } else { + TCPListen[endpointString(c.srcIP, c.srcPort)] = nil + } + } else if c.state != linux.TCP_SYN_SENT { // All TCP states except LISTEN (already processed) and SYN_SENT imply a connection between the hosts + // Ignore connections from outside hosts to listeners in this host (status != LISTEN and localPort in listenPorts) + if !netInfo.IsAListenPort(c.srcPort) { + TCPConn[endpointString(c.dstIP, c.dstPort)] = nil + } + } + } + + // Only add metrics if we have data + if len(TCPConn) > 0 || len(TCPListen) > 0 { + tcpConnections := []string{} + tcpListeners := []string{} + + for k := range TCPConn { + tcpConnections = append(tcpConnections, k) + } + sort.Strings(tcpConnections) // sort to make testing simplier + + for k := range TCPListen { + tcpListeners = append(tcpListeners, k) + } + sort.Strings(tcpListeners) + + fields := map[string]interface{}{ + TCPConnectionKey: strings.Join(tcpConnections, ","), + TCPListenKey: strings.Join(tcpListeners, ","), + } + + acc.AddFields(MetricNameTCPConnections, fields, proc.Tags()) + } + + return nil +} diff --git a/plugins/inputs/procstat/procstat_linux_test.go b/plugins/inputs/procstat/procstat_linux_test.go new file mode 100644 index 0000000000000..ff5ad6bb63698 --- /dev/null +++ b/plugins/inputs/procstat/procstat_linux_test.go @@ -0,0 +1,625 @@ +package procstat + +import ( + "net" + "testing" + "time" + + "github.com/elastic/gosigar/sys/linux" + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" +) + +func TestAddConnectionEndpoints(t *testing.T) { + tests := []struct { + name string + pid PID + ppid PID + listenPorts map[uint32]interface{} + tcp map[uint32][]ConnInfo + publicIPs []net.IP + privateIPs []net.IP + metrics []telegraf.Metric + err string + }{ + { + name: "no connections, no metrics", + }, + { + name: "outside connection", + pid: 100, + listenPorts: map[uint32]interface{}{}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 34567, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 80, + state: linux.TCP_ESTABLISHED, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPConnectionKey: "1.1.1.1:80", + }, + time.Now(), + ), + }, + }, + { + name: "TCP states except SYN_SENT are used for connections", + pid: 100, + listenPorts: map[uint32]interface{}{}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10000, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 80, + state: linux.TCP_ESTABLISHED, + }, + { // this is ignore, is a host trying to connect but the other end has not replied + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10001, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 81, + state: linux.TCP_SYN_SENT, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10002, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 82, + state: linux.TCP_SYN_RECV, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10003, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 83, + state: linux.TCP_FIN_WAIT1, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10004, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 84, + state: linux.TCP_FIN_WAIT2, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10005, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 85, + state: linux.TCP_TIME_WAIT, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10006, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 86, + state: linux.TCP_CLOSE, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10007, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 87, + state: linux.TCP_CLOSE_WAIT, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10008, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 88, + state: linux.TCP_LAST_ACK, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 10009, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 89, + state: linux.TCP_CLOSING, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPConnectionKey: "1.1.1.1:80,1.1.1.1:82,1.1.1.1:83,1.1.1.1:84,1.1.1.1:85,1.1.1.1:86,1.1.1.1:87,1.1.1.1:88,1.1.1.1:89", + }, + time.Now(), + ), + }, + }, + { + name: "IPv4 listener", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{net.ParseIP("192.168.0.2")}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "192.168.0.2:80", + }, + time.Now(), + ), + }, + }, + { + name: "process listening in a IP not present in the local IPs will generate metric anyway", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "192.168.0.2:80", + }, + time.Now(), + ), + }, + }, + { + name: "process listening in a port not present in the listeners list will generate metric anyway", + pid: 100, + listenPorts: map[uint32]interface{}{}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "192.168.0.2:80", + }, + time.Now(), + ), + }, + }, + { + name: "IPv6 listener", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("dead::beef"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "[dead::beef]:80", + }, + time.Now(), + ), + }, + }, + { + name: "private IPv4 listener do not generate metrics", + pid: 100, + listenPorts: map[uint32]interface{}{ + 80: nil, + }, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{net.ParseIP("192.168.0.2")}, + metrics: []telegraf.Metric{}, + }, + { + name: "private IPv6 listener do not generate metrics", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("dead::beef"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{net.ParseIP("dead::beef")}, + metrics: []telegraf.Metric{}, + }, + { + name: "0.0.0.0 listener listen in all public IPv4s", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("0.0.0.0"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{net.ParseIP("192.168.0.2"), net.ParseIP("10.10.0.2"), net.ParseIP("dead::beef")}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "10.10.0.2:80,192.168.0.2:80", + }, + time.Now(), + ), + }, + }, + { + name: ":: listener listen in all public IPv4 and IPv6s", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("::"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{net.ParseIP("192.168.0.2"), net.ParseIP("10.10.0.2"), net.ParseIP("dead::beef")}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "10.10.0.2:80,192.168.0.2:80,[dead::beef]:80", + }, + time.Now(), + ), + }, + }, + { + name: "ignore listeners in loopback IPs", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("127.0.0.1"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{}, + }, + { + name: "ignore connections from external hosts to local listeners", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("127.0.0.1"), + srcPort: 80, + dstIP: net.ParseIP("54.89.89.54"), + dstPort: 30123, + state: linux.TCP_ESTABLISHED, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{}, + }, + { + name: "ignore connections from internal procs to other internal procs using the public IPs", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 30000, + dstIP: net.ParseIP("192.168.0.2"), + dstPort: 80, + state: linux.TCP_ESTABLISHED, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{net.ParseIP("192.168.0.2")}, + metrics: []telegraf.Metric{}, + }, + { // We are testing how behaves addConnectionEnpoints it if received a "pid not found" kind of error + name: "proc without network info does not generates an error, nor metrics", + pid: 100, + listenPorts: map[uint32]interface{}{}, + tcp: map[uint32][]ConnInfo{}, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{}, + }, + { + name: "process listening in two differents ports using :: with differents public IPs", + }, + { // same schema valid for: apache httpd, php-fpm + name: "service with a parent process and several child, only the parent should report the listeners, parent case (nginx style)", + pid: 101, // parent + listenPorts: map[uint32]interface{}{ + 80: nil, + }, + tcp: map[uint32][]ConnInfo{ + 100: { // parent + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + 101: { // child + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "192.168.0.2:80", + }, + time.Now(), + ), + }, + }, + { + name: "service with a parent process and several child, only the parent should report the listeners, child case (nginx style)", + pid: 101, // child + ppid: 100, + listenPorts: map[uint32]interface{}{ + 80: nil, + }, + tcp: map[uint32][]ConnInfo{ + 100: { // parent + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + 101: { // child + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{}, + }, + { + name: "child process listening in parent process plus other port, generate metric with the extra listener", + pid: 101, // child + ppid: 100, + listenPorts: map[uint32]interface{}{ + 80: nil, + 443: nil, + }, + tcp: map[uint32][]ConnInfo{ + 100: { // parent + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + 101: { // child + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 443, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "192.168.0.2:443", + }, + time.Now(), + ), + }, + }, + { + name: "process listening in 0.0.0.0 and also in some IPv4 address, avoid duplication", + pid: 100, + listenPorts: map[uint32]interface{}{80: nil}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("0.0.0.0"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + { + srcIP: net.ParseIP("172.17.0.1"), + srcPort: 80, + state: linux.TCP_LISTEN, + }, + }, + }, + publicIPs: []net.IP{net.ParseIP("192.168.0.2"), net.ParseIP("10.10.0.2"), net.ParseIP("dead::beef")}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPListenKey: "10.10.0.2:80,172.17.0.1:80,192.168.0.2:80", + }, + time.Now(), + ), + }, + }, + { + name: "avoid duplication in outboun connections", + pid: 100, + listenPorts: map[uint32]interface{}{}, + tcp: map[uint32][]ConnInfo{ + 100: { + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 34567, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 80, + state: linux.TCP_ESTABLISHED, + }, + { + srcIP: net.ParseIP("192.168.0.2"), + srcPort: 34568, + dstIP: net.ParseIP("1.1.1.1"), + dstPort: 80, + state: linux.TCP_ESTABLISHED, + }, + }, + }, + publicIPs: []net.IP{}, + privateIPs: []net.IP{}, + metrics: []telegraf.Metric{ + testutil.MustMetric( + MetricNameTCPConnections, + map[string]string{}, + map[string]interface{}{ + TCPConnectionKey: "1.1.1.1:80", + }, + time.Now(), + ), + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + var acc testutil.Accumulator + + proc := &testProc{ + pid: test.pid, + ppid: test.ppid, + } + + netInfo := NetworkInfo{ + tcp: test.tcp, + listenPorts: test.listenPorts, + publicIPs: test.publicIPs, + privateIPs: test.privateIPs, + } + + err := addConnectionEnpoints(&acc, proc, netInfo) + if err != nil { + assert.EqualError(t, err, test.err) + //assert.FailNowf(t, "error calling addConnectionEnpoints", err.Error()) + } + + // Function has generated the same number of metrics defined in the test + assert.Len(t, acc.GetTelegrafMetrics(), len(test.metrics)) + + for _, m := range test.metrics { + for _, value := range m.FieldList() { + assert.Truef( + t, + acc.HasPoint(m.Name(), m.Tags(), value.Key, value.Value), + "Missing point: %s,%v %s=%s\nMetrics: %v", + m.Name(), + m.Tags(), + value.Key, + value.Value, + acc.GetTelegrafMetrics(), + ) + } + } + }) + } +} diff --git a/plugins/inputs/procstat/procstat_test.go b/plugins/inputs/procstat/procstat_test.go index 9836feaec8b89..31f7291ebd519 100644 --- a/plugins/inputs/procstat/procstat_test.go +++ b/plugins/inputs/procstat/procstat_test.go @@ -84,6 +84,7 @@ func (pg *testPgrep) Pattern(pattern string) ([]PID, error) { return pg.pids, pg.err } +// nolint func (pg *testPgrep) Uid(user string) ([]PID, error) { return pg.pids, pg.err } @@ -94,6 +95,7 @@ func (pg *testPgrep) FullPattern(pattern string) ([]PID, error) { type testProc struct { pid PID + ppid PID tags map[string]string } @@ -104,6 +106,10 @@ func newTestProc(pid PID) (Process, error) { return proc, nil } +func (p *testProc) Ppid() (PID, error) { + return p.ppid, nil +} + func (p *testProc) PID() PID { return p.pid } From 15144a0266d1c5f64b5548493cb26bf0dbd791b7 Mon Sep 17 00:00:00 2001 From: Adrian Lopez Date: Thu, 28 Jan 2021 15:14:44 +0100 Subject: [PATCH 2/4] procstat: option to select which metrics collect If we want to collect only the metric procstat_lookup or procstat_tcp is a big overhead having to parse lots of files in /proc to get number of fds, CPU time, memory usage, etc. Added a new configuration option to be able to select which metrics to collect. If the configuration parameter, 'metrics_include', is undefined, the plugin behaves like before adding this parameter. The difference (measured with ~1100 procs) between gathering the procstat_tcp metrics alone or with the default collection of metrics is around one tenth (~50ms alone, ~500ms with default collection). --- plugins/inputs/procstat/README.md | 31 ++- plugins/inputs/procstat/procstat.go | 289 +++++++++++++++-------- plugins/inputs/procstat/procstat_test.go | 34 +++ 3 files changed, 248 insertions(+), 106 deletions(-) diff --git a/plugins/inputs/procstat/README.md b/plugins/inputs/procstat/README.md index ebc8fc568fd67..3026055d93a6e 100644 --- a/plugins/inputs/procstat/README.md +++ b/plugins/inputs/procstat/README.md @@ -62,10 +62,33 @@ Processes can be selected for monitoring using one of several methods: # pid_finder = "pgrep" ## Select wich extra metrics should be added: - ## - "connections_stats": tcp_* and upd_socket metrics - ## - "connections_endpoints": new metric procstat_tcp with connections and listeners endpoints - ## Default is empty list. - # metrics_include = ["connections_stats", "connections_endpoints"] + ## - "threads": to enable collection of number of file descriptors + ## - "fds": to enable collection of context switches + ## - "ctx_switches": to enable collection of page faults + ## - "page_faults": to enable collection of IO + ## - "io": to enable collection of proc creation time + ## - "create_time": to enable collection of CPU time used + ## - "cpu": to enable collection of percentage of CPU used + ## - "cpu_percent": to enable collection of memory used + ## - "mem": to enable collection of memory percentage used + ## - "mem_percent": to enable collection of procs' limits + ## - "limits": to enable collection of procs' limits + ## - "tcp_stats": tcp_* and upd_socket metrics + ## - "connections_endpoints": new metric procstat_tcp with connections and listeners endpoints + ## Default value: + # metrics_include = [ + # "threads", + # "fds", + # "ctx_switches", + # "page_faults", + # "io", + # "create_time", + # "cpu", + # "cpu_percent", + # "mem", + # "mem_percent", + # "limits", + # ] ``` #### Windows support diff --git a/plugins/inputs/procstat/procstat.go b/plugins/inputs/procstat/procstat.go index d020b54befe4e..126b732dcc762 100644 --- a/plugins/inputs/procstat/procstat.go +++ b/plugins/inputs/procstat/procstat.go @@ -21,13 +21,50 @@ import ( var ( defaultPIDFinder = NewPgrep defaultProcess = NewProc + // defaultCollection is the default group of metrics to gather + defaultCollection = []string{ + MetricsThreads, + MetricsFDs, + MetricsContextSwitches, + MetricsPageFaults, + MetricsIO, + MetricsCreateTime, + MetricsCPU, + MetricsCPUPercent, + MetricsMemory, + MetricsMemoryPercent, + MetricsLimits, + } ) const ( - // MetricsConnectionsStats tag to enable connections_stats metrics - MetricsConnectionsStats = "connections_stats" - // MetricsConnectionsEndpoints tag to enable connections_endpoints metrics + // MetricsThreads to enable collection of number of threads + MetricsThreads = "threads" + // MetricsFDs to enable collection of number of file descriptors + MetricsFDs = "fds" + // MetricsContextSwitches to enable collection of context switches + MetricsContextSwitches = "ctx_switches" + // MetricsPageFaults to enable collection of page faults + MetricsPageFaults = "page_faults" + // MetricsIO to enable collection of IO + MetricsIO = "io" + // MetricsCreateTime to enable collection of proc creation time + MetricsCreateTime = "create_time" + // MetricsCPU to enable collection of CPU time used + MetricsCPU = "cpu" + // MetricsCPUPercent to enable collection of percentage of CPU used + MetricsCPUPercent = "cpu_percent" + // MetricsMemory to enable collection of memory used + MetricsMemory = "mem" + // MetricsMemoryPercent to enable collection of memory percentage used + MetricsMemoryPercent = "mem_percent" + // MetricsLimits to enable collection of procs' limits + MetricsLimits = "limits" + // MetricsTCPStats to enable collection of procs' TCP stats + MetricsTCPStats = "tcp_stats" + // MetricsConnectionsEndpoints to enable collection of metric procstat_tcp MetricsConnectionsEndpoints = "connections_endpoints" + // MetricNameTCPConnections is the measurement name for TCP connections metrics MetricNameTCPConnections = "procstat_tcp" // TCPConnectionKey is the metric value to put all the listen endpoints @@ -107,10 +144,33 @@ var sampleConfig = ` # pid_finder = "pgrep" ## Select wich extra metrics should be added: - ## - "connections_stats": tcp_* and upd_socket metrics - ## - "connections_endpoints": new metric procstat_tcp with connections and listeners endpoints - ## Default is empty list. - # metrics_include = ["connections_stats", "connections_endpoints"] + ## - "threads": to enable collection of number of file descriptors + ## - "fds": to enable collection of context switches + ## - "ctx_switches": to enable collection of page faults + ## - "page_faults": to enable collection of IO + ## - "io": to enable collection of proc creation time + ## - "create_time": to enable collection of CPU time used + ## - "cpu": to enable collection of percentage of CPU used + ## - "cpu_percent": to enable collection of memory used + ## - "mem": to enable collection of memory percentage used + ## - "mem_percent": to enable collection of procs' limits + ## - "limits": to enable collection of procs' limits + ## - "tcp_stats": tcp_* and upd_socket metrics + ## - "connections_endpoints": new metric procstat_tcp with connections and listeners endpoints + ## Default value: + # metrics_include = [ + # "threads", + # "fds", + # "ctx_switches", + # "page_faults", + # "io", + # "create_time", + # "cpu", + # "cpu_percent", + # "mem", + # "mem_percent", + # "limits", + # ] ` func (_ *Procstat) SampleConfig() string { @@ -226,121 +286,143 @@ func (p *Procstat) addMetric(proc Process, acc telegraf.Accumulator, t time.Time } } - numThreads, err := proc.NumThreads() - if err == nil { - fields[prefix+"num_threads"] = numThreads + if p.metricEnabled(MetricsThreads) { + numThreads, err := proc.NumThreads() + if err == nil { + fields[prefix+"num_threads"] = numThreads + } } - fds, err := proc.NumFDs() - if err == nil { - fields[prefix+"num_fds"] = fds + if p.metricEnabled(MetricsFDs) { + fds, err := proc.NumFDs() + if err == nil { + fields[prefix+"num_fds"] = fds + } } - ctx, err := proc.NumCtxSwitches() - if err == nil { - fields[prefix+"voluntary_context_switches"] = ctx.Voluntary - fields[prefix+"involuntary_context_switches"] = ctx.Involuntary + if p.metricEnabled(MetricsContextSwitches) { + ctx, err := proc.NumCtxSwitches() + if err == nil { + fields[prefix+"voluntary_context_switches"] = ctx.Voluntary + fields[prefix+"involuntary_context_switches"] = ctx.Involuntary + } } - faults, err := proc.PageFaults() - if err == nil { - fields[prefix+"minor_faults"] = faults.MinorFaults - fields[prefix+"major_faults"] = faults.MajorFaults - fields[prefix+"child_minor_faults"] = faults.ChildMinorFaults - fields[prefix+"child_major_faults"] = faults.ChildMajorFaults + if p.metricEnabled(MetricsPageFaults) { + faults, err := proc.PageFaults() + if err == nil { + fields[prefix+"minor_faults"] = faults.MinorFaults + fields[prefix+"major_faults"] = faults.MajorFaults + fields[prefix+"child_minor_faults"] = faults.ChildMinorFaults + fields[prefix+"child_major_faults"] = faults.ChildMajorFaults + } } - io, err := proc.IOCounters() - if err == nil { - fields[prefix+"read_count"] = io.ReadCount - fields[prefix+"write_count"] = io.WriteCount - fields[prefix+"read_bytes"] = io.ReadBytes - fields[prefix+"write_bytes"] = io.WriteBytes + if p.metricEnabled(MetricsIO) { + io, err := proc.IOCounters() + if err == nil { + fields[prefix+"read_count"] = io.ReadCount + fields[prefix+"write_count"] = io.WriteCount + fields[prefix+"read_bytes"] = io.ReadBytes + fields[prefix+"write_bytes"] = io.WriteBytes + } + } + + if p.metricEnabled(MetricsCreateTime) { + createdAt, err := proc.CreateTime() //Returns epoch in ms + if err == nil { + fields[prefix+"created_at"] = createdAt * 1000000 //Convert ms to ns + } } - createdAt, err := proc.CreateTime() //Returns epoch in ms - if err == nil { - fields[prefix+"created_at"] = createdAt * 1000000 //Convert ms to ns + if p.metricEnabled(MetricsCPU) { + cpu_time, err := proc.Times() + if err == nil { + fields[prefix+"cpu_time_user"] = cpu_time.User + fields[prefix+"cpu_time_system"] = cpu_time.System + fields[prefix+"cpu_time_idle"] = cpu_time.Idle + fields[prefix+"cpu_time_nice"] = cpu_time.Nice + fields[prefix+"cpu_time_iowait"] = cpu_time.Iowait + fields[prefix+"cpu_time_irq"] = cpu_time.Irq + fields[prefix+"cpu_time_soft_irq"] = cpu_time.Softirq + fields[prefix+"cpu_time_steal"] = cpu_time.Steal + fields[prefix+"cpu_time_guest"] = cpu_time.Guest + fields[prefix+"cpu_time_guest_nice"] = cpu_time.GuestNice + } } - cpu_time, err := proc.Times() - if err == nil { - fields[prefix+"cpu_time_user"] = cpu_time.User - fields[prefix+"cpu_time_system"] = cpu_time.System - fields[prefix+"cpu_time_idle"] = cpu_time.Idle - fields[prefix+"cpu_time_nice"] = cpu_time.Nice - fields[prefix+"cpu_time_iowait"] = cpu_time.Iowait - fields[prefix+"cpu_time_irq"] = cpu_time.Irq - fields[prefix+"cpu_time_soft_irq"] = cpu_time.Softirq - fields[prefix+"cpu_time_steal"] = cpu_time.Steal - fields[prefix+"cpu_time_guest"] = cpu_time.Guest - fields[prefix+"cpu_time_guest_nice"] = cpu_time.GuestNice + if p.metricEnabled(MetricsCPUPercent) { + cpu_perc, err := proc.Percent(time.Duration(0)) + if err == nil { + if p.solarisMode { + fields[prefix+"cpu_usage"] = cpu_perc / float64(runtime.NumCPU()) + } else { + fields[prefix+"cpu_usage"] = cpu_perc + } + } } - cpu_perc, err := proc.Percent(time.Duration(0)) - if err == nil { - if p.solarisMode { - fields[prefix+"cpu_usage"] = cpu_perc / float64(runtime.NumCPU()) - } else { - fields[prefix+"cpu_usage"] = cpu_perc + if p.metricEnabled(MetricsMemory) { + mem, err := proc.MemoryInfo() + if err == nil { + fields[prefix+"memory_rss"] = mem.RSS + fields[prefix+"memory_vms"] = mem.VMS + fields[prefix+"memory_swap"] = mem.Swap + fields[prefix+"memory_data"] = mem.Data + fields[prefix+"memory_stack"] = mem.Stack + fields[prefix+"memory_locked"] = mem.Locked } } - mem, err := proc.MemoryInfo() - if err == nil { - fields[prefix+"memory_rss"] = mem.RSS - fields[prefix+"memory_vms"] = mem.VMS - fields[prefix+"memory_swap"] = mem.Swap - fields[prefix+"memory_data"] = mem.Data - fields[prefix+"memory_stack"] = mem.Stack - fields[prefix+"memory_locked"] = mem.Locked - } - - mem_perc, err := proc.MemoryPercent() - if err == nil { - fields[prefix+"memory_usage"] = mem_perc - } - - rlims, err := proc.RlimitUsage(true) - if err == nil { - for _, rlim := range rlims { - var name string - switch rlim.Resource { - case process.RLIMIT_CPU: - name = "cpu_time" - case process.RLIMIT_DATA: - name = "memory_data" - case process.RLIMIT_STACK: - name = "memory_stack" - case process.RLIMIT_RSS: - name = "memory_rss" - case process.RLIMIT_NOFILE: - name = "num_fds" - case process.RLIMIT_MEMLOCK: - name = "memory_locked" - case process.RLIMIT_AS: - name = "memory_vms" - case process.RLIMIT_LOCKS: - name = "file_locks" - case process.RLIMIT_SIGPENDING: - name = "signals_pending" - case process.RLIMIT_NICE: - name = "nice_priority" - case process.RLIMIT_RTPRIO: - name = "realtime_priority" - default: - continue - } + if p.metricEnabled(MetricsMemoryPercent) { + mem_perc, err := proc.MemoryPercent() + if err == nil { + fields[prefix+"memory_usage"] = mem_perc + } + } - fields[prefix+"rlimit_"+name+"_soft"] = rlim.Soft - fields[prefix+"rlimit_"+name+"_hard"] = rlim.Hard - if name != "file_locks" { // gopsutil doesn't currently track the used file locks count - fields[prefix+name] = rlim.Used + if p.metricEnabled(MetricsLimits) { + rlims, err := proc.RlimitUsage(true) + if err == nil { + for _, rlim := range rlims { + var name string + switch rlim.Resource { + case process.RLIMIT_CPU: + name = "cpu_time" + case process.RLIMIT_DATA: + name = "memory_data" + case process.RLIMIT_STACK: + name = "memory_stack" + case process.RLIMIT_RSS: + name = "memory_rss" + case process.RLIMIT_NOFILE: + name = "num_fds" + case process.RLIMIT_MEMLOCK: + name = "memory_locked" + case process.RLIMIT_AS: + name = "memory_vms" + case process.RLIMIT_LOCKS: + name = "file_locks" + case process.RLIMIT_SIGPENDING: + name = "signals_pending" + case process.RLIMIT_NICE: + name = "nice_priority" + case process.RLIMIT_RTPRIO: + name = "realtime_priority" + default: + continue + } + + fields[prefix+"rlimit_"+name+"_soft"] = rlim.Soft + fields[prefix+"rlimit_"+name+"_hard"] = rlim.Hard + if name != "file_locks" { // gopsutil doesn't currently track the used file locks count + fields[prefix+name] = rlim.Used + } } } } - if p.metricEnabled(MetricsConnectionsStats) { + if p.metricEnabled(MetricsTCPStats) { // Add values with the number of connections in each TCP state pidConnections, err := netInfo.GetConnectionsByPid(uint32(proc.PID())) if err == nil { @@ -357,7 +439,7 @@ func (p *Procstat) addMetric(proc Process, acc telegraf.Accumulator, t time.Time if p.metricEnabled(MetricsConnectionsEndpoints) { // add measurement procstat_tcp with tcp listeners and connections for each proccess - err = addConnectionEnpoints(acc, proc, netInfo) + err := addConnectionEnpoints(acc, proc, netInfo) if err != nil { acc.AddError(fmt.Errorf("D! [inputs.procstat] not able to generate network metrics for pid=%v: %v", proc.PID(), err)) } @@ -582,6 +664,9 @@ func (p *Procstat) metricEnabled(m string) bool { func init() { inputs.Add("procstat", func() telegraf.Input { - return &Procstat{} + return &Procstat{ + // Default metrics to gather + MetricsInclude: defaultCollection, + } }) } diff --git a/plugins/inputs/procstat/procstat_test.go b/plugins/inputs/procstat/procstat_test.go index 31f7291ebd519..7042b533c69e9 100644 --- a/plugins/inputs/procstat/procstat_test.go +++ b/plugins/inputs/procstat/procstat_test.go @@ -245,6 +245,7 @@ func TestGather_PidTag(t *testing.T) { p := Procstat{ Exe: exe, PidTag: true, + MetricsInclude: []string{MetricsThreads}, createPIDFinder: pidFinder([]PID{pid}, nil), createProcess: newTestProc, } @@ -259,6 +260,7 @@ func TestGather_Prefix(t *testing.T) { p := Procstat{ Exe: exe, Prefix: "custom_prefix", + MetricsInclude: []string{MetricsFDs}, createPIDFinder: pidFinder([]PID{pid}, nil), createProcess: newTestProc, } @@ -338,6 +340,7 @@ func TestGather_PercentFirstPass(t *testing.T) { p := Procstat{ Pattern: "foo", PidTag: true, + MetricsInclude: []string{MetricsCPU, MetricsCPUPercent}, createPIDFinder: pidFinder([]PID{pid}, nil), createProcess: NewProc, } @@ -354,6 +357,7 @@ func TestGather_PercentSecondPass(t *testing.T) { p := Procstat{ Pattern: "foo", PidTag: true, + MetricsInclude: []string{MetricsCPU, MetricsCPUPercent}, createPIDFinder: pidFinder([]PID{pid}, nil), createProcess: NewProc, } @@ -425,3 +429,33 @@ func TestGather_SameTimestamps(t *testing.T) { require.Equal(t, procstat.Time, procstat_lookup.Time) } + +func BenchmarkDefaultCollectionPlusEndpoints(b *testing.B) { + var acc testutil.Accumulator + pattern := "." + + p := Procstat{ + Pattern: pattern, + CmdLineTag: true, + MetricsInclude: append(defaultCollection, MetricsConnectionsEndpoints), + } + + for i := 0; i < b.N; i++ { + acc.GatherError(p.Gather) + } +} + +func BenchmarkConnEndpointsOnly(b *testing.B) { + var acc testutil.Accumulator + pattern := "." + + p := Procstat{ + Pattern: pattern, + CmdLineTag: true, + MetricsInclude: []string{MetricsConnectionsEndpoints}, + } + + for i := 0; i < b.N; i++ { + acc.GatherError(p.Gather) + } +} From 81657b5bdcdb1fc89129e01bd23562e6adb74553 Mon Sep 17 00:00:00 2001 From: Adrian Lopez Date: Tue, 9 Feb 2021 13:12:06 +0100 Subject: [PATCH 3/4] Procstat: close /proc/n/fd dirs earlier /proc/n/fd dirs were not being closed till the end of mapInodesToPid function, leaving a lot of file descriptors open. This could reach the hard limit in Linux breaking the functionality of Telegraf. --- plugins/inputs/procstat/connections_linux.go | 65 +++++++++++--------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/plugins/inputs/procstat/connections_linux.go b/plugins/inputs/procstat/connections_linux.go index cb0868e068da2..975c415ad29a6 100644 --- a/plugins/inputs/procstat/connections_linux.go +++ b/plugins/inputs/procstat/connections_linux.go @@ -184,44 +184,51 @@ func mapInodesToPid() (ret map[uint32][]InodeInfo) { } for _, pidStr := range dirContents { - pid, err := strconv.ParseUint(pidStr, 10, 32) - if err != nil { - // exclude files with a not numeric name. We only want to access pid directories - continue - } + readPidFDs(pidStr, ret) + } + + return ret +} + +// readPidFDs given a PID, add to the ret map info about its inodes +func readPidFDs(pidStr string, ret map[uint32][]InodeInfo) { + pid, err := strconv.ParseUint(pidStr, 10, 32) + if err != nil { + // exclude files with a not numeric name. We only want to access pid directories + return + } + + pidDir, err := os.Open("/proc/" + pidStr + "/fd/") + if err != nil { + // ignore errors: + // - missing directory, pid has already finished + // - permission denied + return + } + defer pidDir.Close() - pidDir, err := os.Open("/proc/" + pidStr + "/fd/") + fds, err := pidDir.Readdirnames(0) + if err != nil { + return + } + + for _, fd := range fds { + link, err := os.Readlink("/proc/" + pidStr + "/fd/" + fd) if err != nil { - // ignore errors: - // - missing directory, pid has already finished - // - permission denied continue } - fds, err := pidDir.Readdirnames(0) + var inode uint32 + + _, err = fmt.Sscanf(link, "socket:[%d]", &inode) if err != nil { + // this inode is not a socket continue } - for _, fd := range fds { - link, err := os.Readlink("/proc/" + pidStr + "/fd/" + fd) - if err != nil { - continue - } - - var inode uint32 - - _, err = fmt.Sscanf(link, "socket:[%d]", &inode) - if err != nil { - // this inode is not a socket - continue - } - - ret[inode] = append(ret[inode], InodeInfo{ - pid: uint32(pid), - }) - } + ret[inode] = append(ret[inode], InodeInfo{ + pid: uint32(pid), + }) } - return ret } From 660f86f313fc1ea517cda1f0017448b8a5232208 Mon Sep 17 00:00:00 2001 From: Adrian Lopez Date: Tue, 9 Feb 2021 14:18:17 +0100 Subject: [PATCH 4/4] procstat: avoid collection inode info if not being used netInfo.Fetch() should read all files in /proc/n/fd dirs to map connections to pids. This operation should only be executed if we are going to use that information. --- plugins/inputs/procstat/procstat.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/plugins/inputs/procstat/procstat.go b/plugins/inputs/procstat/procstat.go index 126b732dcc762..ee0e9757aeb90 100644 --- a/plugins/inputs/procstat/procstat.go +++ b/plugins/inputs/procstat/procstat.go @@ -225,9 +225,12 @@ func (p *Procstat) Gather(acc telegraf.Accumulator) error { // Initialize the conn object. Gather info about all TCP connections organized per PID // Avoid repeating this task for each proc netInfo := NetworkInfo{} - err = netInfo.Fetch() - if err != nil { - acc.AddError(fmt.Errorf("E! [inputs.procstat] getting TCP network info: %v", err)) + // Only collect this info if we are going to use it (avoid reading all /proc/N/fd dirs) + if (p.metricEnabled(MetricsTCPStats) || p.metricEnabled(MetricsConnectionsEndpoints)) && len(p.procs) > 0 { + err = netInfo.Fetch() + if err != nil { + acc.AddError(fmt.Errorf("E! [inputs.procstat] getting TCP network info: %v", err)) + } } for _, proc := range p.procs {