diff --git a/instrumentation/host/doc.go b/instrumentation/host/doc.go index b5866bb89f5..479a0204c2d 100644 --- a/instrumentation/host/doc.go +++ b/instrumentation/host/doc.go @@ -10,16 +10,45 @@ // // The metric events produced are listed here with attribute dimensions. // -// Name Attribute +// Name Attribute // // ---------------------------------------------------------------------- // -// process.cpu.time state=user|system -// system.cpu.time state=user|system|other|idle -// system.memory.usage state=used|available -// system.memory.utilization state=used|available -// system.network.io direction=transmit|receive +// process.cpu.time state=user|system +// system.cpu.time state=user|system|other|idle +// system.memory.usage state=used|available +// system.memory.utilization state=used|available +// system.network.io direction=transmit|receive +// +// Linux-specific Pressure Stall Information (PSI) metrics: +// +// system.psi.cpu.some.avg10 (no attributes) +// system.psi.cpu.some.avg60 (no attributes) +// system.psi.cpu.some.avg300 (no attributes) +// system.psi.cpu.some.total (no attributes) +// system.psi.memory.some.avg10 (no attributes) +// system.psi.memory.some.avg60 (no attributes) +// system.psi.memory.some.avg300 (no attributes) +// system.psi.memory.some.total (no attributes) +// system.psi.memory.full.avg10 (no attributes) +// system.psi.memory.full.avg60 (no attributes) +// system.psi.memory.full.avg300 (no attributes) +// system.psi.memory.full.total (no attributes) +// system.psi.io.some.avg10 (no attributes) +// system.psi.io.some.avg60 (no attributes) +// system.psi.io.some.avg300 (no attributes) +// system.psi.io.some.total (no attributes) +// system.psi.io.full.avg10 (no attributes) +// system.psi.io.full.avg60 (no attributes) +// system.psi.io.full.avg300 (no attributes) +// system.psi.io.full.total (no attributes) +// +// PSI metrics are only available on Linux systems with kernel 4.20+. 
+// "some" indicates that at least some tasks are stalled on the resource; "full" indicates that all non-idle tasks are stalled simultaneously. +// The avg* metrics report the percentage of time stalled, averaged over 10, 60, and 300 second windows. +// The total metrics represent cumulative stall time in microseconds. // // See https://github.com/open-telemetry/oteps/blob/main/text/0119-standard-system-metrics.md // for the definition of these metric instruments. +// For PSI metrics, see https://docs.kernel.org/accounting/psi.html package host // import "go.opentelemetry.io/contrib/instrumentation/host" diff --git a/instrumentation/host/host.go b/instrumentation/host/host.go index 961fa1bbd17..e2dadb5a257 100644 --- a/instrumentation/host/host.go +++ b/instrumentation/host/host.go @@ -29,6 +29,7 @@ const ScopeName = "go.opentelemetry.io/contrib/instrumentation/host" type host struct { config config meter metric.Meter + psi *psiMetrics } // config contains optional settings for reporting host metrics. @@ -195,6 +196,11 @@ func (h *host) register() error { return err } + // Register PSI metrics (Linux only) + if h.psi, err = h.registerPSI(); err != nil { + return err + } + _, err = h.meter.RegisterCallback( func(ctx context.Context, o metric.Observer) error { lock.Lock() diff --git a/instrumentation/host/host_test.go b/instrumentation/host/host_test.go index c9c40d7d108..747abda09f3 100644 --- a/instrumentation/host/host_test.go +++ b/instrumentation/host/host_test.go @@ -4,6 +4,7 @@ package host_test import ( + "strings" "testing" "github.com/stretchr/testify/require" @@ -125,5 +126,20 @@ func TestHostMetrics(t *testing.T) { }, }, } - metricdatatest.AssertEqual(t, want, rm.ScopeMetrics[0], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) + + baseMetrics := rm.ScopeMetrics[0] + filteredMetrics := metricdata.ScopeMetrics{ + Scope: baseMetrics.Scope, + Metrics: []metricdata.Metrics{}, + } + + for _, m := range baseMetrics.Metrics { + // Skip PSI metrics in this test - we test those separately + if strings.HasPrefix(m.Name, 
"system.psi.") { + continue + } + filteredMetrics.Metrics = append(filteredMetrics.Metrics, m) + } + + metricdatatest.AssertEqual(t, want, filteredMetrics, metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) } diff --git a/instrumentation/host/psi.go b/instrumentation/host/psi.go new file mode 100644 index 00000000000..0a46d7d4b02 --- /dev/null +++ b/instrumentation/host/psi.go @@ -0,0 +1,404 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package host // import "go.opentelemetry.io/contrib/instrumentation/host" + +import ( + "context" + "fmt" + "os" + "strconv" + "strings" + + "go.opentelemetry.io/otel/metric" +) + +const ( + psiCPUFile = "/proc/pressure/cpu" + psiMemoryFile = "/proc/pressure/memory" + psiIOFile = "/proc/pressure/io" +) + +// psiStats represents parsed PSI statistics for a resource +type psiStats struct { + some psiStat + full psiStat +} + +type psiStat struct { + avg10 float64 + avg60 float64 + avg300 float64 + total int64 +} + +// psiMetrics holds all PSI metric instruments +type psiMetrics struct { + cpuSomeAvg10 metric.Float64ObservableGauge + cpuSomeAvg60 metric.Float64ObservableGauge + cpuSomeAvg300 metric.Float64ObservableGauge + cpuSomeTotal metric.Int64ObservableCounter + memorySomeAvg10 metric.Float64ObservableGauge + memorySomeAvg60 metric.Float64ObservableGauge + memorySomeAvg300 metric.Float64ObservableGauge + memorySomeTotal metric.Int64ObservableCounter + memoryFullAvg10 metric.Float64ObservableGauge + memoryFullAvg60 metric.Float64ObservableGauge + memoryFullAvg300 metric.Float64ObservableGauge + memoryFullTotal metric.Int64ObservableCounter + ioSomeAvg10 metric.Float64ObservableGauge + ioSomeAvg60 metric.Float64ObservableGauge + ioSomeAvg300 metric.Float64ObservableGauge + ioSomeTotal metric.Int64ObservableCounter + ioFullAvg10 metric.Float64ObservableGauge + ioFullAvg60 metric.Float64ObservableGauge + ioFullAvg300 metric.Float64ObservableGauge + ioFullTotal 
metric.Int64ObservableCounter +} + +// registerPSI registers all PSI metric instruments and their callback +func (h *host) registerPSI() (*psiMetrics, error) { + pm := &psiMetrics{} + var err error + + // CPU PSI metrics + pm.cpuSomeAvg10, err = h.meter.Float64ObservableGauge( + "system.psi.cpu.some.avg10", + metric.WithDescription("CPU pressure stall information - some tasks waiting, 10 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.cpuSomeAvg60, err = h.meter.Float64ObservableGauge( + "system.psi.cpu.some.avg60", + metric.WithDescription("CPU pressure stall information - some tasks waiting, 60 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.cpuSomeAvg300, err = h.meter.Float64ObservableGauge( + "system.psi.cpu.some.avg300", + metric.WithDescription("CPU pressure stall information - some tasks waiting, 300 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.cpuSomeTotal, err = h.meter.Int64ObservableCounter( + "system.psi.cpu.some.total", + metric.WithDescription("CPU pressure stall information - some tasks waiting, total time in microseconds"), + metric.WithUnit("us"), + ) + if err != nil { + return nil, err + } + + // Memory PSI metrics - some + pm.memorySomeAvg10, err = h.meter.Float64ObservableGauge( + "system.psi.memory.some.avg10", + metric.WithDescription("Memory pressure stall information - some tasks waiting, 10 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.memorySomeAvg60, err = h.meter.Float64ObservableGauge( + "system.psi.memory.some.avg60", + metric.WithDescription("Memory pressure stall information - some tasks waiting, 60 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.memorySomeAvg300, err = h.meter.Float64ObservableGauge( + "system.psi.memory.some.avg300", + metric.WithDescription("Memory pressure stall information - some tasks 
waiting, 300 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.memorySomeTotal, err = h.meter.Int64ObservableCounter( + "system.psi.memory.some.total", + metric.WithDescription("Memory pressure stall information - some tasks waiting, total time in microseconds"), + metric.WithUnit("us"), + ) + if err != nil { + return nil, err + } + + // Memory PSI metrics - full + pm.memoryFullAvg10, err = h.meter.Float64ObservableGauge( + "system.psi.memory.full.avg10", + metric.WithDescription("Memory pressure stall information - all tasks waiting, 10 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.memoryFullAvg60, err = h.meter.Float64ObservableGauge( + "system.psi.memory.full.avg60", + metric.WithDescription("Memory pressure stall information - all tasks waiting, 60 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.memoryFullAvg300, err = h.meter.Float64ObservableGauge( + "system.psi.memory.full.avg300", + metric.WithDescription("Memory pressure stall information - all tasks waiting, 300 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.memoryFullTotal, err = h.meter.Int64ObservableCounter( + "system.psi.memory.full.total", + metric.WithDescription("Memory pressure stall information - all tasks waiting, total time in microseconds"), + metric.WithUnit("us"), + ) + if err != nil { + return nil, err + } + + // IO PSI metrics - some + pm.ioSomeAvg10, err = h.meter.Float64ObservableGauge( + "system.psi.io.some.avg10", + metric.WithDescription("IO pressure stall information - some tasks waiting, 10 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.ioSomeAvg60, err = h.meter.Float64ObservableGauge( + "system.psi.io.some.avg60", + metric.WithDescription("IO pressure stall information - some tasks waiting, 60 second average"), + metric.WithUnit("%"), + ) + if err != nil { 
+ return nil, err + } + + pm.ioSomeAvg300, err = h.meter.Float64ObservableGauge( + "system.psi.io.some.avg300", + metric.WithDescription("IO pressure stall information - some tasks waiting, 300 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.ioSomeTotal, err = h.meter.Int64ObservableCounter( + "system.psi.io.some.total", + metric.WithDescription("IO pressure stall information - some tasks waiting, total time in microseconds"), + metric.WithUnit("us"), + ) + if err != nil { + return nil, err + } + + // IO PSI metrics - full + pm.ioFullAvg10, err = h.meter.Float64ObservableGauge( + "system.psi.io.full.avg10", + metric.WithDescription("IO pressure stall information - all tasks waiting, 10 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.ioFullAvg60, err = h.meter.Float64ObservableGauge( + "system.psi.io.full.avg60", + metric.WithDescription("IO pressure stall information - all tasks waiting, 60 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.ioFullAvg300, err = h.meter.Float64ObservableGauge( + "system.psi.io.full.avg300", + metric.WithDescription("IO pressure stall information - all tasks waiting, 300 second average"), + metric.WithUnit("%"), + ) + if err != nil { + return nil, err + } + + pm.ioFullTotal, err = h.meter.Int64ObservableCounter( + "system.psi.io.full.total", + metric.WithDescription("IO pressure stall information - all tasks waiting, total time in microseconds"), + metric.WithUnit("us"), + ) + if err != nil { + return nil, err + } + + // Register callback for PSI metrics + _, err = h.meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + return pm.observePSI(ctx, o) + }, + pm.cpuSomeAvg10, + pm.cpuSomeAvg60, + pm.cpuSomeAvg300, + pm.cpuSomeTotal, + pm.memorySomeAvg10, + pm.memorySomeAvg60, + pm.memorySomeAvg300, + pm.memorySomeTotal, + pm.memoryFullAvg10, + pm.memoryFullAvg60, + 
pm.memoryFullAvg300, + pm.memoryFullTotal, + pm.ioSomeAvg10, + pm.ioSomeAvg60, + pm.ioSomeAvg300, + pm.ioSomeTotal, + pm.ioFullAvg10, + pm.ioFullAvg60, + pm.ioFullAvg300, + pm.ioFullTotal, + ) + if err != nil { + return nil, err + } + + return pm, nil +} + +// observePSI reads PSI metrics and records observations +func (pm *psiMetrics) observePSI(ctx context.Context, o metric.Observer) error { + cpuStats, err := readPSIFile(psiCPUFile) + if err == nil { + o.ObserveFloat64(pm.cpuSomeAvg10, cpuStats.some.avg10) + o.ObserveFloat64(pm.cpuSomeAvg60, cpuStats.some.avg60) + o.ObserveFloat64(pm.cpuSomeAvg300, cpuStats.some.avg300) + o.ObserveInt64(pm.cpuSomeTotal, cpuStats.some.total) + } + + memStats, err := readPSIFile(psiMemoryFile) + if err == nil { + o.ObserveFloat64(pm.memorySomeAvg10, memStats.some.avg10) + o.ObserveFloat64(pm.memorySomeAvg60, memStats.some.avg60) + o.ObserveFloat64(pm.memorySomeAvg300, memStats.some.avg300) + o.ObserveInt64(pm.memorySomeTotal, memStats.some.total) + o.ObserveFloat64(pm.memoryFullAvg10, memStats.full.avg10) + o.ObserveFloat64(pm.memoryFullAvg60, memStats.full.avg60) + o.ObserveFloat64(pm.memoryFullAvg300, memStats.full.avg300) + o.ObserveInt64(pm.memoryFullTotal, memStats.full.total) + } + + ioStats, err := readPSIFile(psiIOFile) + if err == nil { + o.ObserveFloat64(pm.ioSomeAvg10, ioStats.some.avg10) + o.ObserveFloat64(pm.ioSomeAvg60, ioStats.some.avg60) + o.ObserveFloat64(pm.ioSomeAvg300, ioStats.some.avg300) + o.ObserveInt64(pm.ioSomeTotal, ioStats.some.total) + o.ObserveFloat64(pm.ioFullAvg10, ioStats.full.avg10) + o.ObserveFloat64(pm.ioFullAvg60, ioStats.full.avg60) + o.ObserveFloat64(pm.ioFullAvg300, ioStats.full.avg300) + o.ObserveInt64(pm.ioFullTotal, ioStats.full.total) + } + + return nil +} + +// readPSIFile reads and parses a PSI file +func readPSIFile(path string) (*psiStats, error) { + content, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read PSI file %s: %w", path, err) + } + + return 
// psiStat holds the values of one line ("some" or "full") of a pressure file.
type psiStat struct {
	avg10  float64
	avg60  float64
	avg300 float64
	total  int64
}

// psiStats represents parsed PSI statistics for a resource. A line that is
// absent from the input leaves the corresponding field at its zero value.
type psiStats struct {
	some psiStat
	full psiStat
}

// readPSIFile reads the file at path and parses it with parsePSI.
// The read error, if any, is wrapped together with the path.
func readPSIFile(path string) (*psiStats, error) {
	content, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read PSI file %s: %w", path, err)
	}

	return parsePSI(string(content))
}

// parsePSI parses PSI file content.
//
// Format:
//
//	some avg10=0.00 avg60=0.00 avg300=0.00 total=0
//	full avg10=0.00 avg60=0.00 avg300=0.00 total=0
//
// Blank lines are ignored, so empty input yields zero-valued stats. Any other
// line must start with "some" or "full" followed by the avg10, avg60, avg300
// and total fields in that order; fields after the fifth are ignored. A line
// with too few fields, a field with an unexpected key, an unparsable value or
// an unknown line type yields a nil result and an error.
func parsePSI(content string) (*psiStats, error) {
	stats := &psiStats{}

	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		parts := strings.Fields(line)
		if len(parts) < 5 {
			return nil, fmt.Errorf("invalid PSI line format: %s", line)
		}

		// Values are validated before the line type, so a line that is wrong
		// in both respects reports the value problem first.
		stat, err := parsePSIStat(parts[1:5])
		if err != nil {
			return nil, err
		}

		switch parts[0] {
		case "some":
			stats.some = stat
		case "full":
			stats.full = stat
		default:
			return nil, fmt.Errorf("unknown pressure type: %s", parts[0])
		}
	}

	return stats, nil
}

// parsePSIStat converts the four key=value fields of a pressure line, which
// must be avg10, avg60, avg300 and total in that order.
func parsePSIStat(fields []string) (psiStat, error) {
	var st psiStat

	averages := []struct {
		key string
		dst *float64
	}{
		{"avg10", &st.avg10},
		{"avg60", &st.avg60},
		{"avg300", &st.avg300},
	}
	for i, a := range averages {
		raw, err := psiFieldValue(fields[i], a.key)
		if err != nil {
			return psiStat{}, err
		}
		if *a.dst, err = strconv.ParseFloat(raw, 64); err != nil {
			return psiStat{}, fmt.Errorf("failed to parse %s: %w", a.key, err)
		}
	}

	raw, err := psiFieldValue(fields[3], "total")
	if err != nil {
		return psiStat{}, err
	}
	if st.total, err = strconv.ParseInt(raw, 10, 64); err != nil {
		return psiStat{}, fmt.Errorf("failed to parse total: %w", err)
	}

	return st, nil
}

// psiFieldValue returns the value part of a "key=value" field. A field that
// does not carry the expected key is an error rather than a silent zero.
func psiFieldValue(field, key string) (string, error) {
	prefix := key + "="
	if !strings.HasPrefix(field, prefix) {
		return "", fmt.Errorf("invalid PSI field %q: expected prefix %q", field, prefix)
	}
	return field[len(prefix):], nil
}
// psiMetrics is a no-op type for non-Linux platforms. It exists so that code
// shared by all platforms can refer to *psiMetrics; it carries no instruments.
type psiMetrics struct{}

// registerPSI returns nil on non-Linux platforms (PSI is Linux-specific).
// Nothing is registered on h.meter and the returned error is always nil.
func (h *host) registerPSI() (*psiMetrics, error) {
	return nil, nil
}

// observePSI is a no-op on non-Linux platforms. It records nothing on o,
// always returns nil, and does not dereference pm.
func (pm *psiMetrics) observePSI(ctx context.Context, o metric.Observer) error {
	return nil
}
avg10=1.23 avg60=2.34 avg300=3.45 total=123456 +full avg10=0.50 avg60=1.00 avg300=1.50 total=654321 +`, + expectError: false, + validate: func(t *testing.T, stats *psiStats) { + assert.Equal(t, 1.23, stats.some.avg10) + assert.Equal(t, 2.34, stats.some.avg60) + assert.Equal(t, 3.45, stats.some.avg300) + assert.Equal(t, int64(123456), stats.some.total) + assert.Equal(t, 0.50, stats.full.avg10) + assert.Equal(t, 1.00, stats.full.avg60) + assert.Equal(t, 1.50, stats.full.avg300) + assert.Equal(t, int64(654321), stats.full.total) + }, + }, + { + name: "zero values", + input: `some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 +`, + expectError: false, + validate: func(t *testing.T, stats *psiStats) { + assert.Equal(t, 0.0, stats.some.avg10) + assert.Equal(t, 0.0, stats.some.avg60) + assert.Equal(t, 0.0, stats.some.avg300) + assert.Equal(t, int64(0), stats.some.total) + assert.Equal(t, 0.0, stats.full.avg10) + assert.Equal(t, 0.0, stats.full.avg60) + assert.Equal(t, 0.0, stats.full.avg300) + assert.Equal(t, int64(0), stats.full.total) + }, + }, + { + name: "large values", + input: `some avg10=99.99 avg60=100.00 avg300=50.00 total=9223372036854775807 +full avg10=25.50 avg60=30.00 avg300=35.00 total=1234567890123456 +`, + expectError: false, + validate: func(t *testing.T, stats *psiStats) { + assert.Equal(t, 99.99, stats.some.avg10) + assert.Equal(t, 100.00, stats.some.avg60) + assert.Equal(t, 50.00, stats.some.avg300) + assert.Equal(t, int64(9223372036854775807), stats.some.total) + assert.Equal(t, 25.50, stats.full.avg10) + assert.Equal(t, 30.00, stats.full.avg60) + assert.Equal(t, 35.00, stats.full.avg300) + assert.Equal(t, int64(1234567890123456), stats.full.total) + }, + }, + { + name: "invalid format - not enough fields", + input: "some avg10=1.23\n", + expectError: true, + }, + { + name: "invalid format - bad avg10 value", + input: "some avg10=abc avg60=2.34 avg300=3.45 total=123456\n", + expectError: true, + }, + { + 
name: "invalid format - bad avg60 value", + input: "some avg10=1.23 avg60=xyz avg300=3.45 total=123456\n", + expectError: true, + }, + { + name: "invalid format - bad avg300 value", + input: "some avg10=1.23 avg60=2.34 avg300=bad total=123456\n", + expectError: true, + }, + { + name: "invalid format - bad total value", + input: "some avg10=1.23 avg60=2.34 avg300=3.45 total=abc\n", + expectError: true, + }, + { + name: "invalid pressure type", + input: "partial avg10=1.23 avg60=2.34 avg300=3.45 total=123456\n", + expectError: true, + }, + { + name: "empty input", + input: "", + expectError: false, + validate: func(t *testing.T, stats *psiStats) { + // Empty input should return zero values + assert.Equal(t, 0.0, stats.some.avg10) + assert.Equal(t, 0.0, stats.full.avg10) + }, + }, + { + name: "only whitespace", + input: ` + +`, + expectError: false, + validate: func(t *testing.T, stats *psiStats) { + assert.Equal(t, 0.0, stats.some.avg10) + assert.Equal(t, 0.0, stats.full.avg10) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + stats, err := parsePSI(tc.input) + if tc.expectError { + assert.Error(t, err) + } else { + require.NoError(t, err) + if tc.validate != nil { + tc.validate(t, stats) + } + } + }) + } +} + +func TestParsePSI_RealWorldExamples(t *testing.T) { + // Example from a real Linux system under light load + lightLoad := `some avg10=0.05 avg60=0.12 avg300=0.08 total=1234567 +full avg10=0.01 avg60=0.02 avg300=0.03 total=234567 +` + stats, err := parsePSI(lightLoad) + require.NoError(t, err) + assert.Equal(t, 0.05, stats.some.avg10) + assert.Equal(t, 0.01, stats.full.avg10) + + // Example from a system with no pressure (CPU typically only has "some") + noPressure := `some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +` + stats, err = parsePSI(noPressure) + require.NoError(t, err) + assert.Equal(t, 0.0, stats.some.avg10) + assert.Equal(t, int64(0), stats.some.total) +}