Skip to content

Commit 7ed8b95

Browse files
authored
Delay route harvesting for Java apps (#855)
1 parent 31f7983 commit 7ed8b95

File tree

10 files changed

+73
-16
lines changed

10 files changed

+73
-16
lines changed

internal/test/oats/http/docker-compose-java-routes-custom.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ services:
2828
OTEL_EBPF_OTLP_TRACES_BATCH_TIMEOUT: "1ms"
2929
OTEL_EBPF_LOG_LEVEL: "DEBUG"
3030
OTEL_EBPF_BPF_DEBUG: "true"
31+
OTEL_EBPF_JAVA_ROUTE_HARVEST_DELAY: "10s"
3132
OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector:4318"
3233
depends_on:
3334
testserver:

internal/test/oats/http/docker-compose-java-routes-disabled.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ services:
3131
OTEL_EBPF_BPF_BATCH_TIMEOUT: "10ms"
3232
OTEL_EBPF_LOG_LEVEL: "DEBUG"
3333
OTEL_EBPF_BPF_DEBUG: "true"
34+
OTEL_EBPF_JAVA_ROUTE_HARVEST_DELAY: "10s"
3435
OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector:4318"
3536
depends_on:
3637
testserver:

internal/test/oats/http/docker-compose-java-routes.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ services:
3131
OTEL_EBPF_BPF_BATCH_TIMEOUT: "10ms"
3232
OTEL_EBPF_LOG_LEVEL: "DEBUG"
3333
OTEL_EBPF_BPF_DEBUG: "true"
34+
OTEL_EBPF_JAVA_ROUTE_HARVEST_DELAY: "10s"
3435
OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector:4318"
3536
depends_on:
3637
testserver:

pkg/appolly/discover/attacher.go

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"log/slog"
1010
"os"
11+
"time"
1112

1213
"github.com/cilium/ebpf/link"
1314
"github.com/cilium/ebpf/rlimit"
@@ -70,6 +71,9 @@ type traceAttacher struct {
7071

7172
// Extracts HTTP routes from executables
7273
routeHarvester *harvest.RouteHarvester
74+
75+
// Is able to find process lifetime duration
76+
processAgeFunc func(int32) time.Duration
7377
}
7478

7579
func traceAttacherProvider(ta *traceAttacher) swarm.InstanceFunc {
@@ -84,7 +88,8 @@ func (ta *traceAttacher) attacherLoop(_ context.Context) (swarm.RunFunc, error)
8488
ta.processInstances = maps.MultiCounter[uint64]{}
8589
ta.obiPID = os.Getpid()
8690
ta.EbpfEventContext.CommonPIDsFilter = ebpfcommon.CommonPIDsFilter(&ta.Cfg.Discovery, ta.Metrics)
87-
ta.routeHarvester = harvest.NewRouteHarvester(ta.Cfg.Discovery.DisabledRouteHarvesters, ta.Cfg.Discovery.RouteHarvesterTimeout)
91+
ta.routeHarvester = harvest.NewRouteHarvester(&ta.Cfg.Discovery.RouteHarvestConfig, ta.Cfg.Discovery.DisabledRouteHarvesters, ta.Cfg.Discovery.RouteHarvesterTimeout)
92+
ta.processAgeFunc = ProcessAgeFunc()
8893

8994
if err := ta.init(); err != nil {
9095
ta.log.Error("cant start process tracer. Stopping it", "error", err)
@@ -136,7 +141,7 @@ func (ta *traceAttacher) getTracer(ie *ebpf.Instrumentable) bool {
136141
"cmd", ie.FileInfo.CmdExePath)
137142
ie.FileInfo.Service.SDKLanguage = ie.Type
138143
// Must be called after we've set the SDKLanguage
139-
ta.harvestRoutes(ie, false)
144+
ta.harvestRoutes(ie, true)
140145

141146
// allowing the tracer to forward traces from the new PID and its children processes
142147
ta.monitorPIDs(tracer, ie)
@@ -262,7 +267,7 @@ func (ta *traceAttacher) withCommonTracersGroup(tracers []ebpf.Tracer) []ebpf.Tr
262267
return tracers
263268
}
264269

265-
func (ta *traceAttacher) harvestRoutes(ie *ebpf.Instrumentable, reused bool) {
270+
func (ta *traceAttacher) harvestRoutesProcessor(ie *ebpf.Instrumentable, reused bool) {
266271
routes, err := ta.routeHarvester.HarvestRoutes(ie.FileInfo)
267272
if err != nil {
268273
ta.log.Info("encountered error harvesting routes", "error", err, "pid", ie.FileInfo.Pid, "cmd", ie.FileInfo.CmdExePath)
@@ -273,6 +278,24 @@ func (ta *traceAttacher) harvestRoutes(ie *ebpf.Instrumentable, reused bool) {
273278
}
274279
}
275280

281+
func (ta *traceAttacher) harvestRoutes(ie *ebpf.Instrumentable, reused bool) {
282+
if delay, delayTime := ta.routeHarvester.HarvestRoutesDelay(ie.FileInfo); delay {
283+
procAge := ta.processAgeFunc(ie.FileInfo.Pid)
284+
if procAge < delayTime {
285+
time.AfterFunc(delayTime-procAge, func() {
286+
// sanity check that the program is still up and running and it's the same command
287+
if exePath, ready := ExecutableReady(PID(ie.FileInfo.Pid)); ready && exePath == ie.FileInfo.CmdExePath {
288+
ta.harvestRoutesProcessor(ie, reused)
289+
}
290+
})
291+
292+
return
293+
}
294+
}
295+
296+
ta.harvestRoutesProcessor(ie, reused)
297+
}
298+
276299
func (ta *traceAttacher) loadExecutable(ie *ebpf.Instrumentable) (*link.Executable, bool) {
277300
// Instead of the executable file in the disk, we pass the /proc/<pid>/exec
278301
// to allow loading it from different container/pods in containerized environments

pkg/appolly/discover/watcher_proc.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ func ProcessWatcherFunc(cfg *obi.Config, ebpfContext *ebpfcommon.EBPFEventContex
7474
pids: map[PID]ProcessAttrs{},
7575
pidPorts: map[pidPort]ProcessAttrs{},
7676
listProcesses: fetchProcessPorts,
77-
executableReady: executableReady,
77+
executableReady: ExecutableReady,
7878
loadBPFWatcher: loadBPFWatcher,
7979
loadBPFLogger: loadBPFLogger,
8080
fetchPorts: true, // must be true until we've activated the bpf watcher component
@@ -284,7 +284,7 @@ func (pa *pollAccounter) snapshot(fetchedProcs map[PID]ProcessAttrs) []Event[Pro
284284
return events
285285
}
286286

287-
func executableReady(pid PID) (string, bool) {
287+
func ExecutableReady(pid PID) (string, bool) {
288288
proc, err := process.NewProcess(int32(pid))
289289
if err != nil {
290290
return "", false
@@ -348,13 +348,13 @@ func (pa *pollAccounter) checkNewProcessNotification(pid PID, reportedProcs, not
348348
return false
349349
}
350350

351-
func makeProcessAgeFunc() func(int32) time.Duration {
351+
func ProcessAgeFunc() func(int32) time.Duration {
352352
r := procStatReader{}
353353
return r.processAge
354354
}
355355

356356
// overridden in tests
357-
var processAgeFunc = makeProcessAgeFunc()
357+
var processAgeFunc = ProcessAgeFunc()
358358

359359
// see https://man7.org/linux/man-pages/man5/proc_pid_stat.5.html
360360
func parseProcStatField(buf string, field int) string {

pkg/appolly/services/criteria.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ type DiscoveryConfig struct {
110110
RouteHarvesterTimeout time.Duration `yaml:"route_harvester_timeout" env:"OTEL_EBPF_ROUTE_HARVESTER_TIMEOUT"`
111111

112112
DisabledRouteHarvesters []string `yaml:"disabled_route_harvesters"`
113+
114+
RouteHarvestConfig RouteHarvestingConfig `yaml:"route_harvester_advanced"`
115+
}
116+
117+
type RouteHarvestingConfig struct {
118+
JavaHarvestDelay time.Duration `yaml:"java_harvest_delay" env:"OTEL_EBPF_JAVA_ROUTE_HARVEST_DELAY"`
113119
}
114120

115121
func (c *DiscoveryConfig) Validate() error {

pkg/internal/transform/route/harvest/harvester.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,22 @@ import (
88
"log/slog"
99
"runtime"
1010
"strings"
11+
"sync"
1112
"time"
1213

1314
"go.opentelemetry.io/obi/pkg/appolly/app/svc"
1415
"go.opentelemetry.io/obi/pkg/appolly/discover/exec"
16+
"go.opentelemetry.io/obi/pkg/appolly/services"
1517
"go.opentelemetry.io/obi/pkg/internal/transform/route"
1618
)
1719

1820
type RouteHarvester struct {
1921
log *slog.Logger
2022
java *JavaRoutes
2123
disabled map[svc.InstrumentableType]struct{}
24+
cfg *services.RouteHarvestingConfig
2225
timeout time.Duration
26+
mux *sync.Mutex
2327

2428
// testing related
2529
javaExtractRoutes func(pid int32) (*RouteHarvesterResult, error)
@@ -46,7 +50,7 @@ func (e *HarvestError) Error() string {
4650
return e.Message
4751
}
4852

49-
func NewRouteHarvester(disabled []string, timeout time.Duration) *RouteHarvester {
53+
func NewRouteHarvester(cfg *services.RouteHarvestingConfig, disabled []string, timeout time.Duration) *RouteHarvester {
5054
dMap := map[svc.InstrumentableType]struct{}{}
5155
for _, lang := range disabled {
5256
if strings.ToLower(lang) == "java" {
@@ -59,6 +63,8 @@ func NewRouteHarvester(disabled []string, timeout time.Duration) *RouteHarvester
5963
java: NewJavaRoutesHarvester(),
6064
disabled: dMap,
6165
timeout: timeout,
66+
cfg: cfg,
67+
mux: &sync.Mutex{},
6268
}
6369

6470
h.javaExtractRoutes = h.java.ExtractRoutes
@@ -67,6 +73,10 @@ func NewRouteHarvester(disabled []string, timeout time.Duration) *RouteHarvester
6773
}
6874

6975
func (h *RouteHarvester) HarvestRoutes(fileInfo *exec.FileInfo) (*RouteHarvesterResult, error) {
76+
// Ensure we harvest one by one
77+
h.mux.Lock()
78+
defer h.mux.Unlock()
79+
7080
// Create a context with timeout
7181
ctx, cancel := context.WithTimeout(context.Background(), h.timeout)
7282
defer cancel()
@@ -136,3 +146,11 @@ func RouteMatcherFromResult(r RouteHarvesterResult) route.Matcher {
136146

137147
return nil
138148
}
149+
150+
func (h *RouteHarvester) HarvestRoutesDelay(fileInfo *exec.FileInfo) (bool, time.Duration) {
151+
if fileInfo.Service.SDKLanguage == svc.InstrumentableJava {
152+
return true, h.cfg.JavaHarvestDelay
153+
}
154+
155+
return false, 0
156+
}

pkg/internal/transform/route/harvest/harvester_test.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"go.opentelemetry.io/obi/pkg/appolly/app/svc"
1515
"go.opentelemetry.io/obi/pkg/appolly/discover/exec"
16+
"go.opentelemetry.io/obi/pkg/appolly/services"
1617
)
1718

1819
// successfulExtractRoutes simulates a successful route extraction
@@ -70,7 +71,7 @@ func createTestFileInfo(language svc.InstrumentableType) *exec.FileInfo {
7071
}
7172

7273
func TestHarvestRoutes_Successful(t *testing.T) {
73-
harvester := NewRouteHarvester([]string{}, 1*time.Second)
74+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 1*time.Second)
7475
harvester.javaExtractRoutes = successfulExtractRoutes
7576

7677
fileInfo := createTestFileInfo(svc.InstrumentableJava)
@@ -84,7 +85,7 @@ func TestHarvestRoutes_Successful(t *testing.T) {
8485
}
8586

8687
func TestHarvestRoutes_Error(t *testing.T) {
87-
harvester := NewRouteHarvester([]string{}, 1*time.Second)
88+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 1*time.Second)
8889
harvester.javaExtractRoutes = errorExtractRoutes
8990

9091
fileInfo := createTestFileInfo(svc.InstrumentableJava)
@@ -97,7 +98,7 @@ func TestHarvestRoutes_Error(t *testing.T) {
9798
}
9899

99100
func TestHarvestRoutes_Timeout(t *testing.T) {
100-
harvester := NewRouteHarvester([]string{}, 100*time.Millisecond) // Short timeout
101+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 100*time.Millisecond) // Short timeout
101102
harvester.javaExtractRoutes = timeoutExtractRoutes
102103

103104
fileInfo := createTestFileInfo(svc.InstrumentableJava)
@@ -120,7 +121,7 @@ func TestHarvestRoutes_Timeout(t *testing.T) {
120121
}
121122

122123
func TestHarvestRoutes_Panic(t *testing.T) {
123-
harvester := NewRouteHarvester([]string{}, 1*time.Second)
124+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 1*time.Second)
124125
harvester.javaExtractRoutes = panicExtractRoutes
125126

126127
fileInfo := createTestFileInfo(svc.InstrumentableJava)
@@ -137,7 +138,7 @@ func TestHarvestRoutes_Panic(t *testing.T) {
137138
}
138139

139140
func TestHarvestRoutes_SlowButSuccessful(t *testing.T) {
140-
harvester := NewRouteHarvester([]string{}, 200*time.Millisecond) // Enough time for slow operation
141+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 200*time.Millisecond) // Enough time for slow operation
141142
harvester.javaExtractRoutes = slowButSuccessfulExtractRoutes
142143

143144
fileInfo := createTestFileInfo(svc.InstrumentableJava)
@@ -151,7 +152,7 @@ func TestHarvestRoutes_SlowButSuccessful(t *testing.T) {
151152
}
152153

153154
func TestHarvestRoutes_EmptyResult(t *testing.T) {
154-
harvester := NewRouteHarvester([]string{}, 1*time.Second)
155+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 1*time.Second)
155156
harvester.javaExtractRoutes = emptyResultExtractRoutes
156157

157158
fileInfo := createTestFileInfo(svc.InstrumentableJava)
@@ -165,7 +166,7 @@ func TestHarvestRoutes_EmptyResult(t *testing.T) {
165166
}
166167

167168
func TestHarvestRoutes_NonJavaLanguage(t *testing.T) {
168-
harvester := NewRouteHarvester([]string{}, 1*time.Second)
169+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 1*time.Second)
169170
// javaExtractRoutes should not be called for non-Java languages
170171
harvester.javaExtractRoutes = func(_ int32) (*RouteHarvesterResult, error) {
171172
t.Fatal("javaExtractRoutes should not be called for non-Java languages")
@@ -181,7 +182,7 @@ func TestHarvestRoutes_NonJavaLanguage(t *testing.T) {
181182
}
182183

183184
func TestHarvestRoutes_MultipleTimeouts(t *testing.T) {
184-
harvester := NewRouteHarvester([]string{}, 50*time.Millisecond)
185+
harvester := NewRouteHarvester(&services.RouteHarvestingConfig{}, []string{}, 50*time.Millisecond)
185186
harvester.javaExtractRoutes = timeoutExtractRoutes
186187

187188
fileInfo := createTestFileInfo(svc.InstrumentableJava)

pkg/obi/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ var DefaultConfig = Config{
206206
MinProcessAge: 5 * time.Second,
207207
DefaultOtlpGRPCPort: 4317,
208208
RouteHarvesterTimeout: 10 * time.Second,
209+
RouteHarvestConfig: services.RouteHarvestingConfig{
210+
JavaHarvestDelay: 60 * time.Second,
211+
},
209212
},
210213
NodeJS: NodeJSConfig{
211214
Enabled: true,

pkg/obi/config_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,9 @@ discovery:
257257
},
258258
DefaultOtlpGRPCPort: 4317,
259259
RouteHarvesterTimeout: 10 * time.Second,
260+
RouteHarvestConfig: services.RouteHarvestingConfig{
261+
JavaHarvestDelay: 60 * time.Second,
262+
},
260263
},
261264
NodeJS: NodeJSConfig{
262265
Enabled: true,

0 commit comments

Comments
 (0)