Skip to content

Commit 3b74d65

Browse files
authored
Fix deadlock during start (#744)
1 parent 900cfe3 commit 3b74d65

File tree

5 files changed

+65
-28
lines changed

5 files changed

+65
-28
lines changed

pkg/components/pipe/instrumenter.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,12 @@ func newGraphBuilder(
7878
DecoratedTraces: tracesReaderToRouter,
7979
}), swarm.WithID("ReadFromChannel"))
8080

81-
routerToKubeDecorator := newQueue("routerToKubeDecorator")
81+
routerToKubeDecorator := msg.NewQueue[[]request.Span](
82+
msg.ChannelBufferLen(config.ChannelBufferLen),
83+
msg.Name("routerToKubeDecorator"),
84+
// make sure that we are able to wait for the informer sync timeout before failing the pipeline
85+
// if a message gets bocked while the Kube decorator starts
86+
msg.SendTimeout(config.Attributes.Kubernetes.InformersSyncTimeout+20*time.Second))
8287
swi.Add(transform.RoutesProvider(
8388
config.Routes,
8489
tracesReaderToRouter,

pkg/export/attributes/attr_selector.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,6 @@ func NewCustomAttrSelector(
111111
cfg *SelectorConfig,
112112
extraDefinitionsProvider func(groups AttrGroups, extraGroupAttributes GroupAttributes) map[Section]AttrReportGroup,
113113
) (*AttrSelector, error) {
114-
cfg.SelectionCfg.Normalize()
115114
extraGroupAttributes := NewGroupAttributes(cfg.ExtraGroupAttributesCfg)
116115

117116
definitions := getDefinitions(groups, extraGroupAttributes)

pkg/pipe/msg/queue.go

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package msg
66

77
import (
88
"fmt"
9+
"log/slog"
910
"strings"
1011
"sync"
1112
"sync/atomic"
@@ -14,20 +15,22 @@ import (
1415

1516
// if a Send operation takes more than this time, we panic informing about a deadlock
1617
// in the user-provide pipeline
17-
const sendTimeout = 20 * time.Second
18+
const defaultSendTimeout = 20 * time.Second
1819

1920
const unnamed = "(unnamed)"
2021

2122
type queueConfig struct {
2223
channelBufferLen int
2324
closingAttempts int
2425
name string
26+
sendTimeout time.Duration
2527
}
2628

2729
var defaultQueueConfig = queueConfig{
2830
channelBufferLen: 1,
2931
closingAttempts: 1,
3032
name: unnamed,
33+
sendTimeout: defaultSendTimeout,
3134
}
3235

3336
// QueueOpts allow configuring some operation of a queue
@@ -47,6 +50,17 @@ func Name(name string) QueueOpts {
4750
}
4851
}
4952

53+
// SendTimeout sets the timeout for Send operations. This is useful for detecting
54+
// deadlocks derived from a wrong Pipeline construction. It panics if after
55+
// a send operation, the channel is blocked for more than this timeout.
56+
// Some nodes might require too long to initialize. For example the Kubernetes Decorator
57+
// at start, has to download a whole snapshot
58+
func SendTimeout(to time.Duration) QueueOpts {
59+
return func(c *queueConfig) {
60+
c.sendTimeout = to
61+
}
62+
}
63+
5064
// ClosingAttempts sets the number of invocations to MarkCloseable before the channel is
5165
// effectively closed.
5266
// This is useful when multiple nodes are sending messages to the same queue, and we want
@@ -88,7 +102,7 @@ func NewQueue[T any](opts ...QueueOpts) *Queue[T] {
88102
for _, opt := range opts {
89103
opt(&cfg)
90104
}
91-
return &Queue[T]{cfg: &cfg, remainingClosers: cfg.closingAttempts, sendTimeout: time.NewTimer(sendTimeout)}
105+
return &Queue[T]{cfg: &cfg, remainingClosers: cfg.closingAttempts, sendTimeout: time.NewTimer(cfg.sendTimeout)}
92106
}
93107

94108
// Send a message to all subscribers of this queue.
@@ -114,11 +128,36 @@ func (q *Queue[T]) chainedSend(o T, bypassPath []string) {
114128
if len(q.dsts) == 0 {
115129
return
116130
}
117-
q.sendTimeout.Reset(sendTimeout)
131+
132+
// instead of directly panicking in sendTimeout, we first warn at 0.75*sendTimeout,
133+
// to get logged about other blocked senders before panicking
134+
q.sendTimeout.Reset(3 * q.cfg.sendTimeout / 4)
135+
var blocked []dst[T]
118136
for _, d := range q.dsts {
119137
select {
120138
case d.ch <- o:
121-
// good!
139+
// good!
140+
case <-q.sendTimeout.C:
141+
slog.With(
142+
"timeout", q.cfg.sendTimeout,
143+
"queueLen", len(d.ch), "queueCap", cap(d.ch),
144+
"sendPath", strings.Join(bypassPath, "->"),
145+
"dstName", d.name).
146+
Warn("subscriber channel is taking too long to respond")
147+
// reset timeout to a small amount to detect any other possible blocked subscriber
148+
q.sendTimeout.Reset(time.Second)
149+
blocked = append(blocked, d)
150+
}
151+
}
152+
if len(blocked) == 0 {
153+
return
154+
}
155+
// if we confirm that the blocker candidates are actually blocked, we panic
156+
q.sendTimeout.Reset(q.cfg.sendTimeout / 4)
157+
for _, d := range blocked {
158+
select {
159+
case d.ch <- o:
160+
// good!
122161
case <-q.sendTimeout.C:
123162
panic(fmt.Sprintf("sending through queue path %s. Subscriber channel %s is blocked",
124163
strings.Join(bypassPath, "->"), d.name))

pkg/pipe/msg/queue_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ func TestBlockingPanics(t *testing.T) {
403403
q1.Send(1)
404404
q1.Send(2)
405405

406-
time.Sleep(2 * sendTimeout)
406+
time.Sleep(2 * defaultSendTimeout)
407407
}, "a deadlock should have been detected")
408408
})
409409
}

pkg/transform/k8s.go

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"go.opentelemetry.io/obi/pkg/kubeflags"
2424
"go.opentelemetry.io/obi/pkg/pipe/msg"
2525
"go.opentelemetry.io/obi/pkg/pipe/swarm"
26+
"go.opentelemetry.io/obi/pkg/pipe/swarm/swarms"
2627
)
2728

2829
var containerInfoForPID = container.InfoForPID
@@ -144,24 +145,13 @@ type metadataDecorator struct {
144145
func (md *metadataDecorator) nodeLoop(ctx context.Context) {
145146
// output channel must be closed so later stages in the pipeline can finish in cascade
146147
defer md.output.Close()
147-
klog().Debug("starting kubernetes span decoration loop")
148-
for {
149-
select {
150-
case <-ctx.Done():
151-
klog().Debug("context done, stopping kubernetes span decoration loop")
152-
return
153-
case spans, ok := <-md.input:
154-
if !ok {
155-
klog().Debug("input channel closed, stopping kubernetes span decoration loop")
156-
return
157-
}
158-
// in-place decoration and forwarding
159-
for i := range spans {
160-
md.do(&spans[i])
161-
}
162-
md.output.Send(spans)
148+
swarms.ForEachInput(ctx, md.input, klog().Debug, func(spans []request.Span) {
149+
// in-place decoration and forwarding
150+
for i := range spans {
151+
md.do(&spans[i])
163152
}
164-
}
153+
md.output.Send(spans)
154+
})
165155
}
166156

167157
func (md *metadataDecorator) do(span *request.Span) {
@@ -172,11 +162,15 @@ func (md *metadataDecorator) do(span *request.Span) {
172162
span.Service.Metadata = map[attr.Name]string{}
173163
}
174164
// override the peer and host names from Kubernetes metadata, if found
175-
if name, _ := md.db.ServiceNameNamespaceForIP(span.Host); name != "" {
176-
span.HostName = name
165+
if span.Host != "" {
166+
if name, _ := md.db.ServiceNameNamespaceForIP(span.Host); name != "" {
167+
span.HostName = name
168+
}
177169
}
178-
if name, _ := md.db.ServiceNameNamespaceForIP(span.Peer); name != "" {
179-
span.PeerName = name
170+
if span.Peer != "" {
171+
if name, _ := md.db.ServiceNameNamespaceForIP(span.Peer); name != "" {
172+
span.PeerName = name
173+
}
180174
}
181175
}
182176

0 commit comments

Comments
 (0)