Skip to content

Commit 3f94ea6

Browse files
authored
[Profiler] Signal-based profiler/Non-Signal-based: prevent deadlock (#5808)
## Summary of changes Prevent deadlock between signal-based profilers (walltime/manual cpu profilers) and non-signal based profilers (exception, contention....) ## Reason for change When an exception occurs, the thread can be interrupted by a signal-based profiler (walltime/manual cpu). It can be interrupted while holding the lock used to update the `dl_iterate_phdr` cache. ``` Thread 18 (LWP 995): #0 __syscall_cp_c (nr=202, u=140244538814536, v=128, w=-1, x=0, y=0, z=0) at ./arch/x86_64/syscall_arch.h:61 #1 0x00007f8dba343ccd in __futex4_cp (to=0x0, val=-1, op=128, addr=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at src/thread/__timedwait.c:24 #2 __timedwait_cp (addr=addr@entry=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>, val=val@entry=-1, clk=clk@entry=0, at=at@entry=0x0, priv=priv@entry=128) at src/thread/__timedwait.c:52 #3 0x00007f8dba343d74 in __timedwait (addr=addr@entry=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>, val=-1, clk=clk@entry=0, at=at@entry=0x0, priv=128) at src/thread/__timedwait.c:68 #4 0x00007f8dba3463e6 in __pthread_rwlock_timedrdlock (at=<optimized out>, rw=<optimized out>) at src/thread/pthread_rwlock_timedrdlock.c:18 #5 __pthread_rwlock_timedrdlock (rw=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>, at=0x0) at src/thread/pthread_rwlock_timedrdlock.c:3 #6 0x00007f8d398f3ca8 in std::__glibcxx_rwlock_rdlock (__rwlock=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:73 #7 std::__shared_mutex_pthread::lock_shared (this=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:224 #8 std::shared_mutex::lock_shared (this=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:421 #9 std::shared_lock<std::shared_mutex>::shared_lock 
(this=0x7f4ca05a2ac0, __m=...) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:722 #10 LibrariesInfoCache::DlIteratePhdrImpl (this=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>, callback=0x7f8d3997d900 <_Ux86_64_dwarf_callback>, data=0x7f4ca05a2b20) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LibrariesInfoCache.cpp:104 #11 0x00007f8d3997e4ee in _Ux86_64_dwarf_find_proc_info (as=0x7f8d39eb2a00 <local_addr_space>, ip=140246691112115, pi=0x7f4ca05a3170, need_unwind_info=1, arg=0x7f4ca05a3411) at /project/obj/libunwind-prefix/src/libunwind/src/dwarf/Gfind_proc_info-lsb.c:807 #12 0x00007f8d3997e690 in fetch_proc_info (c=0x7f4ca05a3018, ip=140246691112115) at /project/obj/libunwind-prefix/src/libunwind/src/dwarf/Gparser.c:473 #13 0x00007f8d3998113d in find_reg_state (sr=0x7f4ca05a2dc0, c=0x7f4ca05a3018) at /project/obj/libunwind-prefix/src/libunwind/src/dwarf/Gparser.c:1024 #14 _Ux86_64_dwarf_step (c=c@entry=0x7f4ca05a3018) at /project/obj/libunwind-prefix/src/libunwind/src/dwarf/Gparser.c:1069 #15 0x00007f8d3997d13a in _Ux86_64_step (cursor=0x7f4ca05a3018) at /project/obj/libunwind-prefix/src/libunwind/src/x86_64/Gstep.c:75 #16 0x00007f8d398f55c8 in LinuxStackFramesCollector::CollectStackManually (this=this@entry=0x7f8d392dc6d0, ctx=ctx@entry=0x7f4ca05a3880) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LinuxStackFramesCollector.cpp:288 #17 0x00007f8d398f53dc in LinuxStackFramesCollector::CollectCallStackCurrentThread (this=this@entry=0x7f8d392dc6d0, ctx=ctx@entry=0x7f4ca05a3880) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LinuxStackFramesCollector.cpp:227 #18 0x00007f8d398f4672 in LinuxStackFramesCollector::CollectStackSampleSignalHandler (signal=<optimized out>, info=<optimized out>, context=0x7f4ca05a3880) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LinuxStackFramesCollector.cpp:373 #19 0x00007f8d398fb871 in 
ProfilerSignalManager::CallCustomHandler (this=0x7f8d39eaf928 <ProfilerSignalManager::Get(int)::signalManagers+1944>, signal=10, info=0x7f4ca05a39b0, context=0x7f4ca05a3880) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/ProfilerSignalManager.cpp:197 #20 ProfilerSignalManager::SignalHandler (signal=10, info=0x7f4ca05a39b0, context=0x7f4ca05a3880) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/ProfilerSignalManager.cpp:188 #21 <signal handler called> #22 __pthread_rwlock_unlock (rw=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at src/thread/pthread_rwlock_unlock.c:5 #23 0x00007f8d398f3bf9 in std::__glibcxx_rwlock_unlock (__rwlock=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:77 #24 std::__shared_mutex_pthread::unlock (this=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:208 #25 std::shared_mutex::unlock (this=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/shared_mutex:417 #26 std::unique_lock<std::shared_mutex>::unlock (this=0x7f4ca05a3e20) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/bits/unique_lock.h:194 #27 std::unique_lock<std::shared_mutex>::~unique_lock (this=0x7f4ca05a3e20) at /usr/lib/gcc/x86_64-alpine-linux-musl/10.3.1/../../../../include/c++/10.3.1/bits/unique_lock.h:103 #28 LibrariesInfoCache::UpdateCache (this=0x7f8d39eaf048 <LibrariesInfoCache::Get()::Instance>) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LibrariesInfoCache.cpp:88 #29 0x00007f8d398f4e59 in LinuxStackFramesCollector::CollectStackSampleImplementation (this=0x7f8d3b91bc90, pThreadInfo=0x7f4ca06b9900, pHR=0x7f8d3a63c510, selfCollect=true) at 
/project/profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LinuxStackFramesCollector.cpp:100 #30 0x00007f8d399637ba in StackFramesCollectorBase::CollectStackSample (this=0x7f8d3b91bc90, pThreadInfo=0x7f4ca06b9900, pHR=0x7f4ca05a3fdc) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native/StackFramesCollectorBase.cpp:185 #31 0x00007f8d3992acb9 in ExceptionsProvider::OnExceptionThrown (this=0x7f8d392a7160, thrownObjectId=139969739182080) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native/ExceptionsProvider.cpp:149 #32 0x00007f8d39917045 in CorProfilerCallback::ExceptionThrown (this=0x7f8d392c0d20, thrownObjectId=139969739182080) at /project/profiler/src/ProfilerEngine/Datadog.Profiler.Native/CorProfilerCallback.cpp:1734 ``` ## Implementation details - move the call which updates the cache after acquiring the thread lock - call Update before sending signal ## Test coverage ## Other details <!-- Fixes #{issue} --> <!-- ⚠️ Note: where possible, please obtain 2 approvals prior to merging. Unless CODEOWNERS specifies otherwise, for external teams it is typically best to have one review from a team member, and one review from apm-dotnet. Trivial changes do not require 2 reviews. -->
1 parent 8b5987d commit 3f94ea6

File tree

5 files changed

+181
-2
lines changed

5 files changed

+181
-2
lines changed

profiler/src/Demos/Samples.Computer01/ComputerService.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ public class ComputerService
4444
private NullThreadNameBugCheck _nullThreadNameBugCheck;
4545
private MethodsSignature _methodsSignature;
4646
private SigSegvHandlerExecution _sigsegvHandler;
47+
private LinuxDlIteratePhdrDeadlock _linuxDlIteratePhdrDeadlock;
4748

4849
#if NET5_0_OR_GREATER
4950
private OpenLdapCrash _openldapCrash;
@@ -181,6 +182,10 @@ public void StartService(Scenario scenario, int nbThreads, int parameter)
181182
StartStringConcat(parameter);
182183
break;
183184

185+
case Scenario.LinuxDlIteratePhdrDeadlock:
186+
StartLinuxDlIteratePhdrDeadlock();
187+
break;
188+
184189
default:
185190
throw new ArgumentOutOfRangeException(nameof(scenario), $"Unsupported scenario #{_scenario}");
186191
}
@@ -311,6 +316,10 @@ public void StopService()
311316
case Scenario.StringConcat:
312317
StopStringConcat();
313318
break;
319+
320+
case Scenario.LinuxDlIteratePhdrDeadlock:
321+
StopLinuxDlIteratePhdrDeadlock();
322+
break;
314323
}
315324
}
316325

@@ -575,6 +584,12 @@ private void StartLinuxMallocDeadlock()
575584
_linuxMallockDeadlock.Start();
576585
}
577586

587+
private void StartLinuxDlIteratePhdrDeadlock()
588+
{
589+
_linuxDlIteratePhdrDeadlock = new LinuxDlIteratePhdrDeadlock();
590+
_linuxDlIteratePhdrDeadlock.Start();
591+
}
592+
578593
private void StartMeasureAllocations()
579594
{
580595
_measureAllocations = new MeasureAllocations();
@@ -749,6 +764,11 @@ private void StopLinuxMallocDeadlock()
749764
_linuxMallockDeadlock.Stop();
750765
}
751766

767+
private void StopLinuxDlIteratePhdrDeadlock()
768+
{
769+
_linuxDlIteratePhdrDeadlock.Stop();
770+
}
771+
752772
private void StopMeasureAllocations()
753773
{
754774
_measureAllocations.Stop();
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
// <copyright file="LinuxDlIteratePhdrDeadlock.cs" company="Datadog">
2+
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2022 Datadog, Inc.
4+
// </copyright>
5+
6+
using System;
7+
using System.Diagnostics;
8+
using System.Runtime.InteropServices;
9+
using System.Threading;
10+
using System.Threading.Tasks;
11+
12+
namespace Samples.Computer01
13+
{
14+
internal class LinuxDlIteratePhdrDeadlock
15+
{
16+
private ManualResetEvent _stopEvent;
17+
private Task _exceptionTask;
18+
private Thread _worker;
19+
20+
public void Start()
21+
{
22+
if (_stopEvent != null)
23+
{
24+
throw new InvalidOperationException("Already running...");
25+
}
26+
27+
_stopEvent = new ManualResetEvent(false);
28+
29+
_worker = new Thread(ExecuteCallToDlOpenDlClose)
30+
{
31+
IsBackground = false // set to false to prevent the app from shutting down. The test will fail
32+
};
33+
_worker.Start();
34+
35+
_exceptionTask = Task.Factory.StartNew(
36+
() =>
37+
{
38+
var nbException = 0;
39+
var loggingClock = Stopwatch.StartNew();
40+
while (!IsEventSet())
41+
{
42+
if (loggingClock.ElapsedMilliseconds >= 1000)
43+
{
44+
Console.WriteLine($"* Nb thrown exception {nbException}");
45+
Thread.Sleep(TimeSpan.FromMilliseconds(500));
46+
nbException = 0;
47+
loggingClock.Restart();
48+
}
49+
50+
for (var i = 0; i < 20; i++)
51+
{
52+
try
53+
{
54+
throw new Exception("dl_iterate_phdr deadlock exception");
55+
}
56+
catch { }
57+
nbException++;
58+
}
59+
60+
// wait a bit randomly (23 is a prime number chosen randomly)
61+
Thread.Sleep(TimeSpan.FromMilliseconds(23));
62+
}
63+
},
64+
TaskCreationOptions.LongRunning);
65+
}
66+
67+
public void Run()
68+
{
69+
Start();
70+
Thread.Sleep(TimeSpan.FromSeconds(10));
71+
Stop();
72+
}
73+
74+
public void Stop()
75+
{
76+
if (_stopEvent == null)
77+
{
78+
throw new InvalidOperationException("Not running...");
79+
}
80+
81+
_stopEvent.Set();
82+
83+
_worker.Join();
84+
_exceptionTask.Wait();
85+
86+
_stopEvent.Dispose();
87+
_stopEvent = null;
88+
}
89+
90+
[DllImport("libdl.so", EntryPoint = "dlopen")]
91+
private static extern IntPtr Dlopen(string filename, int flags);
92+
93+
[DllImport("libdl.so", EntryPoint = "dlclose")]
94+
private static extern void DlClose(IntPtr handle);
95+
96+
private void ExecuteCallToDlOpenDlClose()
97+
{
98+
var loggingClock = Stopwatch.StartNew();
99+
var counter = 0;
100+
101+
while (!IsEventSet())
102+
{
103+
if (loggingClock.ElapsedMilliseconds >= 1000)
104+
{
105+
Console.WriteLine($"* Nb execution {counter}");
106+
Thread.Sleep(TimeSpan.FromMilliseconds(500));
107+
counter = 0;
108+
loggingClock.Restart();
109+
}
110+
111+
var handle = Dlopen("libc.so.6", 2);
112+
Thread.Sleep(TimeSpan.FromMilliseconds(10));
113+
DlClose(handle);
114+
115+
counter++;
116+
}
117+
}
118+
119+
private bool IsEventSet()
120+
{
121+
return _stopEvent.WaitOne(0);
122+
}
123+
}
124+
}

profiler/src/Demos/Samples.Computer01/Program.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ public enum Scenario
4040
Obfuscation,
4141
ThreadSpikes,
4242
StringConcat, // parameter = number of strings to concatenate
43+
LinuxDlIteratePhdrDeadlock,
4344
}
4445

4546
public class Program
@@ -76,6 +77,7 @@ public static void Main(string[] args)
7677
// 24: use an obfuscated library
7778
// 25: create thread spikes
7879
// 26: string concatenation
80+
// 27: custom dl_iterate_phdr deadlock
7981
//
8082
Console.WriteLine($"{Environment.NewLine}Usage:{Environment.NewLine} > {Process.GetCurrentProcess().ProcessName} " +
8183
$"[--service] [--iterations <number of iterations to execute>] " +

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/LinuxStackFramesCollector.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,15 @@ StackSnapshotResultBuffer* LinuxStackFramesCollector::CollectStackSampleImplemen
9797
// Otherwise, the CPU consumption to collect the callstack, will be accounted as "user app CPU time"
9898
auto timerId = pThreadInfo->GetTimerId();
9999

100-
_plibrariesInfo->UpdateCache();
101-
102100
if (selfCollect)
103101
{
104102
// In case we are self-unwinding, we do not want to be interrupted by the signal-based profilers (walltime and cpu)
105103
// This will crash in libunwind (accessing a memory area which was unmapped)
106104
// This lock is acquired by the signal-based profiler (see StackSamplerLoop->StackSamplerLoopManager)
107105
pThreadInfo->GetStackWalkLock().Acquire();
108106

107+
_plibrariesInfo->UpdateCache();
108+
109109
on_leave
110110
{
111111
pThreadInfo->GetStackWalkLock().Release();
@@ -143,6 +143,8 @@ StackSnapshotResultBuffer* LinuxStackFramesCollector::CollectStackSampleImplemen
143143
}
144144
};
145145

146+
_plibrariesInfo->UpdateCache();
147+
146148
std::unique_lock<std::mutex> stackWalkInProgressLock(s_stackWalkInProgressMutex);
147149

148150
const auto threadId = static_cast<::pid_t>(pThreadInfo->GetOsThreadId());
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
// <copyright file="DlIteratePhdrDeadlock.cs" company="Datadog">
2+
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2022 Datadog, Inc.
4+
// </copyright>
5+
6+
using Datadog.Profiler.IntegrationTests.Helpers;
7+
using Datadog.Profiler.SmokeTests;
8+
using Xunit;
9+
using Xunit.Abstractions;
10+
11+
namespace Datadog.Profiler.IntegrationTests.LinuxOnly
12+
{
13+
[Trait("Category", "LinuxOnly")]
14+
public class DlIteratePhdrDeadlock
15+
{
16+
private const string ScenarioLinuxDliteratePhdrDeadlock = "--scenario 27";
17+
private readonly ITestOutputHelper _output;
18+
19+
public DlIteratePhdrDeadlock(ITestOutputHelper output)
20+
{
21+
_output = output;
22+
}
23+
24+
[TestAppFact("Samples.Computer01")]
25+
public void CheckApplicationDoesNotEndUpInDeadlock(string appName, string framework, string appAssembly)
26+
{
27+
var runner = new SmokeTestRunner(appName, framework, appAssembly, ScenarioLinuxDliteratePhdrDeadlock, _output);
28+
runner.RunAndCheck();
29+
}
30+
}
31+
}

0 commit comments

Comments
 (0)