Skip to content

Commit abda34b

Browse files
tonyhutterbehlendorf
authored andcommitted
CI: Add ZTS -O option, log Setup Testing Machines step
Add a -O option to zfs-test.sh to dump debug information on test timeout. The debug info includes: - 30 lines from 'top' - /proc/<PID>/stack output of process with highest CPU usage - Last lines strace-ing process with highest CPU usage - /proc/sysrq-trigger kernel stack traces All debug information gets dumped to /dev/kmsg (Linux only). In addition, print out the VM console lines from the "Setup Testing Machines" step. We have often see VMs timeout at this step and don't know why. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Tony Hutter <[email protected]> Closes #17753
1 parent 9079f98 commit abda34b

File tree

5 files changed

+93
-13
lines changed

5 files changed

+93
-13
lines changed

.github/workflows/scripts/qemu-3-deps-vm.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ function archlinux() {
2020
sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \
2121
fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \
2222
parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \
23-
samba sysstat rng-tools rsync wget xxhash
23+
samba strace sysstat rng-tools rsync wget xxhash
2424
echo "##[endgroup]"
2525
}
2626

@@ -43,7 +43,8 @@ function debian() {
4343
lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \
4444
python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \
4545
python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \
46-
rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev
46+
rsync samba strace sysstat uuid-dev watchdog wget xfslibs-dev xxhash \
47+
zlib1g-dev
4748
echo "##[endgroup]"
4849
}
4950

@@ -87,8 +88,8 @@ function rhel() {
8788
libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \
8889
parted perf python3 python3-cffi python3-devel python3-packaging \
8990
kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \
90-
rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \
91-
zlib-devel
91+
rpm-build rsync samba strace sysstat systemd watchdog wget xfsprogs-devel \
92+
xxhash zlib-devel
9293
echo "##[endgroup]"
9394
}
9495

.github/workflows/scripts/qemu-5-setup.sh

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,19 +108,30 @@ echo '*/5 * * * * /root/cronjob.sh' > crontab.txt
108108
sudo crontab crontab.txt
109109
rm crontab.txt
110110

111-
# check if the machines are okay
112-
echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)"
113-
for ((i=1; i<=VMs; i++)); do
114-
.github/workflows/scripts/qemu-wait-for-vm.sh vm$i
115-
done
116-
echo "All $VMs VMs are up now."
117-
118111
# Save the VM's serial output (ttyS0) to /var/tmp/console.txt
119112
# - ttyS0 on the VM corresponds to a local /dev/pty/N entry
120113
# - use 'virsh ttyconsole' to lookup the /dev/pty/N entry
121114
for ((i=1; i<=VMs; i++)); do
122115
mkdir -p $RESPATH/vm$i
123116
read "pty" <<< $(sudo virsh ttyconsole vm$i)
117+
118+
# Create the file so we can tail it, even if there's no output.
119+
touch $RESPATH/vm$i/console.txt
120+
124121
sudo nohup bash -c "cat $pty > $RESPATH/vm$i/console.txt" &
122+
123+
# Write all VM boot lines to the console to aid in debugging failed boots.
124+
# The boot lines from all the VMs will be munged together, so prepend each
125+
# line with the vm hostname (like 'vm1:').
126+
(while IFS=$'\n' read -r line; do echo "vm$i: $line" ; done < <(sudo tail -f $RESPATH/vm$i/console.txt)) &
127+
125128
done
126129
echo "Console logging for ${VMs}x $OS started."
130+
131+
132+
# check if the machines are okay
133+
echo "Waiting for vm's to come up... (${VMs}x CPU=$CPU RAM=$RAM)"
134+
for ((i=1; i<=VMs; i++)); do
135+
.github/workflows/scripts/qemu-wait-for-vm.sh vm$i
136+
done
137+
echo "All $VMs VMs are up now."

.github/workflows/scripts/qemu-6-tests.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ fi
111111
sudo dmesg -c > dmesg-prerun.txt
112112
mount > mount.txt
113113
df -h > df-prerun.txt
114-
$TDIR/zfs-tests.sh -vK -s 3GB -T $TAGS
114+
$TDIR/zfs-tests.sh -vKO -s 3GB -T $TAGS
115115
RV=$?
116116
df -h > df-postrun.txt
117117
echo $RV > tests-exitcode.txt

scripts/zfs-tests.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ DEBUG=""
3838
CLEANUP="yes"
3939
CLEANUPALL="no"
4040
KMSG=""
41+
TIMEOUT_DEBUG=""
4142
LOOPBACK="yes"
4243
STACK_TRACER="no"
4344
FILESIZE="4G"
@@ -364,6 +365,7 @@ OPTIONS:
364365
-k Disable cleanup after test failure
365366
-K Log test names to /dev/kmsg
366367
-f Use files only, disables block device tests
368+
-O Dump debugging info to /dev/kmsg on test timeout
367369
-S Enable stack tracer (negative performance impact)
368370
-c Only create and populate constrained path
369371
-R Automatically rerun failing tests
@@ -402,7 +404,7 @@ $0 -x
402404
EOF
403405
}
404406

405-
while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
407+
while getopts 'hvqxkKfScRmOn:d:Ds:r:?t:T:u:I:' OPTION; do
406408
case $OPTION in
407409
h)
408410
usage
@@ -445,6 +447,9 @@ while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
445447
export NFS=1
446448
. "$nfsfile"
447449
;;
450+
O)
451+
TIMEOUT_DEBUG="yes"
452+
;;
448453
d)
449454
FILEDIR="$OPTARG"
450455
;;
@@ -773,6 +778,7 @@ msg "${TEST_RUNNER}" \
773778
"${DEBUG:+-D}" \
774779
"${KMEMLEAK:+-m}" \
775780
"${KMSG:+-K}" \
781+
"${TIMEOUT_DEBUG:+-O}" \
776782
"-c \"${RUNFILES}\"" \
777783
"-T \"${TAGS}\"" \
778784
"-i \"${STF_SUITE}\"" \
@@ -783,6 +789,7 @@ msg "${TEST_RUNNER}" \
783789
${DEBUG:+-D} \
784790
${KMEMLEAK:+-m} \
785791
${KMSG:+-K} \
792+
${TIMEOUT_DEBUG:+-O} \
786793
-c "${RUNFILES}" \
787794
-T "${TAGS}" \
788795
-i "${STF_SUITE}" \

tests/test-runner/bin/test-runner.py.in

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ from select import select
3434
from subprocess import PIPE
3535
from subprocess import Popen
3636
from subprocess import check_output
37+
from subprocess import run
3738
from threading import Timer
3839
from time import time, CLOCK_MONOTONIC
3940
from os.path import exists
@@ -187,6 +188,63 @@ User: %s
187188
''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user)
188189

189190
def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False):
191+
192+
"""
193+
We're about to kill a command due to a timeout.
194+
If we're running with the -O option, then dump debug info about the
195+
process with the highest CPU usage to /dev/kmsg (Linux only). This can
196+
help debug the timeout.
197+
198+
Debug info includes:
199+
- 30 lines from 'top'
200+
- /proc/<PID>/stack output of process with highest CPU usage
201+
- Last lines strace-ing process with highest CPU usage
202+
"""
203+
if exists("/dev/kmsg"):
204+
c = """
205+
TOP_OUT="$(COLUMNS=160 top -b -n 1 | head -n 30)"
206+
read -r PID CMD <<< $(echo "$TOP_OUT" | /usr/bin/awk \
207+
"/COMMAND/{
208+
print_next=1
209+
next
210+
}
211+
{
212+
if (print_next == 1) {
213+
print \\$1\\" \\"\\$12
214+
exit
215+
}
216+
}")
217+
echo "##### ZTS timeout debug #####"
218+
echo "----- top -----"
219+
echo "$TOP_OUT"
220+
echo "----- /proc/$PID/stack ($CMD)) -----"
221+
cat /proc/$PID/stack
222+
echo "----- strace ($CMD) -----"
223+
TMPFILE="$(mktemp --suffix=ZTS)"
224+
/usr/bin/strace -k --stack-traces -p $PID &> "$TMPFILE" &
225+
sleep 0.1
226+
killall strace
227+
tail -n 30 $TMPFILE
228+
rm "$TMPFILE"
229+
echo "##### /proc/sysrq-trigger stack #####"
230+
"""
231+
c = "sudo bash -c '" + c + "'"
232+
data = run(c, capture_output=True, shell=True, text=True)
233+
out = data.stdout
234+
try:
235+
kp = Popen([SUDO, "sh", "-c",
236+
"echo '" + out + "' > /dev/kmsg"])
237+
kp.wait()
238+
239+
"""
240+
Trigger kernel stack traces
241+
"""
242+
kp = Popen([SUDO, "sh", "-c",
243+
"echo l > /proc/sysrq-trigger"])
244+
kp.wait()
245+
except Exception:
246+
pass
247+
190248
"""
191249
Kill a running command due to timeout, or ^C from the keyboard. If
192250
sudo is required, this user was verified previously.
@@ -1129,6 +1187,9 @@ def parse_args():
11291187
parser.add_option('-o', action='callback', callback=options_cb,
11301188
default=BASEDIR, dest='outputdir', type='string',
11311189
metavar='outputdir', help='Specify an output directory.')
1190+
parser.add_option('-O', action='store_true', default=False,
1191+
dest='timeout_debug',
1192+
help='Dump debugging info to /dev/kmsg on test timeout')
11321193
parser.add_option('-i', action='callback', callback=options_cb,
11331194
default=TESTDIR, dest='testdir', type='string',
11341195
metavar='testdir', help='Specify a test directory.')

0 commit comments

Comments
 (0)