The following describes a debugging session to narrow down a systemd's watchdog timeout in a process which apparently behaves correctly and sends its watchdog refresh on a regular period.
The systemd service starts a process supposed to refresh the systemd watchdog at least every 30 seconds:
[Unit]
Description=XXX Manager SERVICE
Wants=...
After=...
Requires=...
[Service]
User=power
Group=power
Type=notify
AmbientCapabilities=CAP_DAC_OVERRIDE CAP_NET_RAW CAP_NET_ADMIN
RemainAfterExit=no
ExecStart= /usr/bin/XXX-Manager
Restart=on-failure
TimeoutStartSec=30s
WatchdogSec=30s
[Install]
WantedBy=network.target
WantedBy=multi-user.target
|
The source code of the process calls sd_notify() every 15 seconds (the systemd's documentation recommends to refresh the watchdog every half period to make sure to be on time). Hence the following loop in the main function of the source code:
[...]
// man systemd.service recommends notifying every half time of max
watchdogNotifyIntervalUsec = watchdogNotifyIntervalUsec / 2;
[...]
do{
[...]
notif_state_count ++;
if( notif_state_count == (watchdogNotifyIntervalUsec/1000000)) //Count until WatchDogTime/2
{
notif_state_count = 0 ;
[...]
sd_notify(0, "WATCHDOG=1");
}
}
sleep(1);
}while(...);
[...]
|
The above source code does not check the returned code from sd_notify() as recommended in the "Return value" paragraph of the documentation: "In order to support both service managers that implement this scheme and those which do not, it is generally recommended to ignore the return value of this call". The possible returned codes are not documented at all.
Despite the above correct source code, after some minutes, systemd kills the process with a SIGABRT signal claiming that it did not received a watchdog refresh message:
[ 1186.549392] systemd[1]: XXX-Manager.service: Watchdog timeout (limit 30s)! |
It is possible to spy the watchdog refresh messages received by systemd with strace tool. The message is sent over a Unix socket with UCRED ancillary data containing the pid of the sender process. Hence, we can capture the refresh messages received by systemd (pid = 1) from the process with something like this:
# pidof XXX-Manager 13118 # strace -tt -p1 2>&1 | grep "pid=13118" [root@sa415m system]# strace -tt -p1 2>&1 | grep "pid=13118" 21:11:14.039993 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 21:11:29.040413 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 21:11:44.041183 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 21:11:59.041363 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 21:12:14.039749 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 21:12:29.039450 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 21:12:44.040138 recvmsg(15, {msg_name(0)=NULL, msg_iov(1)=[{"WATCHDOG=1", 4096}], msg_controllen=24, [{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, {pid=13118, uid=2936, gid=2936}}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10 [ 2325.018719] systemd[1]: XXX-Manager.service: Watchdog timeout (limit 30s)! 21:13:20.264686 waitid(P_ALL, 0, {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=13118, si_uid=2936, si_status=SIGABRT, si_utime=0, si_stime=0}, WNOHANG|WEXITED|WNOWAIT, NULL) = 0 21:13:20.560058 waitid(P_PID, 13118, {si_signo=SIGCHLD, si_code=CLD_DUMPED, si_pid=13118, si_uid=2936, si_status=SIGABRT, si_utime=0, si_stime=0}, WEXITED, NULL) = 0 |
The above traces shows the watchdog refresh message every 15 seconds (cf. the timestamp on the left column). But at one moment, systemd does not receive it as we can see the trace on the end of the process at 21:13:20 which is 36 seconds after the last captured refresh message (at 21:12:44). As the configured period is 30 seconds, there is really an issue. To debug it, we decided to check the returned code from sd_notify() and display it in a log file. We realized that the returned value is 1 and at one moment, the service fails with a returned code equal to -24. The systemd-F¢s documentation does not provide any information of those values. So, we looked at the source code of sd_notify() which is quite simple (file src/libsystemd/sd-daemon/sd-daemon.c):
[...] _public_ int sd_pid_notify_with_fds( pid_t pid, int unset_environment, const char *state, const int *fds, unsigned n_fds) { union sockaddr_union sockaddr = {}; struct iovec iovec; struct msghdr msghdr = { .msg_iov = &iovec, .msg_iovlen = 1, .msg_name = &sockaddr, }; _cleanup_close_ int fd = -1; struct cmsghdr *cmsg = NULL; const char *e; bool send_ucred; int r, salen; if (!state) { r = -EINVAL; goto finish; } if (n_fds > 0 && !fds) { r = -EINVAL; goto finish; } e = getenv("NOTIFY_SOCKET"); if (!e) return 0; salen = sockaddr_un_set_path(&sockaddr.un, e); if (salen < 0) { r = salen; goto finish; } fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); if (fd < 0) { r = -errno; goto finish; } (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); iovec = IOVEC_MAKE_STRING(state); msghdr.msg_namelen = salen; send_ucred = (pid != 0 && pid != getpid_cached()) || getuid() != geteuid() || getgid() != getegid(); if (n_fds > 0 || send_ucred) { /* CMSG_SPACE(0) may return value different than zero, which results in miscalculated controllen. */ msghdr.msg_controllen = (n_fds > 0 ? CMSG_SPACE(sizeof(int) * n_fds) : 0) + (send_ucred ? CMSG_SPACE(sizeof(struct ucred)) : 0); msghdr.msg_control = alloca0(msghdr.msg_controllen); cmsg = CMSG_FIRSTHDR(&msghdr); if (n_fds > 0) { cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN(sizeof(int) * n_fds); memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * n_fds); if (send_ucred) assert_se(cmsg = CMSG_NXTHDR(&msghdr, cmsg)); } if (send_ucred) { struct ucred *ucred; cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_CREDENTIALS; cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); ucred = (struct ucred*) CMSG_DATA(cmsg); ucred->pid = pid != 0 ? pid : getpid_cached(); ucred->uid = getuid(); ucred->gid = getgid(); } } /* First try with fake ucred data, as requested */ if (sendmsg(fd, &msghdr, MSG_NOSIGNAL) >= 0) { r = 1; goto finish; } /* If that failed, try with our own ucred instead */ if (send_ucred) { msghdr.msg_controllen -= CMSG_SPACE(sizeof(struct ucred)); if (msghdr.msg_controllen == 0) msghdr.msg_control = NULL; if (sendmsg(fd, &msghdr, MSG_NOSIGNAL) >= 0) { r = 1; goto finish; } } r = -errno; finish: if (unset_environment) unsetenv("NOTIFY_SOCKET"); return r; } _public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state) { return sd_pid_notify_with_fds(pid, unset_environment, state, NULL, 0); } _public_ int sd_notify(int unset_environment, const char *state) { return sd_pid_notify_with_fds(0, unset_environment, state, NULL, 0); } [...] |
The above source code shows that a socket is systematically created to send the message to systemd. If the call to socket() fails, the service returns -errno. In the Linux header files, 24 is EMFILE = Too many open files. We check the opened files of the process in /proc/<pid>/fd and we could see that it opens up to 1024 files which is the maximum allowed for a process on this system (the process is multi-threaded). This is why socket() fails and sd_notify() returns -24 (i.E. -EMFILE).
As a conclusion, because we don't check the return code of sd_notify() as recommended by systemd's documentation, we got an issue that was quite difficult to understand because the watchdog refresh was not the real issue. It was merely the consequence of a file descriptor leak.
The author is an engineer in computer sciences located in France. He can be contacted here.