Last update: 09-Apr-2022
Author: R. Koucha
Bothering ioctl() calls in systemd
Introduction

In a project, we observed that systemd continuously calls ioctl(...TCGETS...) upon watchdog refreshes: there are 9 calls per refresh. Moreover, those calls fail with the ENOTTY errno. The issue has been narrowed down: it was introduced in recent versions of systemd, as former versions do not make those useless ioctl() calls. The problem appears at least in version 243. On some systems running version 237, the issue does not show up.

On a board equipped with an ARM V7 CPU, each of those calls may cost from 300 us up to ~2 ms! This is not negligible as we have 9 calls upon each watchdog refresh. This means that we can consume from 2.7 ms to 18 ms of CPU time for nothing upon each watchdog refresh!

Looking at the systemd code, we can see that reading the file /proc/<pid>/cgroup (where <pid> is the identifier of the process refreshing its watchdog) triggers a call to ioctl(...TCGETS...) for each line. As the file has 9 lines, we get 9 calls to ioctl(). These calls come from a centralized service which reads a line from any file, whether it is a tty or not. For each line, this service calls isatty(), which calls tcgetattr(), which in turn triggers the ioctl() system call.

We can plan a patch to change this behaviour.

Observation

When we spy systemd with strace, we can see that it receives numerous watchdog refreshes. The system call sequence triggered upon those refreshes is:

  1. epoll() to wait for a message
  2. recvmsg() to receive the message
  3. open(/proc/<pid>/cgroup) where pid is the pid of the process which sent the watchdog refresh
  4. read(content of /proc/<pid>/cgroup)
  5. 9 calls to ioctl(...TCGETS...) which return -1 with errno = ENOTTY
  6. close(/proc/<pid>/cgroup)
# strace -tt -fp 1
strace: Process 1 attached
12:16:02.589103 epoll_wait(4, [{events=EPOLLIN, data={u32=1404960, u64=1404960}}], 149, -1) = 1
12:16:03.364571 clock_gettime(CLOCK_BOOTTIME, {tv_sec=18146, tv_nsec=17605315}) = 0
12:16:03.365663 recvmsg(15, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="WATCHDOG=1", iov_len=4096}], msg_iovlen=1, msg_control=[{cmsg_len=24, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=1379, uid=2926, gid=2926}}], msg_controllen=24, msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10
12:16:03.366434 openat(AT_FDCWD, "/proc/1379/cgroup", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 11
12:16:03.367548 fstat64(11, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
12:16:03.368033 read(11, "9:perf_event:/\n8:blkio:/\n7:devic"..., 1024) = 223
12:16:03.368311 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.370182 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.370494 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.371221 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.371727 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.371956 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.372352 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.372480 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.373048 ioctl(11, TCGETS, 0xbed5cc64) = -1 ENOTTY (Inappropriate ioctl for device)
12:16:03.373793 read(11, "", 1024) = 0
12:16:03.376118 close(11) = 0
[...]
Analysis

We can see lots of calls to ioctl(11, TCGETS, 0xbed5cc64). File descriptor 11 is the result of openat(AT_FDCWD, "/proc/<pid>/cgroup", O_RDONLY|O_LARGEFILE|O_CLOEXEC) where <pid> is the process identifier of the process which just sent its watchdog refresh message.

We can see as many calls to ioctl() as there are lines in /proc/<pid>/cgroup (i.e. 9):

# cat /proc/1925/cgroup
9:perf_event:/
8:blkio:/
7:devices:/system.slice/locationmgr.service
6:cpu,cpuacct:/
5:freezer:/
4:debug:/
3:net_cls,net_prio:/
2:memory:/system.slice/locationmgr.service
1:name=systemd:/system.slice/locationmgr.service

This suggests that there is one call per line read from /proc/<pid>/cgroup.

In the systemd source code, we don't see direct calls to ioctl(...TCGETS...). So, it must be called by a library used by systemd. It is likely the GLIBC, where we find such a call in tcgetattr(), which is itself called by isatty():

Here is the source code of tcgetattr(), which calls ioctl(), in sysdeps/unix/sysv/linux/tcgetattr.c:

[...]
/* Put the state of FD into *TERMIOS_P.
   Issues the TCGETS ioctl on FD and, when it succeeds, converts the
   kernel's termios layout (struct __kernel_termios) into the caller's
   struct termios.  Returns the value produced by INLINE_SYSCALL;
   *TERMIOS_P is only written when that value is 0.  */
int
__tcgetattr (int fd, struct termios *termios_p)
{
  struct __kernel_termios k_termios;

  int retval;

  /* This is the ioctl (fd, TCGETS, ...) system call itself.  */
  retval = INLINE_SYSCALL (ioctl, 3, fd, TCGETS, &k_termios);

  if (__glibc_likely (retval == 0))
    {
      /* Field-by-field copy of the flag words and line discipline.  */
      termios_p->c_iflag = k_termios.c_iflag;
      termios_p->c_oflag = k_termios.c_oflag;
      termios_p->c_cflag = k_termios.c_cflag;
      termios_p->c_lflag = k_termios.c_lflag;
      termios_p->c_line = k_termios.c_line;

      /* Input speed: taken from the kernel structure when _HAVE_C_ISPEED
         is defined, otherwise derived from the baud bits of c_cflag.  */
#ifdef _HAVE_STRUCT_TERMIOS_C_ISPEED
# ifdef _HAVE_C_ISPEED
      termios_p->c_ispeed = k_termios.c_ispeed;
# else
      termios_p->c_ispeed = k_termios.c_cflag & (CBAUD | CBAUDEX);
# endif
#endif

      /* Output speed: same scheme as for the input speed.  */
#ifdef _HAVE_STRUCT_TERMIOS_C_OSPEED
# ifdef _HAVE_C_OSPEED
      termios_p->c_ospeed = k_termios.c_ospeed;
# else
      termios_p->c_ospeed = k_termios.c_cflag & (CBAUD | CBAUDEX);
# endif
#endif

      /* Copy the __KERNEL_NCCS control characters coming from the kernel
         and fill the remaining NCCS - __KERNEL_NCCS entries with
         _POSIX_VDISABLE.  The fill is done with memset when the fill value
         can be expressed as a repeated byte (one-byte cc_t, or a value of
         0, or a value whose low byte is 0xff); otherwise it is done one
         element at a time.  */
      if (sizeof (cc_t) == 1 || _POSIX_VDISABLE == 0
        || (unsigned char) _POSIX_VDISABLE == (unsigned char) -1)
      memset (__mempcpy (&termios_p->c_cc[0], &k_termios.c_cc[0],
                     __KERNEL_NCCS * sizeof (cc_t)),
            _POSIX_VDISABLE, (NCCS - __KERNEL_NCCS) * sizeof (cc_t));
      else
      {
        memcpy (&termios_p->c_cc[0], &k_termios.c_cc[0],

              __KERNEL_NCCS * sizeof (cc_t));

        for (size_t cnt = __KERNEL_NCCS; cnt < NCCS; ++cnt)
          termios_p->c_cc[cnt] = _POSIX_VDISABLE;
      }

    }

  return retval;
}

weak_alias (__tcgetattr, tcgetattr)

Here is the source code of isatty(), which calls tcgetattr(), in sysdeps/posix/isatty.c:

[...]
#include <unistd.h>
#include <termios.h>

/* Tell whether FD refers to a terminal: the answer is 1 when the terminal
   attributes of FD can be fetched with __tcgetattr, and 0 when they
   cannot.  */
int
__isatty (int fd)
{
  struct termios attrs;
  int rc;

  rc = __tcgetattr (fd, &attrs);

  return rc == 0;
}

weak_alias (__isatty, isatty)

Upon receipt of a watchdog refresh, systemd is in function manager_dispatch_notify_fd() defined in src/core/manager.c:

       /* Notify every unit that might be interested, which might be multiple. */
       u1 = manager_get_unit_by_pid_cgroup(m, ucred->pid);

The latter is defined in src/core/cgroup.c:

/* Finds the unit a process belongs to by looking up the process' cgroup path for SYSTEMD_CGROUP_CONTROLLER
 * and mapping that path to a unit. Returns NULL if the PID is not valid or if its cgroup path cannot be
 * determined. */
Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) {
        _cleanup_free_ char *cgroup_path = NULL;
        int r;

        assert(m);

        if (!pid_is_valid(pid))
                return NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup_path);
        if (r < 0)
                return NULL;

        return manager_get_unit_by_cgroup(m, cgroup_path);
}

The latter builds the /proc/<pid>/cgroup pathname and opens/reads it with the call to cg_pid_get_path() defined in src/basic/cgroup-util.c. The goal is to catch the line with the controller field equal to "name=systemd" in order to get the pathname of the service file which launched the process (i.e. here it is a service called locationmgr.service):

/* Looks up the cgroup path of process 'pid' for the given controller by parsing the process' "cgroup" file
 * in procfs line by line. If 'controller' is NULL, SYSTEMD_CGROUP_CONTROLLER is used.
 *
 * On success returns 0 and stores a newly allocated string in *path (a trailing " (deleted)" suffix is cut
 * off). Returns -EINVAL for an invalid controller name, -ESRCH if the file does not exist, -ENOMEM if the
 * path cannot be duplicated, -ENODATA if no matching line is found, or the negative error reported by the
 * helpers called here. */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {

        _cleanup_fclose_ FILE *f = NULL;
        const char *fs, *controller_str;
        int unified, r;
        size_t cs = 0;

        assert(path);
        assert(pid >= 0);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Not unified: remember the controller name to look for, and its length, for the comparison done on
         * each line below. The systemd controller is searched under its legacy name. */
        if (unified == 0) {
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        r = fopen_unlocked(fs, "re", &f);
        if (r == -ENOENT)
               return -ESRCH;

        if (r < 0)
                return r; 

        for (;;) {
                _cleanup_free_ char *line = NULL;
                char *e, *p;

                /* One read_line() call per line of the file; each call ends up in read_line_full() */
                r = read_line(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;

                /* 0 means EOF: no matching line was found */
                if (r == 0)
                        break;

                if (unified) {
                        /* Only the line starting with "0:" is of interest; move e to the next ':', the
                         * one right in front of the path */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;

                } else {

                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        /* Lines look like "7:devices:/system.slice/foo.service": l is set to the start of
                         * the second field (the controller list) and e to the ':' that ends it */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        /* Terminate the controller list in place so that it can be split on "," */
                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }

                        if (!found)
                                continue;
                }

                /* Everything after the ':' pointed to by e is the path */
                p = strdup(e + 1);
                if (!p)
                       return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");

                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        return -ENODATA;
}

In the preceding function, the reading of the file is done line by line by a call to read_line(). The latter is an inlined function defined in src/basic/fileio.h:

static inline int read_line(FILE *f, size_t limit, char **ret) {
        return read_line_full(f, limit, 0, ret);
}

This calls read_line_full() defined in src/basic/fileio.c where we can see the call to isatty():

/* Reads one line from f. If ret is non-NULL the line is returned in *ret as a newly allocated, NUL-terminated
 * string. The comment at the top of the body gives the end-of-line rules and the return values. */
int read_line_full(FILE *f, size_t limit, ReadLineFlags flags, char **ret) {
        size_t n = 0, allocated = 0, count = 0;
        _cleanup_free_ char *buffer = NULL;
        /* tty holds the isatty() result for the duration of this call; -1 means "not checked yet" */
        int r, tty = -1;

        assert(f);

        /* Something like a bounded version of getline().
         *
         * Considers EOF, \n, \r and \0 end of line delimiters (or combinations of these), and does not include these
         * delimiters in the string returned. Specifically, recognizes the following combinations of markers as line
         * endings:
         *
         *       \n        (UNIX)
         *       \r        (old MacOS)
         *       \0        (C strings)
         *       \n\0
         *       \r\0
         *       \r\n      (Windows)
         *       \n\r
         *       \r\n\0
         *       \n\r\0
         *
         * Returns the number of bytes read from the files (i.e. including delimiters — this hence usually differs from
         * the number of characters in the returned string). When EOF is hit, 0 is returned.
         *
         * The input parameter limit is the maximum numbers of characters in the returned string, i.e. excluding
         * delimiters. If the limit is hit we fail and return -ENOBUFS.
         *
         * If a line shall be skipped ret may be initialized as NULL. */

        /* Make sure there is room for at least the terminating NUL that is written at the end */
        if (ret) {
                if (!GREEDY_REALLOC(buffer, allocated, 1))
                        return -ENOMEM;
        }

        {
                /* The stream stays locked for the whole loop. flocked is never read: it only exists so that
                 * its cleanup handler (presumably calling funlockfile()) runs when this scope is left. */
                _unused_ _cleanup_(funlockfilep) FILE *flocked = f;
                EndOfLineMarker previous_eol = EOL_NONE;
                flockfile(f);

                for (;;) {
                        EndOfLineMarker eol;
                        char c;

                        if (n >= limit)
                                return -ENOBUFS;

                        if (count >= INT_MAX) /* We couldn't return the counter anymore as "int", hence refuse this */
                                return -ENOBUFS;
 
                        r = safe_fgetc(f, &c);
                        if (r < 0)
                                return r;

                        if (r == 0) /* EOF is definitely EOL */
                                break;

                        eol = categorize_eol(c, flags);

                        if (FLAGS_SET(previous_eol, EOL_ZERO) ||
                            (eol == EOL_NONE && previous_eol != EOL_NONE) ||
                            (eol != EOL_NONE && (previous_eol & eol) != 0)) {
                                /* Previous char was a NUL? This is not an EOL, but the previous char was? This type of
                                 * EOL marker has been seen right before?  In either of these three cases we are
                                 * done. But first, let's put this character back in the queue. (Note that we have to
                                 * cast this to (unsigned char) here as ungetc() expects a positive 'int', and if we
                                 * are on an architecture where 'char' equals 'signed char' we need to ensure we don't
                                 * pass a negative value here. That said, to complicate things further ungetc() is
                                 * actually happy with most negative characters and implicitly casts them back to
                                 * positive ones as needed, except for \xff (aka -1, aka EOF), which it refuses. What a
                                 * godawful API!) */
                                assert_se(ungetc((unsigned char) c, f) != EOF);
                                break;
                        }

                        count++;

                        if (eol != EOL_NONE) {
                                /* If we are on a tty, we can't wait for more input. But we expect only
                                 * \n as the single EOL marker, so there is no need to wait. We check
                                 * this condition last to avoid isatty() check if not necessary. */

                                /* As tty is local to this function, isatty() is called again by every
                                 * invocation that reaches an EOL marker, i.e. once per line when a file is
                                 * read line by line. */
                                if (tty < 0)
                                        tty = isatty(fileno(f));

                                if (tty > 0)
                                        break;
                        }

                        if (eol != EOL_NONE) {
                                /* Remember which markers were seen and keep reading: the next character may
                                 * still belong to the same line ending (e.g. \r\n) */
                                previous_eol |= eol;
                                continue;
                        }

                        if (ret) {
                                /* n + 2: room for this character plus the terminating NUL */
                                if (!GREEDY_REALLOC(buffer, allocated, n + 2))
                                        return -ENOMEM;

                                buffer[n] = c;
                        }

                        n++;
                }
        }
 
        if (ret) {
                buffer[n] = 0;
                *ret = TAKE_PTR(buffer);
        }

        return (int) count;
}
Behaviour in former versions

In former systemd versions, we didn't have those ioctl() calls. So, this is a change which slows down systemd compared to its former versions.

On the target where the ioctl() problem occurs, we run systemd version 243:

# systemd-analyze --version
systemd 243(243-63-gaad5087+)
-PAM -AUDIT +SELINUX -IMA -APPARMOR -SMACK -SYSVINIT -UTMP -LIBCRYPTSETUP -GCRYPT -GNUTLS -ACL -XZ -LZ4 -SECCOMP +BLKID -ELFUTILS +KMOD -IDN2 -IDN -PCRE2 default-hierarchy=legacy

On a PC running under Ubuntu 18.04, the systemd's version is older (i.e. 237):

$ systemd-analyze --version
systemd 237
+PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD -IDN2 +IDN -PCRE2 default-hierarchy=hybrid

In this version, we don't see the ioctl() calls upon watchdog refreshes:

$ sudo strace -fp 1
strace: Process 1 attached
epoll_wait(4, [{EPOLLIN, {u32=1548051072, u64=94821541044864}}], 95, -1) = 1
recvmsg(54, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="WATCHDOG=1", iov_len=4096}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=1427, uid=101, gid=103}}], msg_controllen=32, msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10
openat(AT_FDCWD, "/proc/1427/cgroup", O_RDONLY|O_CLOEXEC) = 18
fstat(18, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(18, "12:blkio:/system.slice/systemd-r"..., 1024) = 422
close(18)                               = 0
timerfd_settime(22, TFD_TIMER_ABSTIME, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=1018163, tv_nsec=625556000}}, NULL) = 0
epoll_wait(4, [{EPOLLIN, {u32=1548051072, u64=94821541044864}}], 95, -1) = 1
recvmsg(54, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="WATCHDOG=1", iov_len=4096}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=1901, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10
openat(AT_FDCWD, "/proc/1901/cgroup", O_RDONLY|O_CLOEXEC) = 18
fstat(18, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(18, "12:blkio:/system.slice/snapd.ser"..., 1024) = 345
close(18)                               = 0
epoll_wait(4, [{EPOLLIN, {u32=1548051072, u64=94821541044864}}], 95, -1) = 1
recvmsg(54, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="WATCHDOG=1", iov_len=4096}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=784, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10
openat(AT_FDCWD, "/proc/784/cgroup", O_RDONLY|O_CLOEXEC) = 18
fstat(18, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(18, "12:blkio:/system.slice/systemd-u"..., 1024) = 401
close(18)                               = 0
epoll_wait(4, [{EPOLLIN, {u32=1548051072, u64=94821541044864}}], 95, -1) = 1
recvmsg(54, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="WATCHDOG=1", iov_len=4096}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=2110, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CMSG_CLOEXEC}, MSG_TRUNC|MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 10
openat(AT_FDCWD, "/proc/2110/cgroup", O_RDONLY|O_CLOEXEC) = 18
fstat(18, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(18, "12:blkio:/system.slice/systemd-l"..., 1024) = 408
close(18)                               = 0
timerfd_settime(22, TFD_TIMER_ABSTIME, {it_interval={tv_sec=0, tv_nsec=0}, it_value={tv_sec=1018223, tv_nsec=625556000}}, NULL) = 0
epoll_wait(4, ^Cstrace: Process 1 detached
 <detached ...>
Proposed solution

The flags passed to read_line_full() could define a new bit called READ_LINE_NOTTY (in src/basic/fileio.h) in order to differentiate inputs from files and inputs from terminals:

/* Flags accepted by read_line_full(). Each flag has its own bit so that several can be combined. */
typedef enum ReadLineFlags {
        /* Passed on to the end-of-line categorization; presumably restricts line endings to NUL */
        READ_LINE_ONLY_NUL = 1 << 0,
        /* The stream is known not to be a terminal: read_line_full() must not call isatty() on it */
        READ_LINE_NOTTY    = 1 << 1,
} ReadLineFlags;

A service called read_line_notty() could be added to src/basic/fileio.h to specify this new bit:


static inline int read_line_notty(FILE *f, size_t limit, char **ret) {
        return read_line_full(f, limit, READ_LINE_NOTTY, ret);
}

And the function read_line_full() would take this bit into account to avoid calling isatty(), and hence ioctl(), in src/basic/fileio.c:

/* Reads one line from f. If ret is non-NULL the line is returned in *ret as a newly allocated, NUL-terminated
 * string. The comment at the top of the body gives the end-of-line rules and the return values. */
int read_line_full(FILE *f, size_t limit, ReadLineFlags flags, char **ret) {
        size_t n = 0, allocated = 0, count = 0;
        _cleanup_free_ char *buffer = NULL;
        /* tty holds the isatty() result for the duration of this call; -1 means "not checked yet" */
        int r, tty = -1;

        assert(f);

        /* Something like a bounded version of getline().
         *
         * Considers EOF, \n, \r and \0 end of line delimiters (or combinations of these), and does not include these
         * delimiters in the string returned. Specifically, recognizes the following combinations of markers as line
         * endings:
         *
         *       \n        (UNIX)
         *       \r        (old MacOS)
         *       \0        (C strings)
         *       \n\0
         *       \r\0
         *       \r\n      (Windows)
         *       \n\r
         *       \r\n\0
         *       \n\r\0
         *
         * Returns the number of bytes read from the files (i.e. including delimiters — this hence usually differs from
         * the number of characters in the returned string). When EOF is hit, 0 is returned.
         *
         * The input parameter limit is the maximum numbers of characters in the returned string, i.e. excluding
         * delimiters. If the limit is hit we fail and return -ENOBUFS.
         *
         * If READ_LINE_NOTTY is set in flags, the stream is never probed with isatty().
         *
         * If a line shall be skipped ret may be initialized as NULL. */

        /* Make sure there is room for at least the terminating NUL that is written at the end */
        if (ret) {
                if (!GREEDY_REALLOC(buffer, allocated, 1))
                        return -ENOMEM;
        }

        {
                /* The stream stays locked for the whole loop. flocked is never read: it only exists so that
                 * its cleanup handler (presumably calling funlockfile()) runs when this scope is left. */
                _unused_ _cleanup_(funlockfilep) FILE *flocked = f;
                EndOfLineMarker previous_eol = EOL_NONE;
                flockfile(f);

                for (;;) {
                        EndOfLineMarker eol;
                        char c;

                        if (n >= limit)
                                return -ENOBUFS;

                        if (count >= INT_MAX) /* We couldn't return the counter anymore as "int", hence refuse this */
                                return -ENOBUFS;

                        r = safe_fgetc(f, &c);
                        if (r < 0)
                                return r;

                        if (r == 0) /* EOF is definitely EOL */
                                break;

                        eol = categorize_eol(c, flags);

                        if (FLAGS_SET(previous_eol, EOL_ZERO) ||
                            (eol == EOL_NONE && previous_eol != EOL_NONE) ||
                            (eol != EOL_NONE && (previous_eol & eol) != 0)) {
                                /* Previous char was a NUL? This is not an EOL, but the previous char was? This type of
                                 * EOL marker has been seen right before?  In either of these three cases we are
                                 * done. But first, let's put this character back in the queue. (Note that we have to
                                 * cast this to (unsigned char) here as ungetc() expects a positive 'int', and if we
                                 * are on an architecture where 'char' equals 'signed char' we need to ensure we don't
                                 * pass a negative value here. That said, to complicate things further ungetc() is
                                 * actually happy with most negative characters and implicitly casts them back to
                                 * positive ones as needed, except for \xff (aka -1, aka EOF), which it refuses. What a
                                 * godawful API!) */
                                assert_se(ungetc((unsigned char) c, f) != EOF);
                                break;
                        }

                        count++;

                        /* With READ_LINE_NOTTY the caller states that the stream is not a terminal, so the
                         * whole isatty() probe below is skipped. */
                        if (eol != EOL_NONE && !FLAGS_SET(flags, READ_LINE_NOTTY)) {
                                /* If we are on a tty, we can't wait for more input. But we expect only
                                 * \n as the single EOL marker, so there is no need to wait. We check
                                 * this condition last to avoid isatty() check if not necessary. */

                                if (tty < 0)
                                        tty = isatty(fileno(f));

                                if (tty > 0)
                                        break;
                        }

                        if (eol != EOL_NONE) {
                                /* Remember which markers were seen and keep reading: the next character may
                                 * still belong to the same line ending (e.g. \r\n) */
                                previous_eol |= eol;
                                continue;
                        }

                        if (ret) {
                                /* n + 2: room for this character plus the terminating NUL */
                                if (!GREEDY_REALLOC(buffer, allocated, n + 2))
                                        return -ENOMEM;

                                buffer[n] = c;
                        }

                        n++;
                }
        }

        if (ret) {
                buffer[n] = 0;

                *ret = TAKE_PTR(buffer);
        }

        return (int) count;
}

Finally, we would replace the call to read_line() by read_line_notty() in cg_pid_get_path() defined in src/basic/cgroup-util.c:

/* Looks up the cgroup path of process 'pid' for the given controller by parsing the process' "cgroup" file
 * in procfs line by line. If 'controller' is NULL, SYSTEMD_CGROUP_CONTROLLER is used.
 *
 * On success returns 0 and stores a newly allocated string in *path (a trailing " (deleted)" suffix is cut
 * off). Returns -EINVAL for an invalid controller name, -ESRCH if the file does not exist, -ENOMEM if the
 * path cannot be duplicated, -ENODATA if no matching line is found, or the negative error reported by the
 * helpers called here. */
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {

        _cleanup_fclose_ FILE *f = NULL;
        const char *fs, *controller_str;
        int unified, r;
        size_t cs = 0;

        assert(path);
        assert(pid >= 0);

        if (controller) {
                if (!cg_controller_is_valid(controller))
                        return -EINVAL;
        } else
                controller = SYSTEMD_CGROUP_CONTROLLER;

        unified = cg_unified_controller(controller);
        if (unified < 0)
                return unified;

        /* Not unified: remember the controller name to look for, and its length, for the comparison done on
         * each line below. The systemd controller is searched under its legacy name. */
        if (unified == 0) {
                if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
                        controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
                else
                        controller_str = controller;

                cs = strlen(controller_str);
        }

        fs = procfs_file_alloca(pid, "cgroup");
        r = fopen_unlocked(fs, "re", &f);
        if (r == -ENOENT)
               return -ESRCH;

        if (r < 0)
                return r; 

        for (;;) {
                _cleanup_free_ char *line = NULL;
                char *e, *p;

                /* The procfs file is not a terminal: use the variant that passes READ_LINE_NOTTY so that no
                 * isatty() check is made for each line */
                r = read_line_notty(f, LONG_LINE_MAX, &line);
                if (r < 0)
                        return r;

                /* 0 means EOF: no matching line was found */
                if (r == 0)
                        break;

                if (unified) {
                        /* Only the line starting with "0:" is of interest; move e to the next ':', the
                         * one right in front of the path */
                        e = startswith(line, "0:");
                        if (!e)
                                continue;

                        e = strchr(e, ':');
                        if (!e)
                                continue;

                } else {

                        char *l;
                        size_t k;
                        const char *word, *state;
                        bool found = false;

                        /* Lines look like "7:devices:/system.slice/foo.service": l is set to the start of
                         * the second field (the controller list) and e to the ':' that ends it */
                        l = strchr(line, ':');
                        if (!l)
                                continue;

                        l++;
                        e = strchr(l, ':');
                        if (!e)
                                continue;

                        /* Terminate the controller list in place so that it can be split on "," */
                        *e = 0;
                        FOREACH_WORD_SEPARATOR(word, k, l, ",", state)
                                if (k == cs && memcmp(word, controller_str, cs) == 0) {
                                        found = true;
                                        break;
                                }

                        if (!found)
                                continue;
                }

                /* Everything after the ':' pointed to by e is the path */
                p = strdup(e + 1);
                if (!p)
                       return -ENOMEM;

                /* Truncate suffix indicating the process is a zombie */
                e = endswith(p, " (deleted)");

                if (e)
                        *e = 0;

                *path = p;
                return 0;
        }

        return -ENODATA;
}
Optimization ideas

In /proc/<pid>/cgroup, the line containing "name=systemd" systematically shows up in the last position. We could imagine a patch in the kernel to make it show up in the first position, in order to have systemd's service filename in the first line instead of the last. Hence, systemd would not systematically read all the lines of /proc/<pid>/cgroup but only the first one.

About the author

The author is an engineer in computer sciences located in France. He can be contacted here.