Unable to open the PTY device: no space left on device

None of the containers can be logged into. The error reports that there is no space left on the device, but the filesystems on the LXD server still have plenty of free space:

root@dev-physical-0-18:~# lxc storage list
+---------------+--------+--------------------+-------------+---------+---------+
|     NAME      | DRIVER |       SOURCE       | DESCRIPTION | USED BY |  STATE  |
+---------------+--------+--------------------+-------------+---------+---------+
| lxc-storage   | dir    | /data/lxc/         |             | 24      | CREATED |
+---------------+--------+--------------------+-------------+---------+---------+
| ssd           | btrfs  | /ssd/lxd           |             | 6       | CREATED |
+---------------+--------+--------------------+-------------+---------+---------+
| ssd-benchmark | btrfs  | /ssd-benchmark/lxd |             | 4       | CREATED |
+---------------+--------+--------------------+-------------+---------+---------+
root@dev-physical-0-18:~# df -h 
Filesystem                         Size  Used Avail Use% Mounted on
tmpfs                               38G  3.2M   38G   1% /run
/dev/mapper/ubuntu--vg-ubuntu--lv  196G   73G  114G  40% /
tmpfs                              189G   64K  189G   1% /dev/shm
tmpfs                              5.0M     0  5.0M   0% /run/lock
tmpfs                              4.0M     0  4.0M   0% /sys/fs/cgroup
tmpfs                              189G     0  189G   0% /run/qemu
/dev/sdd                           932G  145G  787G  16% /ssd-benchmark
/dev/sda                           2.8T  390G  2.4T  14% /ssd
/dev/sde2                          974M  164M  744M  19% /boot
/dev/sde1                          511M  5.3M  506M   2% /boot/efi
/dev/mapper/ubuntu--vg-lv--sas     3.1T  2.6T  580G  82% /data
tmpfs                               38G   12K   38G   1% /run/user/0
root@dev-physical-0-18:~# lxc exec dev-host-0-207-flink  bash 
Error: Unable to open the PTY device: no space left on device
root@dev-physical-0-18:~# lxc exec dev-host-0-209-flink-with-rocksdb-benchmark  bash 
Error: Unable to open the PTY device: no space left on device

Any ideas :smiling_face_with_tear: ?

Here is some output from the strace command:

strace lxc exec dev-host-0-209-flink-with-rocksdb-benchmark  bash 

...
newfstatat(AT_FDCWD, "/root/snap/lxd/common", {st_mode=S_IFDIR|0755, st_size=4096, ...}, 0) = 0
readlinkat(AT_FDCWD, "/root/snap/lxd/current", "23339", 128) = 5
openat(AT_FDCWD, "/proc/self/mountinfo", O_RDONLY|O_CLOEXEC) = 3
epoll_ctl(4, EPOLL_CTL_ADD, 3, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=4190366984, u64=139899865132296}}) = 0
fcntl(3, F_GETFL)                       = 0x8000 (flags O_RDONLY|O_LARGEFILE)
fcntl(3, F_SETFL, O_RDONLY|O_NONBLOCK|O_LARGEFILE) = 0
read(3, "24 30 0:22 / /sys rw,nosuid,node"..., 4096) = 4087
read(3, "150 30 7:1 / /snap/core18/2409 r"..., 4096) = 1329
read(3, "", 2767)                       = 0
epoll_ctl(4, EPOLL_CTL_DEL, 3, 0xc00057552c) = 0
close(3)                                = 0
umask(022)                              = 000
newfstatat(AT_FDCWD, "/run/user/0", {st_mode=S_IFDIR|0700, st_size=160, ...}, 0) = 0
geteuid()                               = 0
getuid()                                = 0
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
setsockopt(3, SOL_SOCKET, SO_BROADCAST, [1], 4) = 0
connect(3, {sa_family=AF_UNIX, sun_path="/run/user/0/bus"}, 18) = 0
epoll_ctl(4, EPOLL_CTL_ADD, 3, {events=EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, data={u32=4190366984, u64=139899865132296}}) = 0
getsockname(3, {sa_family=AF_UNIX}, [112 => 2]) = 0
getpeername(3, {sa_family=AF_UNIX, sun_path="/run/user/0/bus"}, [112 => 18]) = 0
getuid()                                = 0
getpid()                                = 1391235
getuid()                                = 0
getgid()                                = 0
sendmsg(3, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0", iov_len=1}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=1391235, uid=0, gid=0}}], msg_controllen=32, msg_flags=0}, 0) = 1
write(3, "AUTH\r\n", 6)                 = 6
read(3, 0xc000316000, 4096)             = -1 EAGAIN (Resource temporarily unavailable)
epoll_pwait(4, [{events=EPOLLOUT, data={u32=4190366984, u64=139899865132296}}], 128, -1, NULL, 0) = 1
epoll_pwait(4, [{events=EPOLLIN|EPOLLOUT, data={u32=4190366984, u64=139899865132296}}], 128, -1, NULL, 0) = 1
read(3, "OK 5a9cd162e9f322068d2f778862f3c"..., 4096) = 37
write(3, "NEGOTIATE_UNIX_FD\r\n", 19)   = 19
read(3, 0xc000316000, 4096)             = -1 EAGAIN (Resource temporarily unavailable)
epoll_pwait(4, [{events=EPOLLIN|EPOLLOUT, data={u32=4190366984, u64=139899865132296}}], 128, 0, NULL, 0) = 1
read(3, "AGREE_UNIX_FD\r\n", 4096)      = 15
write(3, "BEGIN\r\n", 7)                = 7
futex(0xc00059c148, FUTEX_WAKE_PRIVATE, 1) = 1
write(3, "l\1\0\1\0\0\0\0\1\0\0\0n\0\0\0\2\1s\0\24\0\0\0org.free"..., 128) = 128
epoll_pwait(4, [], 128, 0, NULL, 824637291520) = 0
futex(0x55ef00974248, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
epoll_pwait(4, [{events=EPOLLIN|EPOLLOUT, data={u32=4190367400, u64=139899865132712}}], 128, 0, NULL, 0) = 1
nanosleep({tv_sec=0, tv_nsec=3000}, NULL) = 0
futex(0x55ef00974248, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x55ef00974248, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x55ef00974248, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x55ef00974248, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
futex(0x55ef00974248, FUTEX_WAIT_PRIVATE, 0, NULL) = 0
epoll_pwait(4, [], 128, 0, NULL, 0)     = 0
epoll_pwait(4,  <unfinished ...>)       = ?
Error: Unable to open the PTY device: no space left on device
+++ exited with 1 +++

The containers also can't be restarted:

root@dev-physical-0-18:~# lxc start dev-host-0-101-agent-for-162
Error: Failed to run: /snap/lxd/current/bin/lxd forkstart dev-host-0-101-agent-for-162 /var/snap/lxd/common/lxd/containers /var/snap/lxd/common/lxd/logs/dev-host-0-101-agent-for-162/lxc.conf: 
Try `lxc info --show-log dev-host-0-101-agent-for-162` for more info
root@dev-physical-0-18:~# lxc info --show-log dev-host-0-101-agent-for-162
Name: dev-host-0-101-agent-for-162
Status: STOPPED
Type: container
Architecture: x86_64
Created: 2022/04/07 07:26 UTC
Last Used: 2022/08/12 02:30 UTC

Log:

lxc dev-host-0-101-agent-for-162 20220812023000.925 ERROR    conf - ../src/src/lxc/conf.c:turn_into_dependent_mounts:3919 - No such file or directory - Failed to recursively turn old root mount tree into dependent mount. Continuing...
lxc dev-host-0-101-agent-for-162 20220812023001.743 ERROR    conf - ../src/src/lxc/conf.c:lxc_setup_console:2127 - No space left on device - Failed to allocate console from container's devpts instance
lxc dev-host-0-101-agent-for-162 20220812023001.743 ERROR    conf - ../src/src/lxc/conf.c:lxc_setup:4434 - Failed to setup console
lxc dev-host-0-101-agent-for-162 20220812023001.743 ERROR    start - ../src/src/lxc/start.c:do_start:1272 - Failed to setup container "dev-host-0-101-agent-for-162"
lxc dev-host-0-101-agent-for-162 20220812023001.744 ERROR    sync - ../src/src/lxc/sync.c:sync_wait:34 - An error occurred in another process (expected sequence number 4)
lxc dev-host-0-101-agent-for-162 20220812023001.871 WARN     network - ../src/src/lxc/network.c:lxc_delete_network_priv:3631 - Failed to rename interface with index 0 from "eth0" to its initial name "veth2347284d"
lxc dev-host-0-101-agent-for-162 20220812023001.872 ERROR    lxccontainer - ../src/src/lxc/lxccontainer.c:wait_on_daemonized_start:877 - Received container state "ABORTING" instead of "RUNNING"
lxc dev-host-0-101-agent-for-162 20220812023001.873 ERROR    start - ../src/src/lxc/start.c:__lxc_start:2107 - Failed to spawn container "dev-host-0-101-agent-for-162"
lxc dev-host-0-101-agent-for-162 20220812023001.874 WARN     start - ../src/src/lxc/start.c:lxc_abort:1036 - No such process - Failed to send SIGKILL via pidfd 44 for process 1505406
lxc 20220812023006.282 ERROR    af_unix - ../src/src/lxc/af_unix.c:lxc_abstract_unix_recv_fds_iov:218 - Connection reset by peer - Failed to receive response
lxc 20220812023006.282 ERROR    commands - ../src/src/lxc/commands.c:lxc_cmd_rsp_recv_fds:128 - Failed to receive file descriptors for command "get_state"

What does ls -lh /dev/pts look like?

The physical server returns:

root@dev-physical-0-18:~# ll -lh /dev/pts
total 0
drwxr-xr-x  2 root         root      0 Aug 10 23:23 ./
drwxr-xr-x 22 root         root   4.5K Aug 11 00:22 ../
crw--w----  1 libvirt-qemu tty  136, 0 Aug 10 23:25 0
crw--w----  1 libvirt-qemu tty  136, 1 Aug 10 23:25 1
crw--w----  1 libvirt-qemu tty  136, 2 Aug 10 23:25 2
crw-------  1 root         tty  136, 3 Aug 12 10:34 3
crw-------  1 root         tty  136, 7 Aug 12 10:48 7
crw-------  1 root         tty  136, 8 Aug 12 10:19 8
crw-------  1 root         tty  136, 9 Aug 12 11:03 9
c---------  1 root         root   5, 2 Aug 10 23:23 ptmx
root@dev-physical-0-18:~# 

Can you show:

  • cat /proc/sys/kernel/pty/max
  • cat /proc/sys/kernel/pty/nr
  • cat /proc/sys/kernel/pty/reserve

I have already solved this problem. The cause was that I upgraded LXD, which updated LXCFS to 5.0.1, and the CPU virtualization feature provided by LXCFS prevents erts-7.3 (the Erlang runtime used by our RabbitMQ) from starting.

The RabbitMQ daemon kept frantically trying to restart, spawning a large number of stop processes and exhausting the resources that the LXD user needs to log in or open a new PTY session:

root     2012833  0.0  0.0 113296  2084 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617224.6.sh
root     2013149  0.0  0.0   7488   768 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop
root     2075690  0.0  0.0 113296  2156 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617349.09.sh
root     2075939  0.0  0.0   7488   768 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop
root     2132693  0.0  0.0 113296  2116 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617467.17.sh
root     2132920  0.0  0.0   7488   772 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop
root     2185309  0.0  0.0 113296  2164 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617585.95.sh
root     2185499  0.0  0.0   7488   716 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop
root     2237869  0.0  0.0 113296  2084 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617710.68.sh
root     2238081  0.0  0.0   7488   684 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop
root     2289199  0.0  0.0 113296  2080 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617823.06.sh
root     2289410  0.0  0.0   7488   684 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop
root     2348253  0.0  0.0 113296  2084 ?        S    Aug16   0:00 /bin/bash /tmp/easyops_script/pkg_rabbitmq_stop_script_1660617946.46.sh
root     2348423  0.0  0.0   7488   684 ?        S    Aug16   0:00 /usr/local/easyops/erlang/lib/erlang/erts-7.3/bin/erlexec -pa /usr/local/easyops/rabbitmq/ebin -noinput +B -hidden -boot start_clean -sasl errlog_type error -mnesia dir "/data/easyops/rabbitmq/mnesia/rabbit@rabbitmq_node_1" -s rabbit_control_main -nodename rabbit@rabbitmq_node_1 -extra stop

I traced it with strace and confirmed that LXCFS CPU virtualization was indeed the cause. This problem needs to be fixed.