LXD 3.23 - Cluster setup, lxc exec has different behavior for containers and VMs

dig output below …

ubuntu@ctrlr:~$ dig @240.204.0.1 k8s-master1.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 k8s-master1.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 57678
;; flags: qr aa rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 1

;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 1280
;; QUESTION SECTION:
;k8s-master1.lxd.               IN      A

;; ANSWER SECTION:
k8s-master1.lxd.        0       IN      A       240.204.0.82

;; Query time: 0 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:07:04 UTC 2020
;; MSG SIZE  rcvd: 60

ubuntu@ctrlr:~$ dig @240.204.0.1 k8s-worker1.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 k8s-worker1.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 41044
;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;k8s-worker1.lxd.               IN      A

;; Query time: 5 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:07:37 UTC 2020
;; MSG SIZE  rcvd: 33

ubuntu@ctrlr:~$ dig @240.204.0.1 k8s-lb.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 k8s-lb.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 51110
;; flags: qr aa ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;k8s-lb.lxd.                    IN      A

;; ANSWER SECTION:
k8s-lb.lxd.             0       IN      A       240.205.0.177

;; Query time: 5 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:07:51 UTC 2020
;; MSG SIZE  rcvd: 54

ubuntu@ctrlr:~$ dig @240.204.0.1 k8s-worker4.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 k8s-worker4.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 39305
;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;k8s-worker4.lxd.               IN      A

;; Query time: 5 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:08:06 UTC 2020
;; MSG SIZE  rcvd: 33

forkdns.servers/servers.conf output below:

akriadmin@c4akri01:~/scripts$ ./run-physical-hosts.sh "sudo cat /var/snap/lxd/common/lxd/networks/lxdfan0/forkdns.servers/servers.conf"
[c4akri01]:
240.221.0.1
240.222.0.1
240.223.0.1
240.197.0.1
240.215.0.1
240.205.0.1
[c4akri02]:
240.215.0.1
240.205.0.1
240.221.0.1
240.222.0.1
240.223.0.1
240.204.0.1
[c4akri03]:
240.221.0.1
240.222.0.1
240.223.0.1
240.204.0.1
240.197.0.1
240.205.0.1
[c4akri04]:
240.204.0.1
240.197.0.1
240.215.0.1
240.221.0.1
240.222.0.1
240.223.0.1
[c4astore01]:
240.205.0.1
240.222.0.1
240.223.0.1
240.204.0.1
240.197.0.1
240.215.0.1
[c4astore02]:
240.215.0.1
240.205.0.1
240.221.0.1
240.223.0.1
240.204.0.1
240.197.0.1
[c4astore03]:
240.215.0.1
240.205.0.1
240.221.0.1
240.222.0.1
240.204.0.1
240.197.0.1
akriadmin@c4akri01:~/scripts$

OK, so we can say that the reason your query for k8s-worker1.lxd is not working is that it is not in any of the dnsmasq leases files.
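
One quick way to confirm that on your side, assuming the same run-physical-hosts.sh helper and snap paths you used above, is to grep the dnsmasq leases file on each cluster member:

./run-physical-hosts.sh "sudo grep k8s-worker1 /var/snap/lxd/common/lxd/networks/lxdfan0/dnsmasq.leases"

If the VM's MAC/IP doesn't show up on any host, dnsmasq never recorded (or has since dropped) the lease, which matches the NXDOMAIN you're seeing.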

Can you show me the output of lxc config show k8s-worker1 --expanded please?

Can you also send me the output of lxc list so I can get a better picture of your instance list.

Can you also confirm that the k8s-worker1 instance is using DHCP to configure its networking?

k8s-worker1 --expanded output:

akriadmin@c4akri01:~/scripts$ lxc config show k8s-worker1 --expanded
architecture: x86_64
config:
  image.architecture: amd64
  image.description: Ubuntu bionic amd64 (20200327_07:42)
  image.os: Ubuntu
  image.release: bionic
  image.serial: "20200327_07:42"
  image.type: disk-kvm.img
  limits.cpu: "24"
  limits.memory: 73728MB
  raw.qemu: -device vfio-pci,host=41:00.0
  volatile.base_image: ed698b985cce87c0527b061d81662009472d5df210cbe297c1d90ed607415c18
  volatile.eth0.host_name: tapdef2ce9c
  volatile.eth0.hwaddr: 00:16:3e:e4:d9:96
  volatile.vm.uuid: c55da613-56d4-4493-822e-68af3d1f119a
devices:
  eth0:
    name: eth0
    network: lxdfan0
    type: nic
  root:
    path: /
    pool: local
    size: 50GB
    type: disk
  sda:
    source: /dev/sda
    type: disk
  sdb:
    source: /dev/sdb
    type: disk
  sdc:
    source: /dev/sdc
    type: disk
  sdd:
    source: /dev/sdd
    type: disk
  sde:
    source: /dev/sde
    type: disk
  sdf:
    source: /dev/sdf
    type: disk
ephemeral: false
profiles:
- default
- compute-vm
stateful: false
description: ""
akriadmin@c4akri01:~/scripts$

Can you show me the ip a output inside k8s-worker1 too, please?

Yes, all instances are picking up IP addresses using DHCP. As I wrote earlier, the VMs are a straight-up launch of images:ubuntu/18.04 and images:ubuntu/19.10.

lxc list output below …

akriadmin@c4akri01:~/scripts$ lxc list
+----------------+---------+------------------------+------+-----------------+-----------+------------+
|      NAME      |  STATE  |          IPV4          | IPV6 |      TYPE       | SNAPSHOTS |  LOCATION  |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| ctrlr          | RUNNING | 240.204.0.186 (eth0)   |      | CONTAINER       | 0         | c4akri01   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| hdfs-datanode1 | RUNNING | 240.221.0.52 (enp5s0)  |      | VIRTUAL-MACHINE | 0         | c4astore01 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| hdfs-datanode2 | RUNNING | 240.222.0.137 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4astore02 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| hdfs-datanode3 | RUNNING | 240.223.0.249 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4astore03 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| hdfs-namenode  | RUNNING | 240.221.0.237 (eth0)   |      | CONTAINER       | 0         | c4astore01 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-lb         | RUNNING | 240.205.0.177 (eth0)   |      | CONTAINER       | 0         | c4akri04   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-master1    | RUNNING | 240.204.0.82 (eth0)    |      | CONTAINER       | 0         | c4akri01   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-master2    | RUNNING | 240.197.0.18 (eth0)    |      | CONTAINER       | 0         | c4akri02   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-master3    | RUNNING | 240.215.0.110 (eth0)   |      | CONTAINER       | 0         | c4akri03   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-worker1    | RUNNING | 240.204.0.142 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri01   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-worker2    | RUNNING | 240.197.0.215 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri02   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-worker3    | RUNNING | 240.215.0.200 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri03   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| k8s-worker4    | RUNNING | 240.205.0.196 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri04   |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| minio1         | RUNNING | 240.221.0.117 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4astore01 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| minio2         | RUNNING | 240.222.0.12 (enp5s0)  |      | VIRTUAL-MACHINE | 0         | c4astore02 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
| minio3         | RUNNING | 240.223.0.187 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4astore03 |
+----------------+---------+------------------------+------+-----------------+-----------+------------+
akriadmin@c4akri01:~/scripts$

ip a output on k8s-worker1

ubuntu@k8s-worker1:~$ ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
2: enp5s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000
    link/ether 00:16:3e:e4:d9:96 brd ff:ff:ff:ff:ff:ff
    inet 240.204.0.142/8 brd 240.255.255.255 scope global dynamic enp5s0
       valid_lft 2796sec preferred_lft 2796sec
    inet6 fe80::216:3eff:fee4:d996/64 scope link 
       valid_lft forever preferred_lft forever
ubuntu@k8s-worker1:~$

Thanks. Is this happening with VMs using both the Ubuntu 18.04 and 19.10 images?

This is very weird: if the VMs are getting a DHCP allocation from dnsmasq, then what is causing them to be removed from the leases file?

Out of interest, what are you using raw.qemu: -device vfio-pci,host=41:00.0 for?

I am passing a GPU through into the VM instance, along with 6 block devices … Stephane and ‘morphis’ guided me towards using this mechanism.


Yes, same issue for both 18.04 and 19.10 images … (hdfs-datanode1 is 18.04, minio1 is 19.10):

ubuntu@ctrlr:~$ dig @240.204.0.1 hdfs-datanode1.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 hdfs-datanode1.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51460
;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;hdfs-datanode1.lxd.            IN      A

;; Query time: 5 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:40:21 UTC 2020
;; MSG SIZE  rcvd: 36

ubuntu@ctrlr:~$ dig @240.204.0.1 minio1.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 minio1.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 39275
;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;minio1.lxd.                    IN      A

;; Query time: 5 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:40:41 UTC 2020
;; MSG SIZE  rcvd: 28

ubuntu@ctrlr:~$ ssh 240.221.0.52 uname -a
Linux hdfs-datanode1 4.15.0-91-generic #92-Ubuntu SMP Fri Feb 28 11:09:48 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
ubuntu@ctrlr:~$ ssh 240.221.0.117 uname -a
Linux minio1 5.3.0-42-generic #34-Ubuntu SMP Fri Feb 28 05:49:40 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
ubuntu@ctrlr:~$

OK, can you place your LXD daemon into debug mode on one of the hosts running a problem VM (you can just pick one, as long as you test the next part on a VM running on the same host you switched to debug mode).

sudo snap set lxd daemon.debug=true
sudo systemctl reload snap.lxd.daemon

Then run sudo journalctl -f on the host and keep a lookout for dnsmasq-related log entries.

Then, inside the VM, run:

netplan apply

This should make a fresh DHCP request.

Finally from the same LXD host, can you then send the contents of: /var/snap/lxd/common/lxd/networks/lxdfan0/dnsmasq.leases

We need to try to figure out why your setup differs from my local one, and why the dnsmasq.leases file is not being populated with VM DHCP allocations (or is being cleared of them).
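
Putting those steps together, the rough sequence on the affected host would be something like this (k8s-worker1 is just an example; use whichever problem VM lives on the host you put into debug mode):

sudo snap set lxd daemon.debug=true
sudo systemctl reload snap.lxd.daemon
sudo journalctl -f | grep -i dnsmasq   # keep this running to watch for DHCP/dnsmasq entries

# in a second shell, trigger a fresh DHCP request inside the VM
lxc exec k8s-worker1 -- netplan apply

# then dump the leases file on the same host
sudo cat /var/snap/lxd/common/lxd/networks/lxdfan0/dnsmasq.leases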

On a lark, I tried the following:

  • Created an images:ubuntu/16.04 VM on a remote host (c4akri02)
  • Tried seeing whether I could resolve that VM’s name … turns out this works!

I had previously reported an issue with images:ubuntu/18.04, where the same IP address was getting allocated to multiple VM instances launched on the same host. Perhaps these issues are related? (and also impact ubuntu/19.10)

akriadmin@c4akri01:~/scripts$ lxc list | grep test-vm
| test-vm        | RUNNING | 240.197.0.177 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri02   |

ubuntu@ctrlr:~$ dig @240.204.0.1 test-vm.lxd

; <<>> DiG 9.11.3-1ubuntu1.11-Ubuntu <<>> @240.204.0.1 test-vm.lxd
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 55620
;; flags: qr aa ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0

;; QUESTION SECTION:
;test-vm.lxd.                   IN      A

;; ANSWER SECTION:
test-vm.lxd.            0       IN      A       240.197.0.177

;; Query time: 4 msec
;; SERVER: 240.204.0.1#53(240.204.0.1)
;; WHEN: Tue Mar 31 17:50:34 UTC 2020
;; MSG SIZE  rcvd: 56

ubuntu@ctrlr:~$

Link to the other topic:

Interesting, I feel this is likely the issue.

I’d like to compare the MAC addresses inside the VMs to those allocated by LXD.

Can you send me the output of lxc config get <vm name> volatile.eth0.hwaddr for each VM, and also, for each VM, the output of ip link show enp5s0.

Finally, can you also get me the netplan config file for each VM? It should be in /etc/netplan/50-cloud-init.yaml.
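
If it's easier to gather in one go, a small loop along these lines should do it (the VM names are just taken from your lxc list output, so adjust as needed, and the netplan path may differ per image):

for vm in k8s-worker1 k8s-worker2 k8s-worker3 k8s-worker4 minio1 minio2 minio3 hdfs-datanode1 hdfs-datanode2 hdfs-datanode3; do
    echo "== $vm =="
    lxc config get "$vm" volatile.eth0.hwaddr
    lxc exec "$vm" -- ip link show enp5s0
    lxc exec "$vm" -- cat /etc/netplan/50-cloud-init.yaml
done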

Thanks
Tom

Ah, I missed this post. So yes, it looks like the VM images had an issue. If possible it may be best to regenerate your VMs from fresh images, although @stgraber may be able to advise how to fix existing VMs.

To reset things, you can do:

rm /var/lib/dbus/machine-id
> /etc/machine-id

Okay, running these commands on the host that has both an 18.04 VM (k8s-worker2) and a 16.04 VM (test-vm) …

akriadmin@c4akri02:~$ lxc list | grep c4akri02
| k8s-master2    | RUNNING | 240.197.0.18 (eth0)    |      | CONTAINER       | 0         | c4akri02   |
| k8s-worker2    | RUNNING | 240.197.0.215 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri02   |
| test-vm        | RUNNING | 240.197.0.177 (enp5s0) |      | VIRTUAL-MACHINE | 0         | c4akri02   |

akriadmin@c4akri02:~$ lxc config get k8s-worker2 volatile.eth0.hwaddr
00:16:3e:58:5f:c9
akriadmin@c4akri02:~$ lxc exec k8s-worker2 -- ip link show enp5s0
2: enp5s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP mode DEFAULT group default qlen 1000
    link/ether 00:16:3e:58:5f:c9 brd ff:ff:ff:ff:ff:ff
akriadmin@c4akri02:~$ lxc exec k8s-worker2 -- cat /etc/netplan/50-cloud-init.yaml
cat: /etc/netplan/50-cloud-init.yaml: No such file or directory
akriadmin@c4akri02:~$ lxc exec k8s-worker2 -- ls /etc/netplan
10-lxc.yaml
akriadmin@c4akri02:~$ lxc exec k8s-worker2 -- cat /etc/netplan/10-lxc.yaml
network:
  version: 2
  ethernets:
    enp5s0: {dhcp4: true}
akriadmin@c4akri02:~$ 

akriadmin@c4akri02:~$ lxc config get test-vm volatile.eth0.hwaddr
00:16:3e:30:6a:31
akriadmin@c4akri02:~$ lxc exec test-vm -- ip link show enp5s0
2: enp5s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
    link/ether 00:16:3e:30:6a:31 brd ff:ff:ff:ff:ff:ff
akriadmin@c4akri02:~$ lxc exec test-vm -- ls /etc/netplan
ls: cannot access '/etc/netplan': No such file or directory
akriadmin@c4akri02:~$ lxc exec test-vm -- cat /etc/network/interfaces
# This file describes the network interfaces available on your system
# and how to activate them. For more information, see interfaces(5).

# The loopback network interface
auto lo
iface lo inet loopback

auto enp5s0
iface enp5s0 inet dhcp

source /etc/network/interfaces.d/*.cfg
akriadmin@c4akri02:~$

The commands must be run inside the affected VMs.
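
For context on why the machine ID matters here: systemd-networkd derives its default DHCP client identifier from /etc/machine-id, so if the VM images shipped with a baked-in machine ID, every VM presents the same identifier to dnsmasq, which would explain both the duplicate-address report and leases being overwritten. A rough sketch of applying the reset to one affected VM from the host (k8s-worker1 is just an example; the restart is assumed so that systemd regenerates a fresh ID on the next boot):

lxc exec k8s-worker1 -- sh -c 'rm /var/lib/dbus/machine-id; > /etc/machine-id'
lxc restart k8s-worker1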