with ctrl-c yes, and then
systemctl start lxd.service lxd.socket
but now it is restarted, but i can neither start nor create containers
root@srv2:/usr/local/bin# lxc info --show-log local:testcuda
Name: testcuda
Remote: unix://
Architecture: x86_64
Created: 2020/04/29 12:47 UTC
Status: Stopped
Type: persistent
Profiles: all_gpu_250GB
Log:
lxc testcuda 20200429124712.759 ERROR conf - conf.c:lxc_map_ids:2999 - newuidmap failed to write mapping “newuidmap: uid range [0-65536) -> [786432-851968) not allowed”: newuidmap 19272 0 786432 65536
lxc testcuda 20200429124712.759 ERROR start - start.c:lxc_spawn:1708 - Failed to set up id mapping.
lxc testcuda 20200429124712.853 WARN network - network.c:lxc_delete_network_priv:2613 - Invalid argument - Failed to remove interface “vethSKTQU0” from “lxdbr0”
lxc testcuda 20200429124712.853 ERROR lxccontainer - lxccontainer.c:wait_on_daemonized_start:842 - Received container state “ABORTING” instead of “RUNNING”
lxc testcuda 20200429124712.854 ERROR start - start.c:__lxc_start:1939 - Failed to spawn container “testcuda”
lxc testcuda 20200429124712.861 ERROR conf - conf.c:lxc_map_ids:2999 - newuidmap failed to write mapping “newuidmap: uid range [0-65536) -> [786432-851968) not allowed”: newuidmap 19289 0 786432 65536 65536 0 1
lxc testcuda 20200429124712.861 ERROR conf - conf.c:userns_exec_1:4352 - Error setting up {g,u}id mappings for child process “19289”
lxc testcuda 20200429124712.862 WARN cgfsng - cgroups/cgfsng.c:cgfsng_payload_destroy:1122 - Failed to destroy cgroups
lxc 20200429124712.862 WARN commands - commands.c:lxc_cmd_rsp_recv:132 - Connection reset by peer - Failed to receive response for command “get_state”
This is very weird, LXD really shouldn’t pick such low uid/gid.
Can you show lxc config show --expanded testcuda
and also do a more simple:
- lxc init images:alpine/edge a1
- lxc config show --expanded a1
root@srv2:/home/jpe# lxc config show --expanded testcuda
architecture: x86_64
config:
image.architecture: amd64
image.description: ubuntu 18.04 LTS amd64 (release) (20200407)
image.label: release
image.os: ubuntu
image.release: bionic
image.serial: “20200407”
image.version: “18.04”
limits.cpu: “6”
limits.memory: 26GB
nvidia.runtime: “true”
security.idmap.isolated: “true”
volatile.base_image: 2cfc5a5567b8d74c0986f3d8a77a2a78e58fe22ea9abd2693112031f85afa1a1
volatile.eth0.hwaddr: 00:16:3e:07:3b:00
volatile.idmap.base: “786432”
volatile.idmap.next: ‘[{“Isuid”:true,“Isgid”:false,“Hostid”:786432,“Nsid”:0,“Maprange”:65536},{“Isuid”:false,“Isgid”:true,“Hostid”:786432,“Nsid”:0,“Maprange”:65536}]’
volatile.last_state.idmap: ‘[{“Isuid”:true,“Isgid”:false,“Hostid”:786432,“Nsid”:0,“Maprange”:65536},{“Isuid”:false,“Isgid”:true,“Hostid”:786432,“Nsid”:0,“Maprange”:65536}]’
volatile.last_state.power: STOPPED
devices:
eth0:
name: eth0
nictype: bridged
parent: lxdbr0
type: nic
gpu:
type: gpu
root:
path: /
pool: cpool
size: 250GB
type: disk
ephemeral: false
profiles:
- all_gpu_250GB
stateful: false
description: “”
root@srv2:/home/jpe# lxc init images:alpine/edge a1
Creating a1
root@srv2:/home/jpe# lxc config show --expanded a1
architecture: x86_64
config:
image.architecture: amd64
image.description: Alpine edge amd64 (20200428_13:00)
image.os: Alpine
image.release: edge
image.serial: “20200428_13:00”
volatile.apply_template: create
volatile.base_image: 68d1dce55b29fc2bc75b558887e1c03799c74a73a3d433e473c8e23061002456
volatile.eth0.hwaddr: 00:16:3e:ff:68:0d
volatile.idmap.base: “0”
volatile.idmap.next: ‘[{“Isuid”:true,“Isgid”:false,“Hostid”:1000000,“Nsid”:0,“Maprange”:1000000000},{“Isuid”:false,“Isgid”:true,“Hostid”:1000000,“Nsid”:0,“Maprange”:1000000000}]’
volatile.last_state.idmap: ‘[{“Isuid”:true,“Isgid”:false,“Hostid”:1000000,“Nsid”:0,“Maprange”:1000000000},{“Isuid”:false,“Isgid”:true,“Hostid”:1000000,“Nsid”:0,“Maprange”:1000000000}]’
devices:
eth0:
name: eth0
nictype: bridged
parent: lxdbr0
type: nic
root:
path: /
pool: cpool
type: disk
ephemeral: false
profiles:
- default
stateful: false
description: “”
this a1 container even runs!
it has something to do with the profile i use, because i just tested without specifying a profile and that works
the profile i use ->
lxc profile show all_gpu_250GB
config:
limits.cpu: “6”
limits.memory: 26GB
nvidia.runtime: “true”
security.idmap.isolated: “true”
description: Default LXD profile
devices:
eth0:
name: eth0
nictype: bridged
parent: lxdbr0
type: nic
gpu:
type: gpu
root:
path: /
pool: cpool
size: 250GB
type: disk
name: all_gpu_250GB
used_by:
- /1.0/containers/nbanar
- /1.0/containers/fivez
- /1.0/containers/mdebruyn
- /1.0/containers/elotfi
- /1.0/containers/madhumita
- /1.0/containers/walter
- /1.0/containers/tulkens
- /1.0/containers/testcuda
Pretty sure it’s isolated misbehaving but that’s odd as it should be restricted to its normal range…
Can you try:
- lxc launch images:alpine/edge a2 -c security.idmap.isolated=true
And confirm this fails similarly?
root@srv2:/home/jpe# lxc launch images:alpine/edge a2 -c security.idmap.isolated=true
Creating a2
Starting a2
Error: Failed to run: /usr/lib/lxd/lxd forkstart a2 /var/lib/lxd/containers /var/log/lxd/a2/lxc.conf:
Try lxc info --show-log local:a2
for more info
root@srv2:/home/jpe# lxc info --show-log local:a2
Name: a2
Remote: unix://
Architecture: x86_64
Created: 2020/04/29 14:50 UTC
Status: Stopped
Type: persistent
Profiles: default
Log:
lxc a2 20200429145033.633 ERROR conf - conf.c:lxc_map_ids:2999 - newuidmap failed to write mapping “newuidmap: uid range [0-65536) -> [851968-917504) not allowed”: newuidmap 32741 0 851968 65536
lxc a2 20200429145033.633 ERROR start - start.c:lxc_spawn:1708 - Failed to set up id mapping.
lxc a2 20200429145033.718 WARN network - network.c:lxc_delete_network_priv:2613 - Invalid argument - Failed to remove interface “veth3VE09O” from “lxdbr0”
lxc a2 20200429145033.718 ERROR lxccontainer - lxccontainer.c:wait_on_daemonized_start:842 - Received container state “ABORTING” instead of “RUNNING”
lxc a2 20200429145033.719 ERROR start - start.c:__lxc_start:1939 - Failed to spawn container “a2”
lxc a2 20200429145033.725 ERROR conf - conf.c:lxc_map_ids:2999 - newuidmap failed to write mapping “newuidmap: uid range [0-65536) -> [851968-917504) not allowed”: newuidmap 32757 0 851968 65536 65536 0 1
lxc a2 20200429145033.725 ERROR conf - conf.c:userns_exec_1:4352 - Error setting up {g,u}id mappings for child process “32757”
lxc a2 20200429145033.726 WARN cgfsng - cgroups/cgfsng.c:cgfsng_payload_destroy:1122 - Failed to destroy cgroups
lxc 20200429145033.726 WARN commands - commands.c:lxc_cmd_rsp_recv:132 - Connection reset by peer - Failed to receive response for command “get_state”
Ok, thanks, can you show lxc config show --expanded a2
?
What OS is this running on btw?
root@srv2:/home/jpe# lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 18.04.4 LTS
Release: 18.04
Codename: bionic
root@srv2:/home/jpe# lxc config show --expanded a2
architecture: x86_64
config:
image.architecture: amd64
image.description: Alpine edge amd64 (20200428_13:00)
image.os: Alpine
image.release: edge
image.serial: “20200428_13:00”
security.idmap.isolated: “true”
volatile.base_image: 68d1dce55b29fc2bc75b558887e1c03799c74a73a3d433e473c8e23061002456
volatile.eth0.hwaddr: 00:16:3e:b3:4e:a3
volatile.idmap.base: “851968”
volatile.idmap.next: ‘[{“Isuid”:true,“Isgid”:false,“Hostid”:851968,“Nsid”:0,“Maprange”:65536},{“Isuid”:false,“Isgid”:true,“Hostid”:851968,“Nsid”:0,“Maprange”:65536}]’
volatile.last_state.idmap: ‘[{“Isuid”:true,“Isgid”:false,“Hostid”:851968,“Nsid”:0,“Maprange”:65536},{“Isuid”:false,“Isgid”:true,“Hostid”:851968,“Nsid”:0,“Maprange”:65536}]’
volatile.last_state.power: STOPPED
devices:
eth0:
name: eth0
nictype: bridged
parent: lxdbr0
type: nic
root:
path: /
pool: cpool
type: disk
ephemeral: false
profiles:
- default
stateful: false
description: “”
Ok, worth noting that if you were to switch over to the snap, you’d work around this issue as it doesn’t rely on subuid/subgid.
If that’s an option for you, you can move across with:
- snap install lxd
- lxd.migrate
I’m trying to reproduce the issue of idmap.isolated
not respecting the configured range now, might be something we fixed in a later release.
ok, i am willing to switch over to snap, i presume i first uninstall lxd? And what about the existing containers?
Nope, just run those two commands and your containers will be kept around and moved across, don’t remove anything, the migration tool does it for you.
lxd migrate takes its time…
What is it showing?
root@srv2:/home/jpe# lxd migrate
WARN[04-29|17:56:56] CGroup memory swap accounting is disabled, swap limits will be ignored.
so for more than an hour.
Ah, that explains it
ctrl+c that and run lxd.migrate
(note the dot instead of space)
Newer LXD versions would yell at you when running lxd migrate
telling you it’s invalid, but 3.0.3 that you’re running now, just spawns a new copy of the daemon…