Apparmor blocks systemd services in container

Required information

  • Distribution: Arch Linux
  • Distribution version: Recent
config: {}
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- candid_config_key
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- rbac
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
- network_dns_search
- container_nic_routed_limits
- instance_nic_bridged_vlan
- network_state_bond_bridge
- usedby_consistency
- custom_block_volumes
- clustering_failure_domains
- resources_gpu_mdev
- console_vga_type
- projects_limits_disk
- network_type_macvlan
- network_type_sriov
- container_syscall_intercept_bpf_devices
- network_type_ovn
- projects_networks
- projects_networks_restricted_uplinks
- custom_volume_backup
- backup_override_name
- storage_rsync_compression
- network_type_physical
- network_ovn_external_subnets
- network_ovn_nat
- network_ovn_external_routes_remove
- tpm_device_type
- storage_zfs_clone_copy_rebase
- gpu_mdev
- resources_pci_iommu
- resources_network_usb
- resources_disk_address
- network_physical_ovn_ingress_mode
- network_ovn_dhcp
- network_physical_routes_anycast
- projects_limits_instances
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
  addresses: []
  architectures:
  - x86_64
  - i686
  certificate: |
  certificate_fingerprint: 
  driver: lxc | qemu
  driver_version: 4.0.5 | 5.2.0
  firewall: nftables
  kernel: Linux
  kernel_architecture: x86_64
  kernel_features:
    netnsid_getifaddrs: "true"
    seccomp_listener: "true"
    seccomp_listener_continue: "true"
    shiftfs: "true"
    uevent_injection: "true"
    unpriv_fscaps: "true"
  kernel_version: 5.9.14-arch1-1
  lxc_features:
    cgroup2: "true"
    devpts_fd: "true"
    mount_injection_file: "true"
    network_gateway_device_route: "true"
    network_ipvlan: "true"
    network_l2proxy: "true"
    network_phys_macvlan_mtu: "true"
    network_veth_router: "true"
    pidfd: "true"
    seccomp_allow_deny_syntax: "true"
    seccomp_notify: "true"
    seccomp_proxy_send_notify_fd: "true"
  os_name: Arch Linux
  os_version: ""
  project: default
  server: lxd
  server_clustered: false
  server_name: dalaran
  server_pid: 598
  server_version: "4.9"
  storage: btrfs
  storage_version: "5.9"

Issue description

I noticed that my containers don’t have internet access.
Then I checked the systemd services and noticed that many services failed to start:

● modprobe@drm.service                   loaded failed failed    Load Kernel Module drm
● systemd-logind.service                 loaded failed failed    User Login Management
● systemd-networkd.service               loaded failed failed    Network Service
● systemd-resolved.service               loaded failed failed    Network Name Resolution
● systemd-udev-trigger.service           loaded failed failed

I ran dmesg | grep DENIED on the host and got:

[ 7094.105901] audit: type=1400 audit(1609276176.098:436): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-gaming3_</var/lib/lxd>" name="/run/systemd/unit-root/proc/" pid=76268 comm="(d-logind)" fstype="proc" srcname="proc" flags="rw, nosuid, nodev, noexec"
[ 7094.117601] audit: type=1400 audit(1609276176.108:439): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-gaming3_</var/lib/lxd>" name="/run/systemd/unit-root/proc/" pid=76273 comm="(d-logind)" fstype="proc" srcname="proc" flags="rw, nosuid, nodev, noexec"
[ 7094.129476] audit: type=1400 audit(1609276176.121:441): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-gaming3_</var/lib/lxd>" name="/run/systemd/unit-root/proc/" pid=76278 comm="(d-logind)" fstype="proc" srcname="proc" flags="rw, nosuid, nodev, noexec"
[ 7094.140361] audit: type=1400 audit(1609276176.131:444): apparmor="DENIED" operation="mount" info="failed flags match" error=-13 profile="lxd-gaming3_</var/lib/lxd>" name="/run/systemd/unit-root/proc/" pid=76282 comm="(d-logind)" fstype="proc" srcname="proc" flags="rw, nosuid, nodev, noexec"

So it seems that apparmor is blocking some things.

Steps to reproduce

  1. Start a new plain archlinux container.
  2. Run ping or pacman update etc.
  3. Investigate

Container config (expanded):

architecture: x86_64
config:
  environment.LANG: de_DE.UTF-8
  environment.TZ: Europe/Berlin
  image.architecture: amd64
  image.description: Archlinux current amd64 (20201229_04:18)
  image.os: Archlinux
  image.release: current
  image.serial: "20201229_04:18"
  image.type: squashfs
  image.variant: default
  volatile.base_image: a27a7cd6a2e245a509570a8b937cec65af97e0f354b8121ae9113fe3782693d7
  volatile.eth0.host_name: mac809cb104
  volatile.eth0.hwaddr: 00:16:3e:a6:45:1d
  volatile.eth0.last_state.created: "false"
  volatile.idmap.base: "0"
  volatile.idmap.current: '[{"Isuid":true,"Isgid":false,"Hostid":1000000,"Nsid":0,"Maprange":1000000000},{"Isuid":false,"Isgid":true,"Hostid":1000000,"Nsid":0,"Maprange":1000000000}]'
  volatile.idmap.next: '[{"Isuid":true,"Isgid":false,"Hostid":1000000,"Nsid":0,"Maprange":1000000000},{"Isuid":false,"Isgid":true,"Hostid":1000000,"Nsid":0,"Maprange":1000000000}]'
  volatile.last_state.idmap: '[]'
  volatile.last_state.power: RUNNING
  volatile.uuid: 33c0e567-8f99-4cbd-b90e-d41f43115f24
devices:
  eth0:
    name: eth0
    nictype: macvlan
    parent: enp3s0
    type: nic
  root:
    path: /
    pool: one
    type: disk
ephemeral: false
profiles:
- disk1
- macvlan1only
- timezone
stateful: false
description: ""

More:

Maybe related to: https://github.com/lxc/lxd/issues/8246

There have been multiple reports of this on the forum so far.

A new version of systemd is making more use of security measures to protect specific system units.
Those security measures are incompatible with apparmor confinement safety. An apparmor policy which would allow those operations would also open the door for a lot more mount operations that would let you bypass confinement in many cases.

If your container is unprivileged, setting security.nesting to true will do the trick and things will still be root-safe so long as your kernel doesn’t have a mount related security issue (as apparmor will be useless at preventing such an issue at that point).

Doing the same on a security.privileged container though would make escaping the container and gaining root on the host trivial.

1 Like

Ok thx. I will try the security.nesting solution.

Systemd again, every time I had a severe problem (system not booting etc.) in the last years, it was always systemd… :neutral_face:

But will there be a long term fix/solution for this?

I guess this is the systemd issue?

One potential fix we’re considering is shipping a file in LXD images which disables those systemd features for the entire container. It’s not yet clear how viable that is and what that may break though.