diff options
| author | Bruce Ashfield <bruce.ashfield@gmail.com> | 2026-02-18 14:07:49 +0000 |
|---|---|---|
| committer | Bruce Ashfield <bruce.ashfield@gmail.com> | 2026-02-26 01:05:01 +0000 |
| commit | 9377aede3157a3e7b702dc389c15f27523b673e7 (patch) | |
| tree | 9ea01493815cfb58e642b65b5b31472235b5a09a /recipes-containers/vcontainer/files | |
| parent | fa4b171a436559787cfcebd4046a1354a1f5cacf (diff) | |
| download | meta-virtualization-9377aede3157a3e7b702dc389c15f27523b673e7.tar.gz | |
vxn: add containerd OCI runtime integration
Add shell-based OCI runtime (vxn-oci-runtime) that enables containerd
to manage Xen DomU containers through the standard runc shim. Non-terminal
container output flows back to ctr via the shim's pipe mechanism.
New files:
- vxn-oci-runtime: OCI runtime (create/start/state/kill/delete/features/logs)
- vxn-sendtty.c: SCM_RIGHTS helper for terminal mode PTY passing
- containerd-shim-vxn-v2: PATH trick wrapper for runc shim coexistence
- containerd-config-vxn.toml: CRI config (vxn default, runc fallback)
- vctr: convenience wrapper injecting --runtime io.containerd.vxn.v2
Key design:
- Monitor subprocess uses wait on xl console (not sleep-polling) for
instant reaction when domain dies, then extracts output markers and
writes to stdout (shim pipe -> containerd FIFO -> ctr client)
- cmd_state checks monitor PID liveness (not domain status) to prevent
premature cleanup race that killed monitor before output
- cmd_delete always destroys remnant domains (no --force needed)
- Coexists with runc: /usr/libexec/vxn/shim/runc symlink + PATH trick
Verified: vctr run --rm, vctr run -d, vxn standalone, vxn daemon mode.
Signed-off-by: Bruce Ashfield <bruce.ashfield@gmail.com>
Diffstat (limited to 'recipes-containers/vcontainer/files')
5 files changed, 782 insertions, 0 deletions
diff --git a/recipes-containers/vcontainer/files/containerd-config-vxn.toml b/recipes-containers/vcontainer/files/containerd-config-vxn.toml new file mode 100644 index 00000000..4dc84630 --- /dev/null +++ b/recipes-containers/vcontainer/files/containerd-config-vxn.toml | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | version = 2 | ||
| 2 | |||
| 3 | # Register vxn shim: containerd-shim-vxn-v2 (wrapper script that execs the runc shim) | ||
| 4 | # with BinaryName pointing to vxn-oci-runtime. | ||
| 5 | # This allows: ctr run --runtime io.containerd.vxn.v2 ... | ||
| 6 | |||
| 7 | # CRI plugin: make vxn the default runtime for Kubernetes | ||
| 8 | [plugins."io.containerd.grpc.v1.cri".containerd] | ||
| 9 | default_runtime_name = "vxn" | ||
| 10 | |||
| 11 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.vxn] | ||
| 12 | runtime_type = "io.containerd.vxn.v2" | ||
| 13 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.vxn.options] | ||
| 14 | BinaryName = "/usr/bin/vxn-oci-runtime" | ||
| 15 | |||
| 16 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] | ||
| 17 | runtime_type = "io.containerd.runc.v2" | ||
| 18 | [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] | ||
| 19 | BinaryName = "runc" | ||
diff --git a/recipes-containers/vcontainer/files/containerd-shim-vxn-v2 b/recipes-containers/vcontainer/files/containerd-shim-vxn-v2 new file mode 100644 index 00000000..9a4669f9 --- /dev/null +++ b/recipes-containers/vcontainer/files/containerd-shim-vxn-v2 | |||
| @@ -0,0 +1,7 @@ | |||
#!/bin/sh
# containerd-shim-vxn-v2
#
# Thin launcher for containerd-shim-runc-v2. The vxn shim directory is
# placed at the front of PATH so that when the runc shim execs "runc" it
# finds vxn-oci-runtime instead. PATH trick only: no temp files and no
# symlink conflicts with the real runc package.
PATH="/usr/libexec/vxn/shim:$PATH"
export PATH
exec /usr/bin/containerd-shim-runc-v2 "$@"
diff --git a/recipes-containers/vcontainer/files/vctr b/recipes-containers/vcontainer/files/vctr new file mode 100644 index 00000000..ca84644a --- /dev/null +++ b/recipes-containers/vcontainer/files/vctr | |||
| @@ -0,0 +1,16 @@ | |||
#!/bin/sh
# vctr - convenience wrapper for ctr with vxn runtime
# Usage: vctr run <image> <cmd>   (same as: ctr run --runtime io.containerd.vxn.v2 ...)
#        vctr <any ctr command>   (passed through to ctr)

VXN_RUNTIME="io.containerd.vxn.v2"

# Only a leading "run" gets the runtime injected; every other invocation
# is handed to ctr exactly as given.
if [ "$1" = "run" ]; then
    shift
    exec ctr run --runtime "$VXN_RUNTIME" "$@"
fi

exec ctr "$@"
diff --git a/recipes-containers/vcontainer/files/vxn-oci-runtime b/recipes-containers/vcontainer/files/vxn-oci-runtime new file mode 100644 index 00000000..6158cddd --- /dev/null +++ b/recipes-containers/vcontainer/files/vxn-oci-runtime | |||
| @@ -0,0 +1,650 @@ | |||
| 1 | #!/bin/bash | ||
| 2 | # SPDX-FileCopyrightText: Copyright (C) 2025 Bruce Ashfield | ||
| 3 | # | ||
| 4 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 5 | # | ||
| 6 | # vxn-oci-runtime | ||
| 7 | # OCI runtime for containerd integration via containerd-shim-runc-v2 | ||
| 8 | # | ||
| 9 | # This implements the OCI runtime CLI spec so containerd can manage | ||
| 10 | # Xen DomU containers through the built-in runc shim: | ||
| 11 | # | ||
| 12 | # containerd -> containerd-shim-runc-v2 -> vxn-oci-runtime create/start/state/kill/delete | ||
| 13 | # | | ||
| 14 | # v | ||
| 15 | # xl create/unpause/list/shutdown/destroy | ||
| 16 | # | | ||
| 17 | # v | ||
| 18 | # Xen DomU (vxn-init.sh) | ||
| 19 | # | ||
| 20 | # This is a standalone script — it does not source vrunner.sh or | ||
| 21 | # vcontainer-common.sh. The OCI runtime lifecycle (separate create/start/ | ||
| 22 | # state invocations) is fundamentally different from the all-in-one | ||
| 23 | # vrunner flow. | ||
| 24 | # | ||
| 25 | # State directory: /run/vxn-oci-runtime/containers/<container-id>/ | ||
| 26 | |||
| 27 | set -e | ||
| 28 | |||
| 29 | RUNTIME_ROOT="/run/vxn-oci-runtime" | ||
| 30 | OCI_VERSION="1.0.2" | ||
| 31 | BLOB_DIR="/usr/share/vxn" | ||
| 32 | |||
| 33 | # ============================================================================ | ||
| 34 | # Logging | ||
| 35 | # ============================================================================ | ||
| 36 | |||
| 37 | LOG_FILE="/var/log/vxn-oci-runtime.log" | ||
| 38 | VXN_LOG="/var/log/vxn-oci-runtime.log" | ||
| 39 | |||
# log <message...>
# Append a timestamped line to $VXN_LOG and, when the shim redirected
# $LOG_FILE somewhere else via --log, to that file as well. Logging is
# best-effort: write failures never abort the caller.
log() {
    local stamp entry
    stamp=$(date '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "-")
    entry="[$stamp] $*"

    # Always write to our own log (shim overrides LOG_FILE via --log)
    echo "$entry" >> "$VXN_LOG" 2>/dev/null || true

    # Same destination: one line is enough.
    [ "$LOG_FILE" = "$VXN_LOG" ] && return 0

    echo "$entry" >> "$LOG_FILE" 2>/dev/null || true
}
| 49 | |||
# die <message...>
# Record a fatal error in the log, echo it on stderr and exit with status 1.
die() {
    local reason="$*"
    log "FATAL: $reason"
    printf '%s\n' "vxn-oci-runtime: $reason" >&2
    exit 1
}
| 55 | |||
| 56 | # ============================================================================ | ||
| 57 | # Architecture Detection | ||
| 58 | # ============================================================================ | ||
| 59 | |||
# detect_arch
# Map `uname -m` to the guest blobs under $BLOB_DIR and the Xen guest type.
# Sets the globals VXN_ARCH, VXN_KERNEL, VXN_INITRAMFS, VXN_ROOTFS and
# VXN_TYPE; dies on any architecture other than aarch64 or x86_64.
detect_arch() {
    local machine kernel_image
    machine=$(uname -m)

    # Only the kernel image name and the guest type differ per architecture.
    case "$machine" in
        aarch64) kernel_image="Image";   VXN_TYPE="pvh" ;;
        x86_64)  kernel_image="bzImage"; VXN_TYPE="pv" ;;
        *)       die "Unsupported architecture: $machine" ;;
    esac

    VXN_ARCH="$machine"
    VXN_KERNEL="$BLOB_DIR/$machine/$kernel_image"
    VXN_INITRAMFS="$BLOB_DIR/$machine/initramfs.cpio.gz"
    VXN_ROOTFS="$BLOB_DIR/$machine/rootfs.img"
}
| 83 | |||
| 84 | # ============================================================================ | ||
| 85 | # State Management | ||
| 86 | # ============================================================================ | ||
| 87 | |||
# state_dir <container-id>
# Print the per-container state directory below $RUNTIME_ROOT.
state_dir() {
    printf '%s\n' "$RUNTIME_ROOT/containers/$1"
}
| 91 | |||
# load_state <container-id>
# Existence check only: dies unless the container has a state.json.
load_state() {
    local state_file
    state_file="$(state_dir "$1")/state.json"
    if [ ! -f "$state_file" ]; then
        die "container $1 does not exist"
    fi
}
| 98 | |||
# read_state_field <container-id> <field>
# Print the value of a string-valued top-level field from state.json.
# Prints nothing when the file or the field is missing.
read_state_field() {
    local key="$2"
    local state_file
    state_file="$(state_dir "$1")/state.json"

    # Matches:  "key" : "   (shared by the grep match and the sed strip)
    local key_prefix="\"$key\"[[:space:]]*:[[:space:]]*\""

    # Use grep/sed — jq may not be available in all environments
    grep -o "${key_prefix}[^\"]*\"" "$state_file" 2>/dev/null |
        sed "s/.*${key_prefix}//;s/\"\$//"
}
| 108 | |||
# read_state_pid <container-id>
# Print the numeric "pid" value stored in state.json.
read_state_pid() {
    local state_file
    state_file="$(state_dir "$1")/state.json"

    # First grep isolates the `"pid": N` pair, second keeps only the digits.
    grep -o '"pid"[[:space:]]*:[[:space:]]*[0-9]*' "$state_file" 2>/dev/null |
        grep -o '[0-9]*$'
}
| 116 | |||
# write_state <container-id> <status> <pid> <bundle> <created>
# Persist the OCI state document for a container as state.json in its
# state directory.
#
#   status   OCI status string (this script uses created/running/stopped)
#   pid      monitor PID; an empty value is stored as 0 so the file stays
#            valid JSON (cmd_state applies the same default when printing)
#   bundle   bundle path
#   created  creation timestamp string
#
# The document is written to a temporary file in the same directory and
# renamed into place, so another invocation reading state.json never sees
# a truncated or half-written file.
write_state() {
    local id="$1"
    local status="$2"
    local pid="${3:-0}"
    local bundle="$4"
    local created="$5"
    local dir tmp
    dir=$(state_dir "$id")
    tmp="$dir/state.json.tmp.$$"

    cat > "$tmp" <<EOF
{
  "ociVersion": "$OCI_VERSION",
  "id": "$id",
  "status": "$status",
  "pid": $pid,
  "bundle": "$bundle",
  "created": "$created",
  "annotations": {}
}
EOF
    mv -f "$tmp" "$dir/state.json"
}
| 137 | |||
| 138 | # ============================================================================ | ||
| 139 | # OCI Runtime Commands | ||
| 140 | # ============================================================================ | ||
| 141 | |||
# cmd_create [--bundle DIR] [--pid-file FILE] [--console-socket SOCK] <container-id>
#
# OCI "create": build everything needed for the container and leave it
# ready to be started, without running the workload.
#
#   1. Parse process.args / cwd / terminal from <bundle>/config.json
#      (jq when available, grep/sed fallback otherwise).
#   2. Pack <bundle>/rootfs into an ext4 image in the state directory.
#   3. Write a Xen domain config and create the domain paused.
#   4. In terminal mode, pass the Xen console PTY to the shim through the
#      console socket using vxn-sendtty.
#   5. Fork a monitor subprocess whose PID is reported as the container's
#      init PID (--pid-file, monitor.pid, state.json).
#
# Dies on a missing ID/bundle/config.json, an empty rootfs, or a failing
# mke2fs / xl create / xl domid.
cmd_create() {
    local container_id=""
    local bundle=""
    local pid_file=""
    local console_socket=""

    # Parse arguments
    while [ $# -gt 0 ]; do
        case "$1" in
            --bundle)            bundle="$2"; shift 2 ;;
            --bundle=*)          bundle="${1#--bundle=}"; shift ;;
            --pid-file)          pid_file="$2"; shift 2 ;;
            --pid-file=*)        pid_file="${1#--pid-file=}"; shift ;;
            --console-socket)    console_socket="$2"; shift 2 ;;
            --console-socket=*)  console_socket="${1#--console-socket=}"; shift ;;
            -*)                  log " DEBUG: unknown create flag: $1"; shift ;;
            *)
                if [ -z "$container_id" ]; then
                    container_id="$1"
                fi
                shift
                ;;
        esac
    done

    [ -n "$container_id" ] || die "create: container ID required"
    [ -n "$bundle" ] || die "create: --bundle required"
    [ -f "$bundle/config.json" ] || die "create: $bundle/config.json not found"

    log "CREATE: id=$container_id bundle=$bundle console_socket=$console_socket"
    # Debug: log what the shim gives us. Paths are quoted so a bundle path
    # containing whitespace is listed correctly.
    log " DEBUG: fd0=$(readlink /proc/$$/fd/0 2>/dev/null) fd1=$(readlink /proc/$$/fd/1 2>/dev/null) fd2=$(readlink /proc/$$/fd/2 2>/dev/null)"
    log " DEBUG: bundle dir: $(ls -F "$bundle/" 2>/dev/null | tr '\n' ' ')"
    local _taskdir
    _taskdir=$(dirname "$bundle")
    log " DEBUG: parent dir ($_taskdir): $(ls -F "$_taskdir/" 2>/dev/null | tr '\n' ' ')"
    log " DEBUG: all pipes: $(find /run -type p 2>/dev/null | tr '\n' ' ')"

    detect_arch

    local dir
    dir=$(state_dir "$container_id")
    mkdir -p "$dir"

    # Read config.json — parse process.args, process.cwd, process.terminal
    local config="$bundle/config.json"
    local entrypoint="" cwd="/" terminal="false"

    if command -v jq >/dev/null 2>&1; then
        # Under `set -e` a failing jq would end the script without any
        # message; report the failure explicitly instead.
        entrypoint=$(jq -r '(.process.args // []) | join(" ")' "$config" 2>/dev/null) \
            || die "create: failed to parse process.args from $config"
        cwd=$(jq -r '.process.cwd // "/"' "$config" 2>/dev/null) \
            || die "create: failed to parse process.cwd from $config"
        terminal=$(jq -r '.process.terminal // false' "$config" 2>/dev/null) \
            || die "create: failed to parse process.terminal from $config"
    else
        # Fallback: grep/sed parsing
        entrypoint=$(grep -o '"args"[[:space:]]*:[[:space:]]*\[[^]]*\]' "$config" 2>/dev/null | \
            sed 's/"args"[[:space:]]*:[[:space:]]*\[//;s/\]$//' | \
            tr ',' '\n' | sed 's/^ *"//;s/"$//' | tr '\n' ' ' | sed 's/ $//')
        cwd=$(grep -o '"cwd"[[:space:]]*:[[:space:]]*"[^"]*"' "$config" 2>/dev/null | \
            sed 's/"cwd"[[:space:]]*:[[:space:]]*"//;s/"$//')
        [ -z "$cwd" ] && cwd="/"
        if grep -q '"terminal"[[:space:]]*:[[:space:]]*true' "$config" 2>/dev/null; then
            terminal="true"
        fi
    fi

    log " entrypoint='$entrypoint' cwd='$cwd' terminal=$terminal"

    # Create ext4 disk image from bundle/rootfs/
    local rootfs_dir="$bundle/rootfs"
    local input_img="$dir/input.img"

    if [ ! -d "$rootfs_dir" ] || [ -z "$(ls -A "$rootfs_dir" 2>/dev/null)" ]; then
        die "create: $rootfs_dir is empty or does not exist"
    fi

    # Calculate size: rootfs size + 50% headroom, minimum 64MB
    local rootfs_size_kb
    rootfs_size_kb=$(du -sk "$rootfs_dir" 2>/dev/null | awk '{print $1}')
    local img_size_kb=$(( ${rootfs_size_kb:-0} * 3 / 2 ))
    [ "$img_size_kb" -lt 65536 ] && img_size_kb=65536

    log " Creating ext4 image: ${img_size_kb}KB from $rootfs_dir"
    mke2fs -t ext4 -d "$rootfs_dir" -b 4096 "$input_img" "${img_size_kb}K" \
        >> "$LOG_FILE" 2>&1 || die "create: failed to create ext4 image"

    # Encode entrypoint as base64 for kernel cmdline.
    # printf rather than `echo -n`: an entrypoint that looks like an echo
    # option (for example "-e") must be encoded verbatim, not consumed.
    local cmd_b64=""
    if [ -n "$entrypoint" ]; then
        cmd_b64=$(printf '%s' "$entrypoint" | base64 -w0)
    fi

    # Domain name: vxn-oci-<short-id>
    local domname="vxn-oci-${container_id}"
    # Xen domain names have a max length — truncate if needed
    if [ ${#domname} -gt 64 ]; then
        domname="vxn-oci-${container_id:0:55}"
    fi
    echo "$domname" > "$dir/domname"

    # Memory and vCPUs — configurable via environment
    local xen_memory="${VXN_OCI_MEMORY:-512}"
    local xen_vcpus="${VXN_OCI_VCPUS:-2}"

    # Generate Xen domain config
    local config_cfg="$dir/config.cfg"
    local kernel_extra="console=hvc0 quiet loglevel=0 init=/init vcontainer.blk=xvd vcontainer.init=/vxn-init.sh"
    [ -n "$cmd_b64" ] && kernel_extra="$kernel_extra docker_cmd=$cmd_b64"
    kernel_extra="$kernel_extra docker_input=oci"

    # Terminal mode: suppress boot messages for raw console I/O
    if [ "$terminal" = "true" ]; then
        kernel_extra="$kernel_extra docker_interactive=1"
    fi

    cat > "$config_cfg" <<XENEOF
# Auto-generated Xen domain config for vxn-oci-runtime
name = "$domname"
type = "$VXN_TYPE"
memory = $xen_memory
vcpus = $xen_vcpus

kernel = "$VXN_KERNEL"
ramdisk = "$VXN_INITRAMFS"
extra = "$kernel_extra"

disk = [ 'format=raw,vdev=xvda,access=ro,target=$VXN_ROOTFS', 'format=raw,vdev=xvdb,access=ro,target=$input_img' ]
vif = []

serial = 'pty'

on_poweroff = "destroy"
on_reboot = "destroy"
on_crash = "destroy"
XENEOF

    log " Xen config written to $config_cfg"

    # Create domain in paused state (OCI spec: create does not start)
    xl create -p "$config_cfg" >> "$LOG_FILE" 2>&1 || die "create: xl create -p failed"

    log " Domain $domname created (paused)"

    # Get domid and read Xen console PTY from xenstore
    local domid pty_path
    domid=$(xl domid "$domname" 2>/dev/null) || die "create: failed to get domid for $domname"
    pty_path=$(xenstore-read "/local/domain/$domid/console/tty" 2>/dev/null) || true
    log " domid=$domid pty=$pty_path"

    if [ -n "$pty_path" ]; then
        echo "$pty_path" > "$dir/pty"
    fi

    # Terminal mode: send PTY fd to shim via console-socket (SCM_RIGHTS).
    # Success is only logged when the helper actually succeeded.
    if [ -n "$console_socket" ] && [ -n "$pty_path" ]; then
        if command -v vxn-sendtty >/dev/null 2>&1; then
            if vxn-sendtty "$console_socket" "$pty_path"; then
                log " Sent PTY fd to console-socket"
            else
                log " WARNING: vxn-sendtty failed (socket=$console_socket pty=$pty_path)"
            fi
        else
            log " WARNING: vxn-sendtty not found, cannot send PTY to shim"
        fi
    fi

    # Persistent log dir — survives container deletion by shim
    local logdir="/var/log/vxn-oci-runtime/containers/$container_id"
    mkdir -p "$logdir"

    # Monitor process: tracks domain lifecycle and captures output.
    #
    # Non-terminal mode: xl console captures the domain's serial output.
    # When the domain dies, xl console exits (PTY closes). We immediately
    # extract content between OUTPUT_START/END markers and write to stdout.
    # stdout is the shim's pipe → containerd copies to client FIFO → ctr.
    #
    # CRITICAL: We use "wait" on xl console instead of polling xl list.
    # Polling with sleep 5 was too slow — the shim detected "stopped" and
    # killed the monitor before it had a chance to output. Using wait gives
    # us instant reaction when the domain dies.
    #
    # Terminal mode (console-socket): the shim owns the PTY exclusively.
    # We just wait for the domain to exit without capturing console.
    local _dn="$domname" _logdir="$logdir" _csock="$console_socket"
    (
        # The subshell inherits `set -e` from the script. `wait` returns
        # the exit status of xl console, so a non-zero exit there would
        # terminate the monitor before any output is relayed. The monitor
        # is best-effort from here on, so errexit is switched off.
        set +e

        if [ -z "$_csock" ]; then
            # Non-terminal: capture console to persistent log dir
            xl console "$_dn" > "$_logdir/console.log" 2>&1 &
            _cpid=$!

            # Wait for xl console to exit — domain death closes the PTY,
            # which causes xl console to exit immediately. No polling delay.
            wait "$_cpid" 2>/dev/null

            # Extract output between markers and write to stdout.
            # stdout IS the shim's pipe (confirmed: fd1=pipe). The shim's
            # io.Copy goroutine reads from this pipe and writes to the
            # containerd client FIFO. ctr reads from the FIFO.
            if [ -f "$_logdir/console.log" ]; then
                _relay=false
                while IFS= read -r _line; do
                    _line="${_line%%$'\r'}"
                    case "$_line" in
                        *===OUTPUT_START===*) _relay=true; continue ;;
                        *===OUTPUT_END===*)   _relay=false; continue ;;
                        *) [ "$_relay" = "true" ] && printf '%s\n' "$_line" ;;
                    esac
                done < "$_logdir/console.log"
            fi
        else
            # Terminal mode: shim owns PTY — just wait for domain death
            while xl list "$_dn" >/dev/null 2>&1; do sleep 2; done
        fi
    ) &
    local monitor_pid=$!

    # Write monitor PID to --pid-file (runc shim monitors /proc/<pid>)
    # Use printf — shim parses with strconv.Atoi which rejects trailing newlines
    if [ -n "$pid_file" ]; then
        printf '%s' "$monitor_pid" > "$pid_file"
    fi
    printf '%s' "$monitor_pid" > "$dir/monitor.pid"

    log " monitor PID=$monitor_pid"

    # Write OCI state
    local created
    created=$(date -u '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo "1970-01-01T00:00:00Z")
    write_state "$container_id" "created" "$monitor_pid" "$bundle" "$created"

    log "CREATE: done"
}
| 374 | |||
# cmd_start <container-id>
# OCI "start": unpause the domain created by cmd_create and record the
# container as running. Dies when the container or its domain is missing
# or when xl unpause fails.
cmd_start() {
    local container_id="$1"
    if [ -z "$container_id" ]; then
        die "start: container ID required"
    fi

    log "START: id=$container_id"
    load_state "$container_id"

    local state_path
    state_path=$(state_dir "$container_id")
    local xen_name
    xen_name=$(cat "$state_path/domname")

    # The domain must still be known to Xen before it can be resumed.
    if ! xl list "$xen_name" >/dev/null 2>&1; then
        die "start: domain $xen_name not found"
    fi

    if ! xl unpause "$xen_name" >> "$LOG_FILE" 2>&1; then
        die "start: xl unpause failed"
    fi

    # Carry the existing pid/bundle/created over; only the status changes.
    local init_pid bundle_path created_at
    init_pid=$(read_state_pid "$container_id")
    bundle_path=$(read_state_field "$container_id" "bundle")
    created_at=$(read_state_field "$container_id" "created")
    write_state "$container_id" "running" "$init_pid" "$bundle_path" "$created_at"

    log "START: done"
}
| 402 | |||
# cmd_state <container-id>
# OCI "state": print the container's state document as one line of JSON.
#
# A stored status of "running" or "created" is downgraded to "stopped"
# (and persisted) once the monitor process recorded as the init PID is no
# longer alive.
cmd_state() {
    local container_id="$1"
    [ -n "$container_id" ] || die "state: container ID required"

    local state_file
    state_file="$(state_dir "$container_id")/state.json"
    if [ ! -f "$state_file" ]; then
        die "container $container_id does not exist"
    fi

    # Read stored state
    local status pid bundle created
    status=$(read_state_field "$container_id" "status")
    pid=$(read_state_pid "$container_id")
    bundle=$(read_state_field "$container_id" "bundle")
    created=$(read_state_field "$container_id" "created")

    # The monitor process (init PID) is the authority for task liveness.
    # Even after the Xen domain exits, the monitor may still be relaying
    # console output to the shim's pipe, so "stopped" is only reported
    # once that PID is really gone. Reporting it earlier lets the shim
    # trigger kill/delete while output is still in flight.
    case "$status" in
        running|created)
            if ! { [ -n "$pid" ] && [ "$pid" -gt 0 ] 2>/dev/null && kill -0 "$pid" 2>/dev/null; }; then
                status="stopped"
                write_state "$container_id" "stopped" "$pid" "$bundle" "$created"
            fi
            ;;
    esac

    # Output OCI state JSON to stdout
    printf '{"ociVersion":"%s","id":"%s","status":"%s","pid":%s,"bundle":"%s","created":"%s","annotations":{}}\n' \
        "$OCI_VERSION" "$container_id" "$status" "${pid:-0}" "$bundle" "$created"
}
| 442 | |||
# cmd_kill <container-id> [signal]
# OCI "kill": map a signal onto a Xen domain action.
#
#   9/SIGKILL/KILL, 2/SIGINT/INT   xl destroy
#   15/SIGTERM/TERM (default)      xl shutdown, wait up to 10 s, then xl destroy
#   anything else                  xl shutdown only (treated as SIGTERM,
#                                  without the forced destroy)
#
# xl failures are logged to $LOG_FILE and otherwise ignored. The stored
# status is set to "stopped" only when the domain is gone afterwards; when
# it is still present (shutdown merely requested, or destroy failed) the
# stored status is left alone and cmd_state decides from the liveness of
# the monitor process.
cmd_kill() {
    local container_id="$1"
    local signal="${2:-SIGTERM}"
    [ -n "$container_id" ] || die "kill: container ID required"

    log "KILL: id=$container_id signal=$signal"
    load_state "$container_id"

    local dir
    dir=$(state_dir "$container_id")
    local domname
    domname=$(cat "$dir/domname")

    # Normalize signal: accept both numeric and symbolic forms
    case "$signal" in
        9|SIGKILL|KILL|2|SIGINT|INT)
            xl destroy "$domname" >> "$LOG_FILE" 2>&1 || true
            ;;
        15|SIGTERM|TERM|"")
            xl shutdown "$domname" >> "$LOG_FILE" 2>&1 || true
            # Wait briefly for graceful shutdown, then force destroy
            local i
            for i in 1 2 3 4 5 6 7 8 9 10; do
                xl list "$domname" >/dev/null 2>&1 || break
                sleep 1
            done
            xl destroy "$domname" >> "$LOG_FILE" 2>&1 || true
            ;;
        *)
            # Unknown signal — treat as SIGTERM
            xl shutdown "$domname" >> "$LOG_FILE" 2>&1 || true
            ;;
    esac

    # Update state, but never claim "stopped" for a domain Xen still lists.
    if xl list "$domname" >/dev/null 2>&1; then
        log "KILL: domain $domname still present, stored state left unchanged"
    else
        local pid bundle created
        pid=$(read_state_pid "$container_id")
        bundle=$(read_state_field "$container_id" "bundle")
        created=$(read_state_field "$container_id" "created")
        write_state "$container_id" "stopped" "$pid" "$bundle" "$created"
    fi

    log "KILL: done"
}
| 489 | |||
# cmd_delete [--force|-f] <container-id>
# OCI "delete": destroy any remnant Xen domain, signal the monitor process
# and remove the container's state directory (including its disk image).
#
# The domain is always destroyed when still present, with or without
# --force. --force additionally makes the command idempotent: deleting a
# container that has no state directory succeeds instead of failing, so a
# repeated forced cleanup does not report an error. Without --force a
# missing container is still an error.
cmd_delete() {
    local container_id=""
    local force=false

    # Parse arguments
    while [ $# -gt 0 ]; do
        case "$1" in
            --force|-f) force=true; shift ;;
            -*) shift ;;
            *)
                if [ -z "$container_id" ]; then
                    container_id="$1"
                fi
                shift
                ;;
        esac
    done

    [ -n "$container_id" ] || die "delete: container ID required"

    log "DELETE: id=$container_id force=$force"

    local dir
    dir=$(state_dir "$container_id")
    if [ ! -d "$dir" ]; then
        if [ "$force" = "true" ]; then
            log "DELETE: no state for $container_id, nothing to do"
            return 0
        fi
        die "container $container_id does not exist"
    fi

    # Clean up Xen domain if still present.
    # The shim only calls delete after the init PID (monitor) has exited,
    # meaning the task is complete. The domain may still be shutting down —
    # always destroy it as part of cleanup.
    if [ -f "$dir/domname" ]; then
        local domname
        domname=$(cat "$dir/domname")
        if xl list "$domname" >/dev/null 2>&1; then
            xl destroy "$domname" >> "$LOG_FILE" 2>&1 || true
        fi
    fi

    # Signal the monitor process if it is still around; a PID that has
    # already exited is not an error.
    if [ -f "$dir/monitor.pid" ]; then
        local mpid
        mpid=$(cat "$dir/monitor.pid")
        if [ -n "$mpid" ]; then
            kill "$mpid" 2>/dev/null || true
        fi
    fi

    # Remove state directory (includes disk images)
    rm -rf "$dir"

    log "DELETE: done"
}
| 540 | |||
# cmd_features
# Print the runtime feature document as JSON on stdout. Hooks, mount
# options, namespaces and capabilities are reported as empty lists and
# cgroup v1/v2, seccomp, apparmor and selinux as disabled.
# ociVersionMax follows $OCI_VERSION.
cmd_features() {
    cat <<FEATURES_JSON
{
  "ociVersionMin": "1.0.0",
  "ociVersionMax": "$OCI_VERSION",
  "hooks": [],
  "mountOptions": [],
  "linux": {
    "namespaces": [],
    "capabilities": [],
    "cgroup": {
      "v1": false,
      "v2": false
    },
    "seccomp": {
      "enabled": false
    },
    "apparmor": {
      "enabled": false
    },
    "selinux": {
      "enabled": false
    }
  },
  "annotations": {
    "io.containerd.runc.v2.runtime_type": "vm"
  }
}
FEATURES_JSON
}
| 571 | |||
# cmd_logs <container-id>
# Print the container's captured output: the lines of console.log that lie
# between the ===OUTPUT_START=== and ===OUTPUT_END=== markers, with any
# trailing carriage return removed. The persistent log directory is
# preferred over the state directory; dies when neither has a console.log.
cmd_logs() {
    local container_id="$1"
    if [ -z "$container_id" ]; then
        die "logs: container ID required"
    fi

    # Check persistent log dir first, then state dir
    local persistent_log="/var/log/vxn-oci-runtime/containers/$container_id/console.log"
    local state_log
    state_log="$(state_dir "$container_id")/console.log"

    local source_log=""
    if [ -f "$persistent_log" ]; then
        source_log="$persistent_log"
    elif [ -f "$state_log" ]; then
        source_log="$state_log"
    else
        die "no logs for $container_id"
    fi

    # Extract content between OUTPUT_START/END markers (non-terminal mode)
    local in_output=false
    while IFS= read -r line; do
        line="${line%%$'\r'}"
        case "$line" in
            *===OUTPUT_START===*)
                in_output=true
                ;;
            *===OUTPUT_END===*)
                in_output=false
                ;;
            *)
                # Marker lines themselves are never printed.
                if [ "$in_output" = "true" ]; then
                    printf '%s\n' "$line"
                fi
                ;;
        esac
    done < "$source_log"
}
| 605 | |||
| 606 | # ============================================================================ | ||
| 607 | # Main | ||
| 608 | # ============================================================================ | ||
| 609 | |||
# Parse global options before command
while [ $# -gt 0 ]; do
    case "$1" in
        --root)           RUNTIME_ROOT="$2"; shift 2 ;;
        --root=*)         RUNTIME_ROOT="${1#--root=}"; shift ;;
        --log)            LOG_FILE="$2"; shift 2 ;;
        --log=*)          LOG_FILE="${1#--log=}"; shift ;;
        --log-format)     shift 2 ;;   # accepted but ignored
        --log-format=*)   shift ;;
        --systemd-cgroup) shift ;;     # accepted but ignored
        -*)               shift ;;     # skip other global flags
        *)                break ;;     # first non-flag is the command
    esac
done

# Create the state root only after --root has been parsed, so that a
# caller-supplied root is the one that gets prepared (and the default
# location is not created as a side effect). Best-effort, as before.
mkdir -p "$RUNTIME_ROOT/containers" 2>/dev/null || true

command="${1:-}"
shift || true

# Dispatch to the OCI runtime command implementation.
case "$command" in
    create)   cmd_create "$@" ;;
    start)    cmd_start "$@" ;;
    state)    cmd_state "$@" ;;
    kill)     cmd_kill "$@" ;;
    delete)   cmd_delete "$@" ;;
    features) cmd_features "$@" ;;
    logs)     cmd_logs "$@" ;;
    --version|version)
        echo "vxn-oci-runtime version 1.0.0"
        echo "spec: $OCI_VERSION"
        ;;
    *)
        if [ -n "$command" ]; then
            log "Unknown command: $command (args: $*)"
        fi
        echo "Usage: vxn-oci-runtime <command> [args...]" >&2
        echo "Commands: create, start, state, kill, delete, features, logs" >&2
        exit 1
        ;;
esac
diff --git a/recipes-containers/vcontainer/files/vxn-sendtty.c b/recipes-containers/vcontainer/files/vxn-sendtty.c new file mode 100644 index 00000000..a253b129 --- /dev/null +++ b/recipes-containers/vcontainer/files/vxn-sendtty.c | |||
| @@ -0,0 +1,90 @@ | |||
| 1 | /* | ||
| 2 | * SPDX-FileCopyrightText: Copyright (C) 2025 Bruce Ashfield | ||
| 3 | * SPDX-License-Identifier: GPL-2.0-only | ||
| 4 | * | ||
| 5 | * vxn-sendtty - Send a PTY fd to a containerd shim via SCM_RIGHTS | ||
| 6 | * | ||
| 7 | * Usage: vxn-sendtty <console-socket-path> <pty-path> | ||
| 8 | * | ||
| 9 | * Opens pty-path, connects to console-socket (Unix socket), and sends | ||
| 10 | * the PTY fd via sendmsg() with SCM_RIGHTS. This is the OCI runtime | ||
| 11 | * protocol for terminal mode (--console-socket): the shim receives the | ||
| 12 | * PTY master and bridges it to the user's terminal. | ||
| 13 | * | ||
| 14 | * Shell can't do SCM_RIGHTS natively, hence this small C helper. | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include <stdio.h> | ||
| 18 | #include <stdlib.h> | ||
| 19 | #include <string.h> | ||
| 20 | #include <unistd.h> | ||
| 21 | #include <fcntl.h> | ||
| 22 | #include <sys/socket.h> | ||
| 23 | #include <sys/un.h> | ||
| 24 | |||
/*
 * Open the PTY at argv[2], connect to the Unix stream socket at argv[1]
 * and send the PTY file descriptor over it as SCM_RIGHTS ancillary data,
 * accompanied by a single zero byte of regular payload.
 *
 * Returns 0 on success and 1 on a usage error, an over-long socket path,
 * or any failing system call (reported on stderr).
 */
int main(int argc, char *argv[])
{
	int pty_fd = -1;
	int sock_fd = -1;
	int status = 1;
	size_t path_len;
	struct sockaddr_un addr;
	struct msghdr msg;
	struct iovec iov;
	char buf[1] = {0};
	/*
	 * The union gives the control buffer the alignment of
	 * struct cmsghdr; a bare char array carries no such guarantee.
	 */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctrl;
	struct cmsghdr *cmsg;

	if (argc != 3) {
		fprintf(stderr, "Usage: %s <console-socket-path> <pty-path>\n",
			argv[0]);
		return 1;
	}

	/*
	 * Refuse a path that does not fit in sun_path. Truncating it
	 * silently would make connect() target a different path.
	 */
	path_len = strlen(argv[1]);
	if (path_len >= sizeof(addr.sun_path)) {
		fprintf(stderr, "%s: console socket path too long (max %zu bytes)\n",
			argv[0], sizeof(addr.sun_path) - 1);
		return 1;
	}

	pty_fd = open(argv[2], O_RDWR | O_NOCTTY);
	if (pty_fd < 0) {
		perror("open pty");
		goto out;
	}

	sock_fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (sock_fd < 0) {
		perror("socket");
		goto out;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	memcpy(addr.sun_path, argv[1], path_len + 1);

	if (connect(sock_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		goto out;
	}

	memset(&msg, 0, sizeof(msg));
	memset(&ctrl, 0, sizeof(ctrl));
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl.buf;
	msg.msg_controllen = sizeof(ctrl.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &pty_fd, sizeof(int));

	if (sendmsg(sock_fd, &msg, 0) < 0) {
		perror("sendmsg");
		goto out;
	}

	status = 0;

out:
	/* Single cleanup path: close whichever descriptors were opened. */
	if (sock_fd >= 0)
		close(sock_fd);
	if (pty_fd >= 0)
		close(pty_fd);
	return status;
}
