From c35c9680404dd523f5077505e461def00ae1688b Mon Sep 17 00:00:00 2001 From: Miruna Paun Date: Thu, 13 Jul 2017 18:20:00 +0200 Subject: Removed all mentions of the word "Platform" LXCR-7891 Where it didn't do more harm than good to do so. Signed-off-by: Miruna Paun --- .../doc/hypervisor_virtualization.xml | 741 +++++++++++++++++++++ 1 file changed, 741 insertions(+) create mode 100644 doc/book-enea-nfv-access-guide/doc/hypervisor_virtualization.xml (limited to 'doc/book-enea-nfv-access-guide/doc/hypervisor_virtualization.xml') diff --git a/doc/book-enea-nfv-access-guide/doc/hypervisor_virtualization.xml b/doc/book-enea-nfv-access-guide/doc/hypervisor_virtualization.xml new file mode 100644 index 0000000..f7f186c --- /dev/null +++ b/doc/book-enea-nfv-access-guide/doc/hypervisor_virtualization.xml @@ -0,0 +1,741 @@ + + + + Hypervisor Virtualization + + KVM, the Kernel-based Virtual Machine, is a virtualization + infrastructure for the Linux kernel which turns it into a hypervisor. KVM + requires a processor with hardware virtualization extensions. + + KVM uses QEMU, an open source machine emulator and virtualizer, to + virtualize a complete system. With KVM it is possible to run multiple guests + of a variety of operating systems, each with a complete set of virtualized + hardware. +
Launching a Virtual Machine

QEMU can make use of KVM when running a target architecture that is the same as the host architecture. For instance, when running qemu-system-x86_64 on an x86-64 compatible processor (with the Intel VT or AMD-V virtualization extensions), you can take advantage of KVM acceleration, which benefits both the host and the guest system.

Enea Linux includes an optimized version of QEMU with KVM-only support. To use KVM, pass --enable-kvm to QEMU.

The following is an example of starting a guest:

taskset -c 0,1 qemu-system-x86_64 \
-cpu host -M q35 -smp cores=2,sockets=1 \
-vcpu 0,affinity=0 -vcpu 1,affinity=1 \
-enable-kvm -nographic \
-kernel bzImage \
-drive file=enea-image-virtualization-guest-qemux86-64.ext4,if=virtio,format=raw \
-append 'root=/dev/vda console=ttyS0,115200' \
-m 4096 \
-object memory-backend-file,id=mem,size=4096M,mem-path=/dev/hugepages,share=on \
-numa node,memdev=mem -mem-prealloc
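The memory-backend-file object above expects hugepages to be available under /dev/hugepages on the host. A minimal host-side sketch for reserving and mounting them (the count of 2048 x 2 MB pages is an illustrative assumption matching the 4 GB guest above; size it to the memory you intend to allocate):

$ echo 2048 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
$ mkdir -p /dev/hugepages
$ mount -t hugetlbfs hugetlbfs /dev/hugepages
$ grep Huge /proc/meminfo

Alternatively, hugepages can be reserved on the kernel command line, for example with hugepagesz=1G hugepages=4.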
+ +
Main QEMU boot options

The pertinent boot options for the QEMU emulator are detailed below:

SMP - at least 2 cores should be enabled in order to isolate applications running in virtual machines on specific cores, for better performance.

-smp cores=2,threads=1,sockets=1 \

CPU affinity - associate virtual CPUs with physical CPUs and optionally assign a default real time priority to the virtual CPU process in the host kernel. This option allows you to start QEMU vCPUs on isolated physical CPUs.

-vcpu 0,affinity=0 \

Hugepages - KVM guests can be deployed with huge page memory support in order to reduce memory translation overhead and improve performance by reducing CPU cache usage. By using huge pages for a KVM guest, less memory is used for page tables and TLB (Translation Lookaside Buffer) misses are reduced, thereby significantly increasing performance, especially in memory-intensive situations.

-object memory-backend-file,id=mem,size=4096M,mem-path=/dev/hugepages,share=on \

Memory preallocation - preallocating huge pages at startup can improve performance, but may increase QEMU boot time.

-mem-prealloc \

Enable realtime characteristics - run QEMU with realtime features. Although the name suggests that "-realtime" alone enables realtime behavior, it is only an umbrella for options that are partially realtime. If you are running in a realtime or low-latency environment, you do not want guest pages to be swapped out, which is what mlock=on ensures. If you want VM density instead, you may want swappable VMs, thus mlock=off.

-realtime mlock=on \

If the hardware does not have an IOMMU (known as "Intel VT-d" on Intel-based machines and "AMD I/O Virtualization Technology" on AMD-based machines), it will not be possible to assign devices in KVM. Virtualization Technology features (VT-d, VT-x, etc.) must be enabled in the BIOS of the host target before starting a virtual machine.
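Before booting guests it is worth confirming that the host actually exposes these features. A short, hedged check (output differs between platforms):

$ grep -cE 'vmx|svm' /proc/cpuinfo
$ lsmod | grep kvm
$ dmesg | grep -i -e DMAR -e IOMMU

A non-zero count from the first command indicates Intel VT-x or AMD-V support, the second should list kvm_intel or kvm_amd, and the third shows whether an IOMMU was detected and enabled.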
+ +
+ Networking in guest + +
Using vhost-user support

The goal of vhost-user is to implement a Virtio transport, staying as close as possible to the vhost paradigm of using shared memory, ioeventfds and irqfds. A UNIX domain socket based mechanism allows setting up the resources used by a number of Vrings shared between two userspace processes; the Vrings are placed in shared memory.

To run QEMU with the vhost-user backend, you have to provide the named UNIX domain socket, which needs to be already opened by the backend:

-object memory-backend-file,id=mem,size=4096M,mem-path=/dev/hugepages,share=on \
-chardev socket,id=char0,path=/var/run/openvswitch/vhost-user1 \
-netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \
-device virtio-net-pci,netdev=mynet1,mac=52:54:00:00:00:01 \

The vhost-user standard uses a client-server model. The server creates and manages the vhost-user sockets and the client connects to the sockets created by the server. It is recommended to use QEMU as the server, so that the vhost-user client can be restarted without affecting the server; otherwise, if the server side dies, all clients need to be restarted.

Using vhost-user in QEMU as the server offers the flexibility to stop and start the virtual machine with no impact on the virtual switch on the host (vhost-user-client):

-chardev socket,id=char0,path=/var/run/openvswitch/vhost-user1,server \
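The socket path used above is created by the vhost-user backend, typically the virtual switch. As a hedged illustration only (assuming an OVS-DPDK bridge named ovsbr0 and the default OVS run directory; see the Open vSwitch chapter for the authoritative steps), the backend port could be created roughly like this:

# QEMU as vhost-user client (OVS creates the socket):
$ ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuser
# QEMU as vhost-user server (OVS connects to the socket created by QEMU):
$ ovs-vsctl add-port ovsbr0 vhostclient1 -- set Interface vhostclient1 \
  type=dpdkvhostuserclient options:vhost-server-path=/var/run/openvswitch/vhost-user1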
+ +
+ Using TAP Interfaces + + QEMU can use TAP interfaces to provide full networking capability + for the guest OS: + + -netdev tap,id=net0,ifname=tap0,script=no,downscript=no \ +-device virtio-net-pci,netdev=net0,mac=22:EA:FB:A8:25:AE \ +
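With script=no and downscript=no, QEMU does not configure the interface itself, so tap0 must exist on the host before the guest is started. A minimal sketch, assuming a Linux bridge named br0 provides connectivity (interface and bridge names are illustrative):

$ ip tuntap add dev tap0 mode tap
$ ip link set dev tap0 up
$ ip link set dev tap0 master br0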
+ +
VFIO passthrough VF (SR-IOV) to guest

The KVM hypervisor supports attaching PCI devices on the host system to guests. PCI passthrough allows guests to have exclusive access to PCI devices for a range of tasks, and allows PCI devices to appear and behave as if they were physically attached to the guest operating system.

Preparing an Intel system for PCI passthrough:

Enable the Intel VT-d extensions in BIOS.

Activate Intel VT-d in the kernel by using intel_iommu=on as a kernel boot parameter.

Allow unsafe interrupts in case the system doesn't support interrupt remapping. This can be done using vfio_iommu_type1.allow_unsafe_interrupts=1 as a kernel boot parameter.

Create the guest with direct passthrough via the VFIO framework like so:

-device vfio-pci,host=0000:03:10.2 \

Before starting QEMU, one or more Virtual Functions (VFs) must be created on the host, so that they can be allocated to the guest network:

$ echo 2 > /sys/class/net/eno3/device/sriov_numvfs
$ modprobe vfio_pci
$ dpdk-devbind.py --bind=vfio-pci 0000:03:10.2
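A quick, hedged way to confirm that the IOMMU is active and that the VF really is bound to vfio-pci (the PCI address follows the example above and will differ on other systems):

$ find /sys/kernel/iommu_groups/ -type l | head
$ readlink /sys/bus/pci/devices/0000:03:10.2/driver
$ dpdk-devbind.py --status

The first command prints nothing if no IOMMU groups exist, the second should resolve to the vfio-pci driver, and the third lists which devices are bound to DPDK-compatible drivers.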
+ +
+ Multi-queue + +
QEMU multi-queue support configuration

-chardev socket,id=char0,path=/var/run/openvswitch/vhost-user1 \
-netdev type=vhost-user,id=net0,chardev=char0,queues=2 \
-device virtio-net-pci,netdev=net0,mac=22:EA:FB:A8:25:AE,mq=on,vectors=6

where vectors is calculated as 2 + 2 * number of queues.
+ +
Inside guest

Linux kernel virtio-net driver (one queue is enabled by default):

$ ethtool -L eth0 combined 2

DPDK Virtio PMD:

$ testpmd -c 0x7 -- -i --rxq=2 --txq=2 --nb-cores=2 ...

For QEMU documentation please see: https://qemu.weilnetz.de/doc/qemu-doc.html.
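To verify from inside the guest that the additional queues were enabled, the current channel configuration can be inspected (a hedged check, using the same interface name as above):

$ ethtool -l eth0

The "Combined" value under "Current hardware settings" should match the number of queues configured in QEMU.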
+
+
+ +
Libvirt

One way to manage guests in Enea NFV Access is by using libvirt. Libvirt is used in conjunction with a daemon (libvirtd) and a command line utility (virsh) to manage virtualized environments.

The libvirt library is a hypervisor-independent virtualization API and toolkit that is able to interact with the virtualization capabilities of a range of operating systems. Libvirt provides a common, generic and stable layer to securely manage domains on a node. As nodes may be remotely located, libvirt provides all methods required to provision, create, modify, monitor, control, migrate and stop the domains, within the limits of hypervisor support for these operations.

The libvirt daemon runs on the Enea NFV Access host. All tools built on the libvirt API connect to the daemon to request the desired operation, and to collect information about the configuration and resources of the host system and guests. virsh is a command line interface tool for managing guests and the hypervisor. The virsh tool is built on the libvirt management API.

Major functionality provided by libvirt

The following is a summary from the libvirt home page describing the major libvirt features:

VM management: Various domain lifecycle operations such as start, stop, pause, save, restore, and migrate. Hotplug operations for many device types including disk and network interfaces, memory, and CPUs.

Remote machine support: All libvirt functionality is accessible on any machine running the libvirt daemon, including remote machines. A variety of network transports are supported for connecting remotely, with the simplest being SSH, which requires no extra explicit configuration. For more information, see: http://libvirt.org/remote.html.

Network interface management: Any host running the libvirt daemon can be used to manage physical and logical network interfaces. Enumerate existing interfaces, as well as configure (and create) interfaces, bridges, vlans, and bond devices. For more details see: https://fedorahosted.org/netcf/.

Virtual NAT and Route based networking: Any host running the libvirt daemon can manage and create virtual networks. Libvirt virtual networks use firewall rules to act as a router, providing VMs transparent access to the host machine's network. For more information, see: http://libvirt.org/archnetwork.html.

Storage management: Any host running the libvirt daemon can be used to manage various types of storage: create file images of various formats (raw, qcow2, etc.), mount NFS shares, enumerate existing LVM volume groups, create new LVM volume groups and logical volumes, partition raw disk devices, mount iSCSI shares, and much more. For more details, see: http://libvirt.org/storage.html.

Libvirt Configuration: A properly running libvirt requires that the following elements be in place:

Configuration files, located in the directory /etc/libvirt. They include the daemon's configuration file libvirtd.conf, and hypervisor-specific configuration files, like qemu.conf for QEMU.

A running libvirtd daemon. The daemon is started automatically on the Enea NFV Access host.

Configuration files for the libvirt domains, or guests, to be managed by the KVM host. The specifics for guest domains shall be defined in an XML file of a format specified at http://libvirt.org/formatdomain.html.
+ XML formats for other structures are specified at http://libvirt.org/format.html. + + + + + +
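Once the daemon is running and guests are defined, day-to-day management is done through virsh. A few illustrative commands (the remote URI and host name are assumptions, not fixed values):

virsh list --all
virsh dominfo kvm-example-guest
virsh -c qemu+ssh://root@remote-host/system list

The first lists defined and running guests on the local daemon, the second prints basic information about one guest, and the third manages a remote libvirt daemon over SSH.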
+ Booting a KVM Guest + + There are several ways to boot a KVM guest. Here we describe how + to boot using a raw image. A direct kernel boot can be performed by + transferring the guest kernel and the file system files to the host and + specifying a <kernel> and an + <initrd> element inside the + <os> element of the guest XML file, as in the + following example: + + <os> + <kernel>bzImage</kernel> +</os> +<devices> + <disk type='file' device='disk'> + <driver name='qemu' type='raw' cache='none'/> + <source file='enea-image-virtualization-guest-qemux86-64.ext4'/> + <target dev='vda' bus='virtio'/> + </disk> +</devices> +
+ +
+ Starting a Guest + + Command virsh create starts a guest: + + virsh create example-guest-x86.xml + + If further configurations are needed before the guest is reachable + through ssh, a console can be started using command + virsh console. The example below shows how to start a + console where kvm-example-guest is the name of the guest defined in the + guest XML file: + + virsh console kvm-example-guest + + This requires that the guest domain has a console configured in + the guest XML file: + + <os> + <cmdline>console=ttyS0,115200</cmdline> +</os> +<devices> + <console type='pty'> + <target type='serial' port='0'/> + </console> +</devices> +
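The console can be left with Ctrl+]. When the guest is no longer needed it can be stopped again; a brief example using the same guest name (shutdown relies on the guest handling the ACPI request, destroy stops it immediately):

virsh shutdown kvm-example-guest
virsh destroy kvm-example-guest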
+ +
Isolation

It may be desirable to isolate execution in a guest to a specific guest core. It might also be desirable to run a guest on a specific host core.

To pin the virtual CPUs of the guest to specific cores, configure the <cputune> contents as follows:

First explicitly state on which host core each guest core shall run, by mapping vcpu to cpuset in the <vcpupin> tag.

In the <cputune> tag it is further possible to specify on which CPU the emulator shall run, by adding the cpuset to the <emulatorpin> tag.

<vcpu placement='static'>2</vcpu>
<cputune>
  <vcpupin vcpu='0' cpuset='2'/>
  <vcpupin vcpu='1' cpuset='3'/>
  <emulatorpin cpuset="2"/>
</cputune>

libvirt will group all threads belonging to a qemu instance into cgroups created for that purpose. It is possible to supply a base name for those cgroups using the <resource> tag:

<resource>
  <partition>/rt</partition>
</resource>
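Once the guest is running, the effective pinning can be inspected from the host with virsh. A hedged example, using the guest name vm_vhost from the configuration examples later in this chapter:

virsh vcpupin vm_vhost
virsh emulatorpin vm_vhost

Both commands, when given no CPU list, simply display the current vCPU and emulator thread placement.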
+ +
Networking using libvirt

The command virsh net-create starts a network. If any networks are listed in the guest XML file, those networks must be started before the guest is started. As an example, if the network is defined in a file named example-net.xml, it is started as follows:

virsh net-create example-net.xml

where example-net.xml contains:

<network>
  <name>sriov</name>
  <forward mode='hostdev' managed='yes'>
    <pf dev='eno3'/>
  </forward>
</network>

libvirt is a virtualization API that supports virtual network creation. These networks can be connected to guests and containers by referencing the network in the guest XML file. It is possible to have a virtual network persistently running on the host by creating the network with the command virsh net-define instead of the previously mentioned virsh net-create.

An example for the sample network defined in meta-vt/recipes-example/virt-example/files/example-net.xml:

virsh net-define example-net.xml

The command virsh net-autostart enables a persistent network to start automatically when the libvirt daemon starts:

virsh net-autostart example-net

The guest configuration file (XML) must be updated to access the newly created network, like so:

<interface type='network'>
  <source network='sriov'/>
</interface>

Presented below are a few modes of network access from a guest using virsh:

vhost-user interface

See the Open vSwitch chapter on how to create a vhost-user interface using Open vSwitch. Currently there is no Open vSwitch support for networks that are managed by libvirt (e.g. NAT). As of now, only bridged networks are supported (those where the user has to manually create the bridge).

<interface type='vhostuser'>
  <mac address='00:00:00:00:00:01'/>
  <source type='unix' path='/var/run/openvswitch/vhost-user1' mode='client'/>
  <model type='virtio'/>
  <driver queues='1'>
    <host mrg_rxbuf='off'/>
  </driver>
</interface>

PCI passthrough (SR-IOV)

The KVM hypervisor supports attaching PCI devices on the host system to guests. PCI passthrough allows guests to have exclusive access to PCI devices for a range of tasks, and allows PCI devices to appear and behave as if they were physically attached to the guest operating system.

Preparing an Intel system for PCI passthrough is done like so:

Enable the Intel VT-d extensions in BIOS.

Activate Intel VT-d in the kernel by using intel_iommu=on as a kernel boot parameter.

Allow unsafe interrupts in case the system doesn't support interrupt remapping. This can be done using vfio_iommu_type1.allow_unsafe_interrupts=1 as a kernel boot parameter.

VFs must be created on the host before starting the guest:

$ echo 2 > /sys/class/net/eno3/device/sriov_numvfs
$ modprobe vfio_pci
$ dpdk-devbind.py --bind=vfio-pci 0000:03:10.0

<interface type='hostdev' managed='yes'>
  <source>
    <address type='pci' domain='0x0' bus='0x03' slot='0x10' function='0x0'/>
  </source>
  <mac address='52:54:00:6d:90:02'/>
</interface>

Bridge interface

In case an OVS bridge exists on the host, it can be used to connect the guest:

<interface type='bridge'>
  <mac address='52:54:00:71:b1:b6'/>
  <source bridge='ovsbr0'/>
  <virtualport type='openvswitch'/>
  <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
</interface>

For further details on the network XML format, see http://libvirt.org/formatnetwork.html.
+ +
+ Libvirt guest configuration examples + +
+ Guest configuration with vhost-user interface + + <domain type='kvm'> + <name>vm_vhost</name> + <uuid>4a9b3f53-fa2a-47f3-a757-dd87720d9d1d</uuid> + <memory unit='KiB'>4194304</memory> + <currentMemory unit='KiB'>4194304</currentMemory> + <memoryBacking> + <hugepages> + <page size='1' unit='G' nodeset='0'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>2</vcpu> + <cputune> + <shares>4096</shares> + <vcpupin vcpu='0' cpuset='4'/> + <vcpupin vcpu='1' cpuset='5'/> + <emulatorpin cpuset='4,5'/> + </cputune> + <os> + <type arch='x86_64' machine='pc'>hvm</type> + <kernel>/mnt/qemu/bzImage</kernel> + <cmdline>root=/dev/vda console=ttyS0,115200</cmdline> + <boot dev='hd'/> + </os> + <features> + <acpi/> + <apic/> + </features> + <cpu mode='host-model'> + <model fallback='allow'/> + <topology sockets='2' cores='1' threads='1'/> + <numa> + <cell id='0' cpus='0-1' memory='4194304' unit='KiB' memAccess='shared'/> + </numa> + </cpu> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <disk type='file' device='disk'> + <driver name='qemu' type='raw' cache='none'/> + <source file='/mnt/qemu/enea-image-virtualization-guest-qemux86-64.ext4'/> + <target dev='vda' bus='virtio'/> + </disk> + <interface type='vhostuser'> + <mac address='00:00:00:00:00:01'/> + <source type='unix' path='/var/run/openvswitch/vhost-user1' mode='client'/> + <model type='virtio'/> + <driver queues='1'> + <host mrg_rxbuf='off'/> + </driver> + </interface> + <serial type='pty'> + <target port='0'/> + </serial> + <console type='pty'> + <target type='serial' port='0'/> + </console> + </devices> +</domain> +
+ +
+ Guest configuration with PCI passthrough + + <domain type='kvm'> + <name>vm_sriov1</name> + <uuid>4a9b3f53-fa2a-47f3-a757-dd87720d9d1d</uuid> + <memory unit='KiB'>4194304</memory> + <currentMemory unit='KiB'>4194304</currentMemory> + <memoryBacking> + <hugepages> + <page size='1' unit='G' nodeset='0'/> + </hugepages> + </memoryBacking> + <vcpu>2</vcpu> + <os> + <type arch='x86_64' machine='q35'>hvm</type> + <kernel>/mnt/qemu/bzImage</kernel> + <cmdline>root=/dev/vda console=ttyS0,115200</cmdline> + <boot dev='hd'/> + </os> + <features> + <acpi/> + <apic/> + </features> + <cpu mode='host-model'> + <model fallback='allow'/> + <topology sockets='1' cores='2' threads='1'/> + <numa> + <cell id='0' cpus='0' memory='4194304' unit='KiB' memAccess='shared'/> + </numa> + </cpu> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <disk type='file' device='disk'> + <driver name='qemu' type='raw' cache='none'/> + <source file='/mnt/qemu/enea-image-virtualization-guest-qemux86-64.ext4'/> + <target dev='vda' bus='virtio'/> + </disk> + <interface type='hostdev' managed='yes'> + <source> + <address type='pci' domain='0x0' bus='0x03' slot='0x10' function='0x0'/> + </source> + <mac address='52:54:00:6d:90:02'/> + </interface> + <serial type='pty'> + <target port='0'/> + </serial> + <console type='pty'> + <target type='serial' port='0'/> + </console> + </devices> +</domain> +
+ +
+ Guest configuration with bridge interface + + <domain type='kvm'> + <name>vm_bridge</name> + <uuid>4a9b3f53-fa2a-47f3-a757-dd87720d9d1d</uuid> + <memory unit='KiB'>4194304</memory> + <currentMemory unit='KiB'>4194304</currentMemory> + <memoryBacking> + <hugepages> + <page size='1' unit='G' nodeset='0'/> + </hugepages> + </memoryBacking> + <vcpu placement='static'>2</vcpu> + <cputune> + <shares>4096</shares> + <vcpupin vcpu='0' cpuset='4'/> + <vcpupin vcpu='1' cpuset='5'/> + <emulatorpin cpuset='4,5'/> + </cputune> + <os> + <type arch='x86_64' machine='q35'>hvm</type> + <kernel>/mnt/qemu/bzImage</kernel> + <cmdline>root=/dev/vda console=ttyS0,115200</cmdline> + <boot dev='hd'/> + </os> + <features> + <acpi/> + <apic/> + </features> + <cpu mode='host-model'> + <model fallback='allow'/> + <topology sockets='2' cores='1' threads='1'/> + <numa> + <cell id='0' cpus='0-1' memory='4194304' unit='KiB' memAccess='shared'/> + </numa> + </cpu> + <on_poweroff>destroy</on_poweroff> + <on_reboot>restart</on_reboot> + <on_crash>destroy</on_crash> + <devices> + <emulator>/usr/bin/qemu-system-x86_64</emulator> + <disk type='file' device='disk'> + <driver name='qemu' type='raw' cache='none'/> + <source file='/mnt/qemu/enea-image-virtualization-guest-qemux86-64.ext4'/> + <target dev='vda' bus='virtio'/> + </disk> + <interface type='bridge'> + <mac address='52:54:00:71:b1:b6'/> + <source bridge='ovsbr0'/> + <virtualport type='openvswitch'/> + <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> + </interface> + <serial type='pty'> + <target port='0'/> + </serial> + <console type='pty'> + <target type='serial' port='0'/> + </console> + </devices> +</domain> +
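Any of the above domain definitions can be saved to a file and managed with virsh. A short usage sketch (the file name vm_bridge.xml is an assumption):

virsh define vm_bridge.xml
virsh start vm_bridge
virsh console vm_bridge

virsh define makes the guest persistent, after which it can be started by name; alternatively virsh create vm_bridge.xml starts it as a transient guest, as shown earlier in this chapter.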
+
+
+
\ No newline at end of file -- cgit v1.2.3-54-g00ecf