/*P:100
 * This is the Launcher code, a simple program which lays out the "physical"
 * memory for the new Guest by mapping the kernel image and the virtual
 * devices, then opens /dev/lguest to tell the kernel about the Guest and
 * control it.
:*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <stdint.h>
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <netinet/in.h>
#include <net/if.h>
#include <linux/sockios.h>
#include <linux/if_tun.h>
#include <sys/uio.h>
#include <termios.h>
#include <getopt.h>
#include <assert.h>
#include <sched.h>
#include <limits.h>
#include <stddef.h>
#include <signal.h>
#include <pwd.h>
#include <grp.h>
#include <sys/user.h>
#include <linux/pci_regs.h>

#ifndef VIRTIO_F_ANY_LAYOUT
#define VIRTIO_F_ANY_LAYOUT 27
#endif

/*L:110
 * We can ignore the 43 include files we need for this program, but I do want
 * to draw attention to the use of kernel-style types.
 *
 * As Linus said, "C is a Spartan language, and so should your naming be." I
 * like these abbreviations, so we define them here. Note that u64 is always
 * unsigned long long, which works on all Linux systems: this means that we
 * can use %llu in printf for any u64.
 */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
/*:*/

#define VIRTIO_CONFIG_NO_LEGACY
#define VIRTIO_PCI_NO_LEGACY
#define VIRTIO_BLK_NO_LEGACY
#define VIRTIO_NET_NO_LEGACY

/* Use in-kernel ones, which define VIRTIO_F_VERSION_1 */
#include "../../include/uapi/linux/virtio_config.h"
#include "../../include/uapi/linux/virtio_net.h"
#include "../../include/uapi/linux/virtio_blk.h"
#include "../../include/uapi/linux/virtio_console.h"
#include "../../include/uapi/linux/virtio_rng.h"
#include <linux/virtio_ring.h>
#include "../../include/uapi/linux/virtio_pci.h"
#include <asm/bootparam.h>
#include "../../include/linux/lguest_launcher.h"

#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
#endif
/* We can have up to 256 pages for devices. */
#define DEVICE_PAGES 256
/* This will occupy 3 pages: it must be a power of 2. */
#define VIRTQUEUE_NUM 256

/*L:120
 * verbose is both a global flag and a macro. The C preprocessor allows
 * this, and although I wouldn't recommend it, it works quite nicely here.
 */
static bool verbose;
#define verbose(args...) \
        do { if (verbose) printf(args); } while(0)
/*:*/

/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max, guest_mmio;
/* The /dev/lguest file descriptor. */
static int lguest_fd;

/* a per-cpu variable indicating whose vcpu is currently running */
static unsigned int __thread cpu_id;

/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
#define MAX_PCI_DEVICES 32
/* This is our list of devices. */
struct device_list {
        /* Counter to assign interrupt numbers. */
        unsigned int next_irq;

        /* Counter to print out convenient device numbers. */
        unsigned int device_num;

        /* PCI devices. */
        struct device *pci[MAX_PCI_DEVICES];
};

/* The list of Guest devices, based on command line arguments. */
static struct device_list devices;

struct virtio_pci_cfg_cap {
        struct virtio_pci_cap cap;
        u32 pci_cfg_data; /* Data for BAR access. */
};

struct virtio_pci_mmio {
        struct virtio_pci_common_cfg cfg;
        u16 notify;
        u8 isr;
        u8 padding;
        /* Device-specific configuration follows this. */
};

/* This is the layout (little-endian) of the PCI config space. */
struct pci_config {
        u16 vendor_id, device_id;
        u16 command, status;
        u8 revid, prog_if, subclass, class;
        u8 cacheline_size, lat_timer, header_type, bist;
        u32 bar[6];
        u32 cardbus_cis_ptr;
        u16 subsystem_vendor_id, subsystem_device_id;
        u32 expansion_rom_addr;
        u8 capabilities, reserved1[3];
        u32 reserved2;
        u8 irq_line, irq_pin, min_grant, max_latency;

        /* Now, this is the linked capability list. */
        struct virtio_pci_cap common;
        struct virtio_pci_notify_cap notify;
        struct virtio_pci_cap isr;
        struct virtio_pci_cap device;
        struct virtio_pci_cfg_cap cfg_access;
};

/* The device structure describes a single device. */
struct device {
        /* The name of this device, for --verbose. */
        const char *name;

        /* Any queues attached to this device */
        struct virtqueue *vq;

        /* Is it operational */
        bool running;

        /* Has it written FEATURES_OK but not re-checked it? */
        bool wrote_features_ok;

        /* PCI configuration */
        union {
                struct pci_config config;
                u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
        };

        /* Features we offer, and those accepted. */
        u64 features, features_accepted;

        /* Device-specific config hangs off the end of this. */
        struct virtio_pci_mmio *mmio;

        /* PCI MMIO resources (all in BAR0) */
        size_t mmio_size;
        u32 mmio_addr;

        /* Device-specific data. */
        void *priv;
};

/* The virtqueue structure describes a queue attached to a device. */
struct virtqueue {
        struct virtqueue *next;

        /* Which device owns me. */
        struct device *dev;

        /* Name for printing errors. */
        const char *name;

        /* The actual ring of buffers. */
        struct vring vring;

        /* The information about this virtqueue (we only use queue_size). */
        struct virtio_pci_common_cfg pci_config;

        /* Last available index we saw. */
        u16 last_avail_idx;

        /* How many are used since we sent last irq? */
        unsigned int pending_used;

        /* Eventfd where Guest notifications arrive. */
        int eventfd;

        /* Function for the thread which is servicing this virtqueue. */
        void (*service)(struct virtqueue *vq);
        pid_t thread;
};

/* Remember the arguments to the program so we can "reboot" */
static char **main_args;

/* The original tty settings to restore on exit. */
static struct termios orig_term;
/*
 * We have to be careful with barriers: our devices are all run in separate
 * threads and so we need to make sure that changes visible to the Guest happen
 * in precise order.
 */
#define wmb() __asm__ __volatile__("" : : : "memory")
#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")

/* Wrapper for the last available index. Makes it easier to change. */
#define lg_last_avail(vq) ((vq)->last_avail_idx)

/*
 * The virtio configuration space is defined to be little-endian. x86 is
 * little-endian too, but it's nice to be explicit so we have these helpers.
 */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)

/*
 * A real device would ignore weird/non-compliant driver behaviour. We
 * stop and flag it, to help debugging Linux problems.
 */
#define bad_driver(d, fmt, ...) \
        errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
#define bad_driver_vq(vq, fmt, ...) \
        errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
             (vq)->name, ## __VA_ARGS__)

/* Is this iovec empty? */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
        unsigned int i;

        for (i = 0; i < num_iov; i++)
                if (iov[i].iov_len)
                        return false;
        return true;
}

/* Take len bytes from the front of this iovec. */
static void iov_consume(struct device *d,
                        struct iovec iov[], unsigned num_iov,
                        void *dest, unsigned len)
{
        unsigned int i;

        for (i = 0; i < num_iov; i++) {
                unsigned int used;

                used = iov[i].iov_len < len ? iov[i].iov_len : len;
                if (dest) {
                        memcpy(dest, iov[i].iov_base, used);
                        dest += used;
                }
                iov[i].iov_base += used;
                iov[i].iov_len -= used;
                len -= used;
        }
        if (len != 0)
                bad_driver(d, "iovec too short!");
}

/*L:100
 * The Launcher code itself takes us out into userspace, that scary place where
 * pointers run wild and free! Unfortunately, like most userspace programs,
 * it's quite boring (which is why everyone likes to hack on the kernel!).
 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
 * you through this section. Or, maybe not.
 *
 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
 * memory and stores it in "guest_base". In other words, Guest physical ==
 * Launcher virtual with an offset.
 *
 * This can be tough to get your head around, but usually it just means that we
 * use these trivial conversion functions when the Guest gives us its
 * "physical" addresses:
 */
static void *from_guest_phys(unsigned long addr)
{
        return guest_base + addr;
}

static unsigned long to_guest_phys(const void *addr)
{
        return (addr - guest_base);
}
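
/*
 * A quick illustration (not used anywhere, just a sketch): since Guest
 * physical is Launcher virtual with an offset, the two helpers above
 * invert each other exactly:
 *
 *	void *p = from_guest_phys(0x100000);	// guest_base + 1M
 *	assert(to_guest_phys(p) == 0x100000);	// back where we started
 */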
/*L:130
 * Loading the Kernel.
 *
 * We start with a couple of simple helper routines. open_or_die() avoids
 * error-checking code cluttering the callers:
 */
static int open_or_die(const char *name, int flags)
{
        int fd = open(name, flags);
        if (fd < 0)
                err(1, "Failed to open %s", name);
        return fd;
}

/* map_zeroed_pages() takes a number of pages. */
static void *map_zeroed_pages(unsigned int num)
{
        int fd = open_or_die("/dev/zero", O_RDONLY);
        void *addr;

        /*
         * We use a private mapping (ie. if we write to the page, it will be
         * copied). We allocate an extra two pages PROT_NONE to act as guard
         * pages against read/write attempts that exceed allocated space.
         */
        addr = mmap(NULL, getpagesize() * (num+2),
                    PROT_NONE, MAP_PRIVATE, fd, 0);

        if (addr == MAP_FAILED)
                err(1, "Mmapping %u pages of /dev/zero", num);

        if (mprotect(addr + getpagesize(), getpagesize() * num,
                     PROT_READ|PROT_WRITE) == -1)
                err(1, "mprotect rw %u pages failed", num);

        /*
         * One neat mmap feature is that you can close the fd, and it
         * stays mapped.
         */
        close(fd);

        /* Return address after PROT_NONE page */
        return addr + getpagesize();
}

/* Get some bytes which won't be mapped into the guest. */
static unsigned long get_mmio_region(size_t size)
{
        unsigned long addr = guest_mmio;
        size_t i;

        if (!size)
                return addr;

        /* Size has to be a power of 2 (and multiple of 16) */
        for (i = 1; i < size; i <<= 1);

        guest_mmio += i;

        return addr;
}
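
/*
 * A worked example of that rounding: get_mmio_region(0x30) walks i through
 * 1, 2, 4, ..., 0x40, so the region is rounded up to 0x40 bytes:
 *
 *	unsigned long a = get_mmio_region(0x30);
 *	unsigned long b = get_mmio_region(0x30);
 *	assert(b == a + 0x40);
 */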
/*
 * This routine is used to load the kernel or initrd. It tries mmap, but if
 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
 * it falls back to reading the memory in.
 */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
        ssize_t r;

        /*
         * We map writable even though some segments are marked read-only.
         * The kernel really wants to be writable: it patches its own
         * instructions.
         *
         * MAP_PRIVATE means that the page won't be copied until a write is
         * done to it. This allows us to share untouched memory between
         * Guests.
         */
        if (mmap(addr, len, PROT_READ|PROT_WRITE,
                 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
                return;

        /* pread does a seek and a read in one shot: saves a few lines. */
        r = pread(fd, addr, len, offset);
        if (r != len)
                err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
}

/*
 * This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory. ELF = Executable and Linkable Format, which is the format
 * used by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
 * address. We use the physical address; the Guest will map itself to the
 * virtual address.
 *
 * We return the starting address.
 */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
        Elf32_Phdr phdr[ehdr->e_phnum];
        unsigned int i;

        /*
         * Sanity checks on the main ELF header: an x86 executable with a
         * reasonable number of correctly-sized program headers.
         */
        if (ehdr->e_type != ET_EXEC
            || ehdr->e_machine != EM_386
            || ehdr->e_phentsize != sizeof(Elf32_Phdr)
            || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
                errx(1, "Malformed elf header");

        /*
         * An ELF executable contains an ELF header and a number of "program"
         * headers which indicate which parts ("segments") of the program to
         * load where.
         */

        /* We read in all the program headers at once: */
        if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
                err(1, "Seeking to program headers");
        if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
                err(1, "Reading program headers");

        /*
         * Try all the headers: there are usually only three. A read-only one,
         * a read-write one, and a "note" section which we don't load.
         */
        for (i = 0; i < ehdr->e_phnum; i++) {
                /* If this isn't a loadable segment, we ignore it */
                if (phdr[i].p_type != PT_LOAD)
                        continue;

                verbose("Section %i: size %i addr %p\n",
                        i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

                /* We map this section of the file at its physical address. */
                map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
                       phdr[i].p_offset, phdr[i].p_filesz);
        }

        /* The entry point is given in the ELF header. */
        return ehdr->e_entry;
}

/*L:150
 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
 * to jump into it and it will unpack itself. We used to have to perform some
 * hairy magic because the unpacking code scared me.
 *
 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
 * a small patch to jump over the tricky bits in the Guest, so now we just read
 * the funky header so we know where in the file to load, and away we go!
 */
static unsigned long load_bzimage(int fd)
{
        struct boot_params boot;
        int r;
        /* Modern bzImages get loaded at 1M. */
        void *p = from_guest_phys(0x100000);

        /*
         * Go back to the start of the file and read the header. It should be
         * a Linux boot header (see Documentation/x86/boot.txt)
         */
        lseek(fd, 0, SEEK_SET);
        read(fd, &boot, sizeof(boot));

        /* Inside the setup_hdr, we expect the magic "HdrS" */
        if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
                errx(1, "This doesn't look like a bzImage to me");

        /* Skip over the extra sectors of the header. */
        lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);

        /* Now read everything into memory, in nice big chunks. */
        while ((r = read(fd, p, 65536)) > 0)
                p += r;

        /* Finally, code32_start tells us where to enter the kernel. */
        return boot.hdr.code32_start;
}

/*L:140
 * Loading the kernel is easy when it's a "vmlinux", but most kernels
 * come wrapped up in the self-decompressing "bzImage" format. With a little
 * work, we can load those, too.
 */
static unsigned long load_kernel(int fd)
{
        Elf32_Ehdr hdr;

        /* Read in the first few bytes. */
        if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
                err(1, "Reading kernel");

        /* If it's an ELF file, it starts with "\177ELF" */
        if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
                return map_elf(fd, &hdr);

        /* Otherwise we assume it's a bzImage, and try to load it. */
        return load_bzimage(fd);
}

/*
 * This is a trivial little helper to align pages. Andi Kleen hated it because
 * it calls getpagesize() twice: "it's dumb code."
 *
 * Kernel guys get really het up about optimization, even when it's not
 * necessary. I leave this code as a reaction against that.
 */
static inline unsigned long page_align(unsigned long addr)
{
        /* Add upwards and truncate downwards. */
        return ((addr + getpagesize()-1) & ~(getpagesize()-1));
}
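
/*
 * A worked example, assuming the usual 4096-byte x86 page size:
 *
 *	page_align(0x1234) == 0x2000	// rounded up to the next page
 *	page_align(0x2000) == 0x2000	// already aligned, unchanged
 */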
/*L:180
 * An "initial ram disk" is a disk image loaded into memory along with the
 * kernel which the kernel can use to boot from without needing any drivers.
 * Most distributions now use this as standard: the initrd contains the code to
 * load the appropriate driver modules for the current machine.
 *
 * Importantly, James Morris works for Red Hat, and Fedora uses initrds for its
 * kernels. He sent me this (and tells me when I break it).
 */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
        int ifd;
        struct stat st;
        unsigned long len;

        ifd = open_or_die(name, O_RDONLY);
        /* fstat() is needed to get the file size. */
        if (fstat(ifd, &st) < 0)
                err(1, "fstat() on initrd '%s'", name);

        /*
         * We map the initrd at the top of memory, but mmap wants it to be
         * page-aligned, so we round the size up for that.
         */
        len = page_align(st.st_size);
        map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
        /*
         * Once a file is mapped, you can close the file descriptor. It's a
         * little odd, but quite useful.
         */
        close(ifd);
        verbose("mapped initrd %s size=%lu @ %p\n",
                name, len, (void *)(mem - len));

        /* We return the initrd size. */
        return len;
}
/*:*/

/*
 * Simple routine to roll all the commandline arguments together with spaces
 * between them.
 */
static void concat(char *dst, char *args[])
{
        unsigned int i, len = 0;

        for (i = 0; args[i]; i++) {
                if (i) {
                        strcat(dst+len, " ");
                        len++;
                }
                strcpy(dst+len, args[i]);
                len += strlen(args[i]);
        }
        /* In case it's empty. */
        dst[len] = '\0';
}

/*L:185
 * This is where we actually tell the kernel to initialize the Guest. We
 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
 * the base of Guest "physical" memory, the top physical page to allow and the
 * entry point for the Guest.
 */
static void tell_kernel(unsigned long start)
{
        unsigned long args[] = { LHREQ_INITIALIZE,
                                 (unsigned long)guest_base,
                                 guest_limit / getpagesize(), start,
                                 (guest_mmio+getpagesize()-1) / getpagesize() };
        verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
                guest_base, guest_base + guest_limit,
                guest_limit, guest_mmio);
        lguest_fd = open_or_die("/dev/lguest", O_RDWR);
        if (write(lguest_fd, args, sizeof(args)) < 0)
                err(1, "Writing to /dev/lguest");
}
/*:*/

/*L:200
 * Device Handling.
 *
 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
 * We need to make sure it's not trying to reach into the Launcher itself, so
 * we have a convenient routine which checks it and exits with an error message
 * if something funny is going on:
 */
static void *_check_pointer(struct device *d,
                            unsigned long addr, unsigned int size,
                            unsigned int line)
{
        /*
         * Check if the requested address and size exceed the allocated
         * memory, or if addr + size wraps around.
         */
        if ((addr + size) > guest_limit || (addr + size) < addr)
                bad_driver(d, "%s:%i: Invalid address %#lx",
                           __FILE__, line, addr);
        /*
         * We return a pointer for the caller's convenience, now we know it's
         * safe to use.
         */
        return from_guest_phys(addr);
}
/* A macro which transparently hands the line number to the real function. */
#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
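
/*
 * A worked example of why we also test "(addr + size) < addr": on a 32-bit
 * Launcher, a malicious addr of 0xFFFFFFFC with size 8 wraps around:
 * 0xFFFFFFFC + 8 == 0x4 (mod 2^32), which is below guest_limit and would
 * sneak past the first test on its own.  The wrap test catches it, since
 * 0x4 < 0xFFFFFFFC.
 */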
/*
 * Each buffer in the virtqueues is actually a chain of descriptors. This
 * function returns the next descriptor in the chain, or vq->vring.num if we're
 * at the end.
 */
static unsigned next_desc(struct device *d, struct vring_desc *desc,
                          unsigned int i, unsigned int max)
{
        unsigned int next;

        /* If this descriptor says it doesn't chain, we're done. */
        if (!(desc[i].flags & VRING_DESC_F_NEXT))
                return max;

        /* Check they're not leading us off the end of the descriptors. */
        next = desc[i].next;
        /* Make sure compiler knows to grab that: we don't want it changing! */
        wmb();

        if (next >= max)
                bad_driver(d, "Desc next is %u", next);

        return next;
}
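
/*
 * A sketch of how a caller walks a chain with next_desc() (the real loop is
 * in wait_for_vq_desc() below): start at some head index and keep going
 * until it hands back max:
 *
 *	i = head;
 *	do {
 *		... look at desc[i] ...
 *	} while ((i = next_desc(d, desc, i, max)) != max);
 */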
/*
 * This actually sends the interrupt for this virtqueue, if we've used a
 * buffer.
 */
static void trigger_irq(struct virtqueue *vq)
{
        unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };

        /* Don't inform them if nothing used. */
        if (!vq->pending_used)
                return;
        vq->pending_used = 0;

        /*
         * 2.4.7.1:
         *
         *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
         *    The driver MUST set flags to 0 or 1.
         */
        if (vq->vring.avail->flags > 1)
                bad_driver_vq(vq, "avail->flags = %u\n",
                              vq->vring.avail->flags);

        /*
         * 2.4.7.2:
         *
         *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
         *
         *   - The device MUST ignore the used_event value.
         *   - After the device writes a descriptor index into the used ring:
         *       - If flags is 1, the device SHOULD NOT send an interrupt.
         *       - If flags is 0, the device MUST send an interrupt.
         */
        if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
                return;
        }

        /*
         * 4.1.4.5.1:
         *
         *  If MSI-X capability is disabled, the device MUST set the Queue
         *  Interrupt bit in ISR status before sending a virtqueue
         *  notification to the driver.
         */
        vq->dev->mmio->isr = 0x1;

        /* Send the Guest an interrupt to tell them we used something up. */
        if (write(lguest_fd, buf, sizeof(buf)) != 0)
                err(1, "Triggering irq %i", vq->dev->config.irq_line);
}

/*
 * This looks in the virtqueue for the first available buffer, and converts
 * it to an iovec for convenient access. Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
 * This function waits if necessary, and returns the descriptor number found.
 */
static unsigned wait_for_vq_desc(struct virtqueue *vq,
                                 struct iovec iov[],
                                 unsigned int *out_num, unsigned int *in_num)
{
        unsigned int i, head, max;
        struct vring_desc *desc;
        u16 last_avail = lg_last_avail(vq);

        /*
         * 2.4.7.1:
         *
         *   The driver MUST handle spurious interrupts from the device.
         *
         * That's why this is a while loop.
         */

        /* There's nothing available? */
        while (last_avail == vq->vring.avail->idx) {
                u64 event;

                /*
                 * Since we're about to sleep, now is a good time to tell the
                 * Guest about what we've used up to now.
                 */
                trigger_irq(vq);

                /* OK, now we need to know about added descriptors. */
                vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;

                /*
                 * They could have slipped one in as we were doing that: make
                 * sure it's written, then check again.
                 */
                mb();
                if (last_avail != vq->vring.avail->idx) {
                        vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
                        break;
                }

                /* Nothing new? Wait for eventfd to tell us they refilled. */
                if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
                        errx(1, "Event read failed?");

                /* We don't need to be notified again. */
                vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
        }

        /* Check it isn't doing very strange things with descriptor numbers. */
        if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
                bad_driver_vq(vq, "Guest moved used index from %u to %u",
                              last_avail, vq->vring.avail->idx);

        /*
         * Make sure we read the descriptor number *after* we read the ring
         * update; don't let the cpu or compiler change the order.
         */
        rmb();

        /*
         * Grab the next descriptor number they're advertising, and increment
         * the index we've seen.
         */
        head = vq->vring.avail->ring[last_avail % vq->vring.num];
        lg_last_avail(vq)++;

        /* If their number is silly, that's a fatal mistake. */
        if (head >= vq->vring.num)
                bad_driver_vq(vq, "Guest says index %u is available", head);

        /* When we start there are neither input nor output descriptors. */
        *out_num = *in_num = 0;

        max = vq->vring.num;
        desc = vq->vring.desc;
        i = head;

        /*
         * We have to read the descriptor after we read the descriptor number,
         * but there's a data dependency there so the CPU shouldn't reorder
         * that: no rmb() required.
         */

        do {
                /*
                 * If this is an indirect entry, then this buffer contains a
                 * descriptor table which we handle as if it's any normal
                 * descriptor chain.
                 */
                if (desc[i].flags & VRING_DESC_F_INDIRECT) {
                        /*
                         * 2.4.5.3.1:
                         *
                         *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
                         *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
                         *  negotiated.
                         */
                        if (!(vq->dev->features_accepted &
                              (1<<VIRTIO_RING_F_INDIRECT_DESC)))
                                bad_driver_vq(vq, "vq indirect not negotiated");

                        /*
                         * 2.4.5.3.1:
                         *
                         *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
                         *  flag within an indirect descriptor (ie. only one
                         *  table per descriptor).
                         */
                        if (desc != vq->vring.desc)
                                bad_driver_vq(vq, "Indirect within indirect");

                        /*
                         * Proposed update VIRTIO-134 spells this out:
                         *
                         *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
                         *   and VIRTQ_DESC_F_NEXT in flags.
                         */
                        if (desc[i].flags & VRING_DESC_F_NEXT)
                                bad_driver_vq(vq, "indirect and next together");

                        if (desc[i].len % sizeof(struct vring_desc))
                                bad_driver_vq(vq,
                                        "Invalid size for indirect table");
                        /*
                         * 2.4.5.3.2:
                         *
                         *  The device MUST ignore the write-only flag
                         *  (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
                         *  refers to an indirect table.
                         *
                         * We ignore it here: :)
                         */

                        max = desc[i].len / sizeof(struct vring_desc);
                        desc = check_pointer(vq->dev, desc[i].addr,
                                             desc[i].len);
                        i = 0;

                        /*
                         * 2.4.5.3.1:
                         *
                         *  A driver MUST NOT create a descriptor chain longer
                         *  than the Queue Size of the device.
                         */
                        if (max > vq->pci_config.queue_size)
                                bad_driver_vq(vq,
                                        "indirect has too many entries");
                }

                /* Grab the first descriptor, and check it's OK. */
                iov[*out_num + *in_num].iov_len = desc[i].len;
                iov[*out_num + *in_num].iov_base
                        = check_pointer(vq->dev, desc[i].addr, desc[i].len);
                /* If this is an input descriptor, increment that count. */
                if (desc[i].flags & VRING_DESC_F_WRITE)
                        (*in_num)++;
                else {
                        /*
                         * If it's an output descriptor, they're all supposed
                         * to come before any input descriptors.
                         */
                        if (*in_num)
                                bad_driver_vq(vq,
                                        "Descriptor has out after in");
                        (*out_num)++;
                }

                /* If we've got too many, that implies a descriptor loop. */
                if (*out_num + *in_num > max)
                        bad_driver_vq(vq, "Looped descriptor");
        } while ((i = next_desc(vq->dev, desc, i, max)) != max);

        return head;
}

/*
 * After we've used one of their buffers, we tell the Guest about it. Sometime
 * later we'll want to send them an interrupt using trigger_irq(); note that
 * wait_for_vq_desc() does that for us if it has to wait.
 */
static void add_used(struct virtqueue *vq, unsigned int head, int len)
{
        struct vring_used_elem *used;

        /*
         * The virtqueue contains a ring of used buffers. Get a pointer to the
         * next entry in that used ring.
         */
        used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
        used->id = head;
        used->len = len;
        /* Make sure buffer is written before we update index. */
        wmb();
        vq->vring.used->idx++;
        vq->pending_used++;
}

/* And here's the combo meal deal. Supersize me! */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
        add_used(vq, head, len);
        trigger_irq(vq);
}
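
/*
 * A minimal sketch of a service routine built from the helpers above (the
 * names example_service and bytes_written are made up; the real console and
 * network routines below follow exactly this shape):
 *
 *	static void example_service(struct virtqueue *vq)
 *	{
 *		unsigned int head, out, in;
 *		struct iovec iov[vq->vring.num];
 *
 *		head = wait_for_vq_desc(vq, iov, &out, &in);
 *		// ... read from iov[0..out), write into iov[out..out+in) ...
 *		add_used_and_trigger(vq, head, bytes_written);
 *	}
 */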
/*
 * The Console
 *
 * We associate some data with the console for our exit hack.
 */
struct console_abort {
        /* How many times have they hit ^C? */
        int count;
        /* When did they start? */
        struct timeval start;
};

/* This is the routine which handles console input (ie. stdin). */
static void console_input(struct virtqueue *vq)
{
        int len;
        unsigned int head, in_num, out_num;
        struct console_abort *abort = vq->dev->priv;
        struct iovec iov[vq->vring.num];

        /* Make sure there's a descriptor available. */
        head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
        if (out_num)
                bad_driver_vq(vq, "Output buffers in console in queue?");

        /* Read into it. This is where we usually wait. */
        len = readv(STDIN_FILENO, iov, in_num);
        if (len <= 0) {
                /* Ran out of input? */
                warnx("Failed to get console input, ignoring console.");
                /*
                 * For simplicity, dying threads kill the whole Launcher. So
                 * just nap here.
                 */
                for (;;)
                        pause();
        }

        /* Tell the Guest we used a buffer. */
        add_used_and_trigger(vq, head, len);

        /*
         * Three ^C within one second? Exit.
         *
         * This is such a hack, but works surprisingly well. Each ^C has to
         * be in a buffer by itself, so they can't be too fast. But we check
         * that we get three within about a second, so they can't be too
         * slow.
         */
        if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
                abort->count = 0;
                return;
        }

        abort->count++;
        if (abort->count == 1)
                gettimeofday(&abort->start, NULL);
        else if (abort->count == 3) {
                struct timeval now;
                gettimeofday(&now, NULL);
                /* Kill all Launcher processes with SIGINT, like normal ^C */
                if (now.tv_sec <= abort->start.tv_sec+1)
                        kill(0, SIGINT);
                abort->count = 0;
        }
}

/* This is the routine which handles console output (ie. stdout). */
static void console_output(struct virtqueue *vq)
{
        unsigned int head, out, in;
        struct iovec iov[vq->vring.num];

        /* We usually wait in here, for the Guest to give us something. */
        head = wait_for_vq_desc(vq, iov, &out, &in);
        if (in)
                bad_driver_vq(vq, "Input buffers in console output queue?");

        /* writev can return a partial write, so we loop here. */
        while (!iov_empty(iov, out)) {
                int len = writev(STDOUT_FILENO, iov, out);
                if (len <= 0) {
                        warn("Write to stdout gave %i (%d)", len, errno);
                        break;
                }
                iov_consume(vq->dev, iov, out, NULL, len);
        }

        /*
         * We're finished with that buffer: if we're going to sleep,
         * wait_for_vq_desc() will prod the Guest with an interrupt.
         */
        add_used(vq, head, 0);
}
/*
 * The Network
 *
 * Handling output for network is also simple: we get all the output buffers
 * and write them to /dev/net/tun.
 */
struct net_info {
        int tunfd;
};

static void net_output(struct virtqueue *vq)
{
        struct net_info *net_info = vq->dev->priv;
        unsigned int head, out, in;
        struct iovec iov[vq->vring.num];

        /* We usually wait in here for the Guest to give us a packet. */
        head = wait_for_vq_desc(vq, iov, &out, &in);
        if (in)
                bad_driver_vq(vq, "Input buffers in net output queue?");
        /*
         * Send the whole thing through to /dev/net/tun. It expects the exact
         * same format: what a coincidence!
         */
        if (writev(net_info->tunfd, iov, out) < 0)
                warnx("Write to tun failed (%d)?", errno);

        /*
         * Done with that one; wait_for_vq_desc() will send the interrupt if
         * all packets are processed.
         */
        add_used(vq, head, 0);
}

/*
 * Handling network input is a bit trickier, because I've tried to optimize it.
 *
 * First we have a helper routine which tells us if reading from this file
 * descriptor (ie. the /dev/net/tun device) will block:
 */
static bool will_block(int fd)
{
        fd_set fdset;
        struct timeval zero = { 0, 0 };
        FD_ZERO(&fdset);
        FD_SET(fd, &fdset);
        return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
}

/*
 * This handles packets coming in from the tun device to our Guest. Like all
 * service routines, it gets called again as soon as it returns, so you don't
 * see a while(1) loop here.
 */
static void net_input(struct virtqueue *vq)
{
        int len;
        unsigned int head, out, in;
        struct iovec iov[vq->vring.num];
        struct net_info *net_info = vq->dev->priv;

        /*
         * Get a descriptor to write an incoming packet into. This will also
         * send an interrupt if they're out of descriptors.
         */
        head = wait_for_vq_desc(vq, iov, &out, &in);
        if (out)
                bad_driver_vq(vq, "Output buffers in net input queue?");

        /*
         * If it looks like we'll block reading from the tun device, send them
         * an interrupt.
         */
        if (vq->pending_used && will_block(net_info->tunfd))
                trigger_irq(vq);

        /*
         * Read in the packet. This is where we normally wait (when there's no
         * incoming network traffic).
         */
        len = readv(net_info->tunfd, iov, in);
        if (len <= 0)
                warn("Failed to read from tun (%d).", errno);

        /*
         * Mark that packet buffer as used, but don't interrupt here. We want
         * to wait until we've done as much work as we can.
         */
        add_used(vq, head, len);
}
/*:*/
/* This is the helper to create threads: run the service routine in a loop. */
static int do_thread(void *_vq)
{
        struct virtqueue *vq = _vq;

        for (;;)
                vq->service(vq);
        return 0;
}

/*
 * When a child dies, we kill our entire process group with SIGTERM. This
 * also has the side effect that the shell restores the console for us!
 */
static void kill_launcher(int signal)
{
        kill(0, SIGTERM);
}

static void reset_vq_pci_config(struct virtqueue *vq)
{
        vq->pci_config.queue_size = VIRTQUEUE_NUM;
        vq->pci_config.queue_enable = 0;
}

static void reset_device(struct device *dev)
{
        struct virtqueue *vq;

        verbose("Resetting device %s\n", dev->name);

        /* Clear any features they've acked. */
        dev->features_accepted = 0;

        /* We're going to be explicitly killing threads, so ignore them. */
        signal(SIGCHLD, SIG_IGN);

        /*
         * 4.1.4.3.1:
         *
         *   The device MUST present a 0 in queue_enable on reset.
         *
         * This means we set it here, and reset the saved ones in every vq.
         */
        dev->mmio->cfg.queue_enable = 0;

        /* Get rid of the virtqueue threads */
        for (vq = dev->vq; vq; vq = vq->next) {
                vq->last_avail_idx = 0;
                reset_vq_pci_config(vq);
                if (vq->thread != (pid_t)-1) {
                        kill(vq->thread, SIGTERM);
                        waitpid(vq->thread, NULL, 0);
                        vq->thread = (pid_t)-1;
                }
        }
        dev->running = false;
        dev->wrote_features_ok = false;

        /* Now we care if threads die. */
        signal(SIGCHLD, (void *)kill_launcher);
}

static void cleanup_devices(void)
{
        unsigned int i;

        for (i = 1; i < MAX_PCI_DEVICES; i++) {
                struct device *d = devices.pci[i];
                if (!d)
                        continue;
                reset_device(d);
        }

        /* If we saved off the original terminal settings, restore them now. */
        if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
                tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}

/*L:217
 * We do PCI. This is mainly done to let us test the kernel virtio PCI
 * code.
 */

/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
static struct device pci_host_bridge;

static void init_pci_host_bridge(void)
{
        pci_host_bridge.name = "PCI Host Bridge";
        pci_host_bridge.config.class = 0x06; /* bridge */
        pci_host_bridge.config.subclass = 0; /* host bridge */
        devices.pci[0] = &pci_host_bridge;
}

/* The IO ports used to read the PCI config space. */
#define PCI_CONFIG_ADDR 0xCF8
#define PCI_CONFIG_DATA 0xCFC

/*
 * Not really portable, but does help readability: this is what the Guest
 * writes to the PCI_CONFIG_ADDR IO port.
 */
union pci_config_addr {
        struct {
                unsigned mbz: 2;
                unsigned offset: 6;
                unsigned funcnum: 3;
                unsigned devnum: 5;
                unsigned busnum: 8;
                unsigned reserved: 7;
                unsigned enabled : 1;
        } bits;
        u32 val;
};

/*
 * We cache what they wrote to the address port, so we know what they're
 * talking about when they access the data port.
 */
static union pci_config_addr pci_config_addr;

static struct device *find_pci_device(unsigned int index)
{
        return devices.pci[index];
}
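
/*
 * A worked example of that encoding: the classic probe value 0x80000000
 * has just the enable bit set (bus 0, device 0, function 0, offset 0).
 * Writing 0x80001008 instead means: enabled, bus 0, device 2, function 0,
 * config word 2 (ie. byte offset 8: the revid/prog_if/subclass/class word).
 */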
/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
static void ioread(u16 off, u32 v, u32 mask, u32 *val)
{
        assert(off < 4);
        assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
        *val = (v >> (off * 8)) & mask;
}

/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
{
        assert(off < 4);
        assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
        *dst &= ~(mask << (off * 8));
        *dst |= (v & mask) << (off * 8);
}
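
/*
 * A worked example of the byte lanes: a one-byte write of 0xAB at offset 2
 * within a config word only replaces bits 23:16, and the matching read gets
 * it back:
 *
 *	u32 word = 0x11223344, v;
 *	iowrite(2, 0xAB, 0xFF, &word);	// word is now 0x11AB3344
 *	ioread(2, word, 0xFF, &v);	// v == 0xAB
 */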
/*
 * Which device and register a PCI_CONFIG_DATA access refers to depends on
 * the previous write to PCI_CONFIG_ADDR.
 */
static struct device *dev_and_reg(u32 *reg)
{
        if (!pci_config_addr.bits.enabled)
                return NULL;

        if (pci_config_addr.bits.funcnum != 0)
                return NULL;

        if (pci_config_addr.bits.busnum != 0)
                return NULL;

        if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
                return NULL;

        *reg = pci_config_addr.bits.offset;
        return find_pci_device(pci_config_addr.bits.devnum);
}

/*
 * We can get invalid combinations of values while they're writing, so we
 * only fault if they try to write with some invalid bar/offset/length.
 */
static bool valid_bar_access(struct device *d,
                             struct virtio_pci_cfg_cap *cfg_access)
{
        /* We only have 1 bar (BAR0) */
        if (cfg_access->cap.bar != 0)
                return false;

        /* Check it's within BAR0. */
        if (cfg_access->cap.offset >= d->mmio_size
            || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
                return false;

        /* Check length is 1, 2 or 4. */
        if (cfg_access->cap.length != 1
            && cfg_access->cap.length != 2
            && cfg_access->cap.length != 4)
                return false;

        /*
         * 4.1.4.7.2:
         *
         *  The driver MUST NOT write a cap.offset which is not a multiple of
         *  cap.length (ie. all accesses MUST be aligned).
         */
        if (cfg_access->cap.offset % cfg_access->cap.length != 0)
                return false;

        /* All the checks passed: the access is valid. */
        return true;
}

/* Is this accessing the PCI config address port? */
static bool is_pci_addr_port(u16 port)
{
        return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
}

static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
{
        iowrite(port - PCI_CONFIG_ADDR, val, mask,
                &pci_config_addr.val);
        verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
                pci_config_addr.bits.enabled ? "" : " DISABLED",
                val, mask,
                pci_config_addr.bits.busnum,
                pci_config_addr.bits.devnum,
                pci_config_addr.bits.funcnum,
                pci_config_addr.bits.offset);
        return true;
}

static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
{
        ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
}

/* Is this accessing the PCI config data port? */
static bool is_pci_data_port(u16 port)
{
        return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
}

static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);

static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
{
        u32 reg, portoff;
        struct device *d = dev_and_reg(&reg);

        /* Complain if they don't belong to a device. */
        if (!d)
                return false;

        /* They can do 1 byte writes, etc. */
        portoff = port - PCI_CONFIG_DATA;

        /*
         * PCI uses a weird way to determine the BAR size: the OS
         * writes all 1's, and sees which ones stick.
         */
        if (&d->config_words[reg] == &d->config.bar[0]) {
                int i;

                iowrite(portoff, val, mask, &d->config.bar[0]);
                for (i = 0; (1 << i) < d->mmio_size; i++)
                        d->config.bar[0] &= ~(1 << i);
                return true;
        } else if ((&d->config_words[reg] > &d->config.bar[0]
                    && &d->config_words[reg] <= &d->config.bar[6])
                   || &d->config_words[reg] == &d->config.expansion_rom_addr) {
                /* Allow writing to any other BAR, or expansion ROM */
                iowrite(portoff, val, mask, &d->config_words[reg]);
                return true;
                /* We let them override latency timer and cacheline size */
        } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
                /* Only let them change the first two fields. */
                if (mask == 0xFFFFFFFF)
                        mask = 0xFFFF;
                iowrite(portoff, val, mask, &d->config_words[reg]);
                return true;
        } else if (&d->config_words[reg] == (void *)&d->config.command
                   && mask == 0xFFFF) {
                /* Ignore command writes. */
                return true;
        } else if (&d->config_words[reg]
                   == (void *)&d->config.cfg_access.cap.bar
                   || &d->config_words[reg]
                   == &d->config.cfg_access.cap.length
                   || &d->config_words[reg]
                   == &d->config.cfg_access.cap.offset) {

                /*
                 * The VIRTIO_PCI_CAP_PCI_CFG capability
                 * provides a backdoor to access the MMIO
                 * regions without mapping them. Weird, but
                 * useful.
                 */
                iowrite(portoff, val, mask, &d->config_words[reg]);
                return true;
        } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
                u32 write_mask;

                /*
                 * 4.1.4.7.1:
                 *
                 *  Upon detecting driver write access to pci_cfg_data, the
                 *  device MUST execute a write access at offset cap.offset at
                 *  BAR selected by cap.bar using the first cap.length bytes
                 *  from pci_cfg_data.
                 */

                /* Must be bar 0 */
                if (!valid_bar_access(d, &d->config.cfg_access))
                        return false;

                iowrite(portoff, val, mask,
                        &d->config.cfg_access.pci_cfg_data);

                /*
                 * Now emulate a write. The mask we use is set by
                 * len, *not* this write!
                 */
                write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
                verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
                        d->config.cfg_access.pci_cfg_data, write_mask,
                        d->config.cfg_access.cap.bar,
                        d->config.cfg_access.cap.offset,
                        d->config.cfg_access.cap.length);

                emulate_mmio_write(d, d->config.cfg_access.cap.offset,
                                   d->config.cfg_access.pci_cfg_data,
                                   write_mask);
                return true;
        }

        /*
         * 4.1.4.1:
         *
         *  The driver MUST NOT write into any field of the capability
         *  structure, with the exception of those with cap_type
         *  VIRTIO_PCI_CAP_PCI_CFG...
         */
        return false;
}
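
/*
 * A worked example of the BAR sizing dance above: with a 4096-byte BAR0,
 * the Guest writes 0xFFFFFFFF to the BAR and we clear every bit below the
 * size, so it reads back 0xFFFFF000. The Guest decodes "size 4096" from
 * the low zero bits, then writes back the real address it chose.
 */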
static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);

static void pci_data_ioread(u16 port, u32 mask, u32 *val)
{
        u32 reg;
        struct device *d = dev_and_reg(&reg);

        if (!d)
                return;

        /* Read through the PCI MMIO access window is special */
        if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
                u32 read_mask;

                /*
                 * 4.1.4.7.1:
                 *
                 *  Upon detecting driver read access to pci_cfg_data, the
                 *  device MUST execute a read access of length cap.length at
                 *  offset cap.offset at BAR selected by cap.bar and store the
                 *  first cap.length bytes in pci_cfg_data.
                 */
                /* Must be bar 0 */
                if (!valid_bar_access(d, &d->config.cfg_access))
                        bad_driver(d,
                                "Invalid cfg_access to bar%u, offset %u len %u",
                                d->config.cfg_access.cap.bar,
                                d->config.cfg_access.cap.offset,
                                d->config.cfg_access.cap.length);

                /*
                 * Read into the window. The mask we use is set by
                 * len, *not* this read!
                 */
                read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
                d->config.cfg_access.pci_cfg_data
                        = emulate_mmio_read(d,
                                            d->config.cfg_access.cap.offset,
                                            read_mask);
                verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
                        d->config.cfg_access.pci_cfg_data, read_mask,
                        d->config.cfg_access.cap.bar,
                        d->config.cfg_access.cap.offset,
                        d->config.cfg_access.cap.length);
        }
        ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
}

/*L:216
 * This is where we emulate a handful of Guest instructions. It's ugly
 * and we used to do it in the kernel but it grew over time.
 */

/*
 * We use the ptrace syscall's pt_regs struct to talk about registers
 * to lguest: these macros convert the names to the offsets.
 */
#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
#define setreg(name, val) \
        setreg_off(offsetof(struct user_regs_struct, name), (val))

static u32 getreg_off(size_t offset)
{
        u32 r;
        unsigned long args[] = { LHREQ_GETREG, offset };

        if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
                err(1, "Getting register %zu", offset);
        if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
                err(1, "Reading register %zu", offset);

        return r;
}

static void setreg_off(size_t offset, u32 val)
{
        unsigned long args[] = { LHREQ_SETREG, offset, val };

        if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
                err(1, "Setting register %zu", offset);
}
/* Get register by instruction encoding */
static u32 getreg_num(unsigned regnum, u32 mask)
{
        /* 8 bit ops use regnums 4-7 for high parts of word */
        if (mask == 0xFF && (regnum & 0x4))
                return getreg_num(regnum & 0x3, 0xFFFF) >> 8;

        switch (regnum) {
        case 0: return getreg(eax) & mask;
        case 1: return getreg(ecx) & mask;
        case 2: return getreg(edx) & mask;
        case 3: return getreg(ebx) & mask;
        case 4: return getreg(esp) & mask;
        case 5: return getreg(ebp) & mask;
        case 6: return getreg(esi) & mask;
        case 7: return getreg(edi) & mask;
        }
        abort();
}

/* Set register by instruction encoding */
static void setreg_num(unsigned regnum, u32 val, u32 mask)
{
        /* Don't try to set bits out of range */
        assert(!(val & ~mask));

        /* 8 bit ops use regnums 4-7 for high parts of word */
        if (mask == 0xFF && (regnum & 0x4)) {
                /* Construct the 16 bits we want. */
                val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
                setreg_num(regnum & 0x3, val, 0xFFFF);
                return;
        }

        switch (regnum) {
        case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
        case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
        case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
        case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
        case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
        case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
        case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
        case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
        }
        abort();
}

/* Get bytes of displacement appended to instruction, from r/m encoding */
static u32 insn_displacement_len(u8 mod_reg_rm)
{
        /* Switch on the mod bits */
        switch (mod_reg_rm >> 6) {
        case 0:
                /*
                 * With mod == 0 and r/m == 101, a four byte (32-bit)
                 * displacement follows in 32-bit addressing mode.
                 */
                if ((mod_reg_rm & 0x7) == 0x5)
                        return 4;
                /* Normally, mod == 0 means no literal displacement */
                return 0;
        case 1:
                /* One byte displacement */
                return 1;
        case 2:
                /* Four byte displacement */
                return 4;
        case 3:
                /* Register mode */
                return 0;
        }
        abort();
}
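
/*
 * A worked example: in "mov 0x4(%ebx),%eax" the mod/reg/rm byte is 0x43:
 * mod = 01 (one byte displacement), reg = 000 (%eax), r/m = 011 (%ebx).
 * So insn_displacement_len(0x43) == 1, and getreg_num(3, 0xFFFFFFFF)
 * fetches %ebx.
 */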
static void emulate_insn(const u8 insn[])
{
        unsigned long args[] = { LHREQ_TRAP, 13 };
        unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
        unsigned int eax, port, mask;
        /*
         * Default is to return all-ones on IO port reads, which traditionally
         * means "there's nothing there".
         */
        u32 val = 0xFFFFFFFF;

        /*
         * This must be the Guest kernel trying to do something, not
         * userspace!  The bottom two bits of the CS segment register are the
         * privilege level.
         */
        if ((getreg(xcs) & 3) != 0x1)
                goto no_emulate;

        /* Decoding x86 instructions is icky. */

        /*
         * Around 2.6.33, the kernel started using an emulation for the
         * cmpxchg8b instruction in early boot on many configurations. This
         * code isn't paravirtualized, and it tries to disable interrupts.
         * Ignore it, which will Mostly Work.
         */
        if (insn[insnlen] == 0xfa) {
                /* "cli", or Clear Interrupt Enable instruction. Skip it. */
                insnlen = 1;
                goto skip_insn;
        }

        /*
         * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
         */
        if (insn[insnlen] == 0x66) {
                small_operand = 1;
                /* The instruction is 1 byte so far, read the next byte. */
                insnlen = 1;
        }

        /* If the lower bit isn't set, it's a single byte access */
        byte_access = !(insn[insnlen] & 1);

        /*
         * Now we can ignore the lower bit and decode the 4 opcodes
         * we need to emulate.
         */
        switch (insn[insnlen] & 0xFE) {
        case 0xE4: /* in <next byte>,%al */
                port = insn[insnlen+1];
                insnlen += 2;
                in = 1;
                break;
        case 0xEC: /* in (%dx),%al */
                port = getreg(edx) & 0xFFFF;
                insnlen += 1;
                in = 1;
                break;
        case 0xE6: /* out %al,<next byte> */
                port = insn[insnlen+1];
                insnlen += 2;
                break;
        case 0xEE: /* out %al,(%dx) */
                port = getreg(edx) & 0xFFFF;
                insnlen += 1;
                break;
        default:
                /* OK, we don't know what this is, can't emulate. */
                goto no_emulate;
        }

        /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
        if (byte_access)
                mask = 0xFF;
        else if (small_operand)
                mask = 0xFFFF;
        else
                mask = 0xFFFFFFFF;

        /*
         * If it was an "IN" instruction, they expect the result to be read
         * into %eax, so we change %eax.
         */
        eax = getreg(eax);

        if (in) {
                /* This is the PS/2 keyboard status; 1 means ready for output */
                if (port == 0x64)
                        val = 1;
                else if (is_pci_addr_port(port))
                        pci_addr_ioread(port, mask, &val);
                else if (is_pci_data_port(port))
                        pci_data_ioread(port, mask, &val);

                /* Clear the bits we're about to read */
                eax &= ~mask;
                /* Copy bits in from val. */
                eax |= val & mask;
                /* Now update the register. */
                setreg(eax, eax);
        } else {
                if (is_pci_addr_port(port)) {
                        if (!pci_addr_iowrite(port, mask, eax))
                                goto bad_io;
                } else if (is_pci_data_port(port)) {
                        if (!pci_data_iowrite(port, mask, eax))
                                goto bad_io;
                }
                /*
                 * There are many other ports, eg. CMOS clock, serial
                 * and parallel ports, so we ignore them all.
                 */
        }

        verbose("IO %s of %x to %u: %#08x\n",
                in ? "IN" : "OUT", mask, port, eax);
skip_insn:
        /* Finally, we've "done" the instruction, so move past it. */
        setreg(eip, getreg(eip) + insnlen);
        return;

bad_io:
        warnx("Attempt to %s port %u (%#x mask)",
              in ? "read from" : "write to", port, mask);

no_emulate:
        /* Inject trap into Guest. */
        if (write(lguest_fd, args, sizeof(args)) < 0)
                err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
}
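
/*
 * A worked example of the decode above: the Guest executes "in $0x64,%al"
 * (bytes 0xE4 0x64). There's no 0x66 prefix, bit 0 of 0xE4 is clear so
 * it's a byte access, and 0xE4 & 0xFE matches the first case: port = 0x64,
 * mask = 0xFF, in = 1. Port 0x64 is the PS/2 status hack, so %al becomes 1
 * and eip advances by insnlen == 2.
 */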
static struct device *find_mmio_region(unsigned long paddr, u32 *off)
{
        unsigned int i;

        for (i = 1; i < MAX_PCI_DEVICES; i++) {
                struct device *d = devices.pci[i];

                if (!d)
                        continue;
                if (paddr < d->mmio_addr)
                        continue;
                if (paddr >= d->mmio_addr + d->mmio_size)
                        continue;
                *off = paddr - d->mmio_addr;
                return d;
        }
        return NULL;
}

/* FIXME: Use vq array. */
static struct virtqueue *vq_by_num(struct device *d, u32 num)
{
        struct virtqueue *vq = d->vq;

        while (num-- && vq)
                vq = vq->next;

        return vq;
}

static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
                           struct virtqueue *vq)
{
        vq->pci_config = *cfg;
}

static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
                              struct virtqueue *vq)
{
        /* Only restore the per-vq part */
        size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);

        memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
               sizeof(*cfg) - off);
}

/*
 * 4.1.4.3.2:
 *
 *  The driver MUST configure the other virtqueue fields before
 *  enabling the virtqueue with queue_enable.
 *
 * When they enable the virtqueue, we check that their setup is valid.
 */
static void check_virtqueue(struct device *d, struct virtqueue *vq)
{
        /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
        if (vq->pci_config.queue_desc_hi
            || vq->pci_config.queue_avail_hi
            || vq->pci_config.queue_used_hi)
                bad_driver_vq(vq, "invalid 64-bit queue address");

        /*
         * 2.4.1:
         *
         *  The driver MUST ensure that the physical address of the first byte
         *  of each virtqueue part is a multiple of the specified alignment
         *  value in the above table.
         */
        if (vq->pci_config.queue_desc_lo % 16
            || vq->pci_config.queue_avail_lo % 2
            || vq->pci_config.queue_used_lo % 4)
                bad_driver_vq(vq, "invalid alignment in queue addresses");

        /* Initialize the virtqueue and check they're all in range. */
        vq->vring.num = vq->pci_config.queue_size;
        vq->vring.desc = check_pointer(vq->dev,
                                       vq->pci_config.queue_desc_lo,
                                       sizeof(*vq->vring.desc) * vq->vring.num);
        vq->vring.avail = check_pointer(vq->dev,
                                        vq->pci_config.queue_avail_lo,
                                        sizeof(*vq->vring.avail)
                                        + (sizeof(vq->vring.avail->ring[0])
                                           * vq->vring.num));
        vq->vring.used = check_pointer(vq->dev,
                                       vq->pci_config.queue_used_lo,
                                       sizeof(*vq->vring.used)
                                       + (sizeof(vq->vring.used->ring[0])
                                          * vq->vring.num));

        /*
         * 2.4.9.1:
         *
         *  The driver MUST initialize flags in the used ring to 0
         *  when allocating the used ring.
         */
        if (vq->vring.used->flags != 0)
                bad_driver_vq(vq, "invalid initial used.flags %#x",
                              vq->vring.used->flags);
}
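
/*
 * A worked example of the sizes involved, assuming our VIRTQUEUE_NUM of 256
 * and 4096-byte pages: the descriptor table is 256 * 16 = 4096 bytes, the
 * avail ring is 4 + 256 * 2 = 516 bytes and the used ring is
 * 4 + 256 * 8 = 2052 bytes.  With the used ring page-aligned that comes to
 * the "3 pages" promised back where VIRTQUEUE_NUM is defined.
 */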
static void start_virtqueue(struct virtqueue *vq)
{
        /*
         * Create stack for thread. Since the stack grows downwards, we
         * hand clone() a pointer to the end of this region.
         */
        char *stack = malloc(32768);

        /* Create a zero-initialized eventfd. */
        vq->eventfd = eventfd(0, 0);
        if (vq->eventfd < 0)
                err(1, "Creating eventfd");

        /*
         * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
         * we get a signal if it dies.
         */
        vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
        if (vq->thread == (pid_t)-1)
                err(1, "Creating clone");
}

static void start_virtqueues(struct device *d)
{
        struct virtqueue *vq;

        for (vq = d->vq; vq; vq = vq->next) {
                if (vq->pci_config.queue_enable)
                        start_virtqueue(vq);
        }
}
static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
{
	struct virtqueue *vq;

	switch (off) {
	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
		/*
		 * 4.1.4.3.1:
		 *
		 *  The device MUST present the feature bits it is offering in
		 *  device_feature, starting at bit device_feature_select * 32
		 *  for any device_feature_select written by the driver
		 */
		if (val == 0)
			d->mmio->cfg.device_feature = d->features;
		else if (val == 1)
			d->mmio->cfg.device_feature = (d->features >> 32);
		else
			d->mmio->cfg.device_feature = 0;
		goto feature_write_through32;
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
		if (val > 1)
			bad_driver(d, "Unexpected driver select %u", val);
		goto feature_write_through32;
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
		if (d->mmio->cfg.guest_feature_select == 0) {
			d->features_accepted &= ~((u64)0xFFFFFFFF);
			d->features_accepted |= val;
		} else {
			assert(d->mmio->cfg.guest_feature_select == 1);
			d->features_accepted &= 0xFFFFFFFF;
			d->features_accepted |= ((u64)val) << 32;
		}
		/*
		 * 2.2.1:
		 *
		 *  The driver MUST NOT accept a feature which the device did
		 *  not offer
		 */
		if (d->features_accepted & ~d->features)
			bad_driver(d, "over-accepted features %#llx of %#llx",
				   d->features_accepted, d->features);
		goto feature_write_through32;
	case offsetof(struct virtio_pci_mmio, cfg.device_status): {
		u8 prev;

		verbose("%s: device status -> %#x\n", d->name, val);
		/*
		 * 4.1.4.3.1:
		 *
		 *  The device MUST reset when 0 is written to device_status,
		 *  and present a 0 in device_status once that is done.
		 */
		if (val == 0) {
			reset_device(d);
			goto write_through8;
		}

		/* 2.1.1: The driver MUST NOT clear a device status bit. */
		if (d->mmio->cfg.device_status & ~val)
			bad_driver(d, "unset of device status bit %#x -> %#x",
				   d->mmio->cfg.device_status, val);

		/*
		 * 2.1.2:
		 *
		 *  The device MUST NOT consume buffers or notify the driver
		 *  before DRIVER_OK.
		 */
		if (val & VIRTIO_CONFIG_S_DRIVER_OK
		    && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
			start_virtqueues(d);

		/*
		 * 3.1.1:
		 *
		 *  The driver MUST follow this sequence to initialize a device:
		 *  - Reset the device.
		 *  - Set the ACKNOWLEDGE status bit: the guest OS has
		 *    noticed the device.
		 *  - Set the DRIVER status bit: the guest OS knows how
		 *    to drive the device.
		 *  - Read device feature bits, and write the subset
		 *    of feature bits understood by the OS and driver
		 *    to the device.  During this step the driver MAY
		 *    read (but MUST NOT write) the device-specific
		 *    configuration fields to check that it can
		 *    support the device before accepting it.
		 *  - Set the FEATURES_OK status bit.  The driver
		 *    MUST NOT accept new feature bits after this
		 *    step.
		 *  - Re-read device status to ensure the FEATURES_OK
		 *    bit is still set: otherwise, the device does
		 *    not support our subset of features and the
		 *    device is unusable.
		 *  - Perform device-specific setup, including
		 *    discovery of virtqueues for the device,
		 *    optional per-bus setup, reading and possibly
		 *    writing the device's virtio configuration
		 *    space, and population of virtqueues.
		 *  - Set the DRIVER_OK status bit.  At this point the
		 *    device is "live".
		 */
		prev = 0;
		switch (val & ~d->mmio->cfg.device_status) {
		case VIRTIO_CONFIG_S_DRIVER_OK:
			prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
		case VIRTIO_CONFIG_S_FEATURES_OK:
			prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
		case VIRTIO_CONFIG_S_DRIVER:
			prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
		case VIRTIO_CONFIG_S_ACKNOWLEDGE:
			break;
		default:
			bad_driver(d, "unknown device status bit %#x -> %#x",
				   d->mmio->cfg.device_status, val);
		}
		if (d->mmio->cfg.device_status != prev)
			bad_driver(d, "unexpected status transition %#x -> %#x",
				   d->mmio->cfg.device_status, val);

		/*
		 * If they just wrote FEATURES_OK, make sure they re-read it
		 * before doing any further setup.
		 */
		switch (val & ~d->mmio->cfg.device_status) {
		case VIRTIO_CONFIG_S_FEATURES_OK:
			d->wrote_features_ok = true;
			break;
		case VIRTIO_CONFIG_S_DRIVER_OK:
			if (d->wrote_features_ok)
				bad_driver(d, "did not re-read FEATURES_OK");
			break;
		}
		goto write_through8;
	}
	case offsetof(struct virtio_pci_mmio, cfg.queue_select):
		vq = vq_by_num(d, val);
		/*
		 * 4.1.4.3.1:
		 *
		 *  The device MUST present a 0 in queue_size if the virtqueue
		 *  corresponding to the current queue_select is unavailable.
		 */
		if (!vq) {
			d->mmio->cfg.queue_size = 0;
			goto write_through16;
		}
		/* Save registers for old vq, if it was a valid vq */
		if (d->mmio->cfg.queue_size)
			save_vq_config(&d->mmio->cfg,
				       vq_by_num(d, d->mmio->cfg.queue_select));
		/* Restore the registers for the queue they asked for */
		restore_vq_config(&d->mmio->cfg, vq);
		goto write_through16;
	case offsetof(struct virtio_pci_mmio, cfg.queue_size):
		/*
		 * 4.1.4.3.2:
		 *
		 *  The driver MUST NOT write a value which is not a power of 2
		 *  to queue_size.
		 */
		if (val & (val-1))
			bad_driver(d, "invalid queue size %u", val);
		if (d->mmio->cfg.queue_enable)
			bad_driver(d, "changing queue size on live device");
		goto write_through16;
	case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
		bad_driver(d, "attempt to set MSIX vector to %u", val);
	case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
		struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);

		/*
		 * 4.1.4.3.2:
		 *
		 *  The driver MUST NOT write a 0 to queue_enable.
		 */
		if (val != 1)
			bad_driver(d, "setting queue_enable to %u", val);

		/*
		 * 3.1.1:
		 *
		 *  7. Perform device-specific setup, including discovery of
		 *     virtqueues for the device, optional per-bus setup,
		 *     reading and possibly writing the device's virtio
		 *     configuration space, and population of virtqueues.
		 *  8. Set the DRIVER_OK status bit.
		 *
		 * All our devices require all virtqueues to be enabled, so
		 * they should have done that before setting DRIVER_OK.
		 */
		if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
			bad_driver(d, "enabling vq after DRIVER_OK");

		d->mmio->cfg.queue_enable = val;
		save_vq_config(&d->mmio->cfg, vq);
		check_virtqueue(d, vq);
		goto write_through16;
	}
	case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
		bad_driver(d, "attempt to write to queue_notify_off");
	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
	case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
	case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
		/*
		 * 4.1.4.3.2:
		 *
		 *  The driver MUST configure the other virtqueue fields before
		 *  enabling the virtqueue with queue_enable.
		 */
		if (d->mmio->cfg.queue_enable)
			bad_driver(d, "changing queue on live device");

		/*
		 * 3.1.1:
		 *
		 *  The driver MUST follow this sequence to initialize a device:
		 *  ...
		 *  5. Set the FEATURES_OK status bit.  The driver MUST NOT
		 *     accept new feature bits after this step.
		 */
		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
			bad_driver(d, "setting up vq before FEATURES_OK");

		/*
		 *  6. Re-read device status to ensure the FEATURES_OK bit is
		 *     still set...
		 */
		if (d->wrote_features_ok)
			bad_driver(d, "didn't re-read FEATURES_OK before setup");

		goto write_through32;
	case offsetof(struct virtio_pci_mmio, notify):
		vq = vq_by_num(d, val);
		if (!vq)
			bad_driver(d, "Invalid vq notification on %u", val);
		/* Notify the process handling this vq by adding 1 to eventfd */
		write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
		goto write_through16;
	case offsetof(struct virtio_pci_mmio, isr):
		bad_driver(d, "Unexpected write to isr");
	/* Weird corner case: write to emerg_wr of console */
	case sizeof(struct virtio_pci_mmio)
	     + offsetof(struct virtio_console_config, emerg_wr):
		if (strcmp(d->name, "console") == 0) {
			char c = val;
			write(STDOUT_FILENO, &c, 1);
			goto write_through32;
		}
		/* Fall through... */
	default:
		/*
		 * 4.1.4.3.2:
		 *
		 *  The driver MUST NOT write to device_feature, num_queues,
		 *  config_generation or queue_notify_off.
		 */
		bad_driver(d, "Unexpected write to offset %u", off);
	}
feature_write_through32:
	/*
	 * 3.1.1:
	 *
	 *  The driver MUST follow this sequence to initialize a device:
	 *  ...
	 *  - Set the DRIVER status bit: the guest OS knows how
	 *    to drive the device.
	 *  - Read device feature bits, and write the subset
	 *    of feature bits understood by the OS and driver
	 *    to the device.
	 *  ...
	 *  - Set the FEATURES_OK status bit.  The driver MUST NOT
	 *    accept new feature bits after this step.
	 */
	if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
		bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
	if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
		bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");

	/*
	 * 4.1.3.1:
	 *
	 *  The driver MUST access each field using the "natural" access
	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
	 *  16-bit fields and 8-bit accesses for 8-bit fields.
	 */
write_through32:
	if (mask != 0xFFFFFFFF) {
		bad_driver(d, "non-32-bit write to offset %u (%#x)",
			   off, getreg(eip));
		return;
	}
	memcpy((char *)d->mmio + off, &val, 4);
	return;

write_through16:
	if (mask != 0xFFFF)
		bad_driver(d, "non-16-bit write to offset %u (%#x)",
			   off, getreg(eip));
	memcpy((char *)d->mmio + off, &val, 2);
	return;

write_through8:
	if (mask != 0xFF)
		bad_driver(d, "non-8-bit write to offset %u (%#x)",
			   off, getreg(eip));
	memcpy((char *)d->mmio + off, &val, 1);
	return;
}
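/*
 * Here is the whole status-and-features dance from the driver's side: a
 * minimal sketch assuming a mapped BAR0 (the mmio pointer and the feature
 * subset "want" are this example's inputs; a real guest leaves all this to
 * the virtio core).  It passes every check emulate_mmio_write() makes above.
 */
static bool example_driver_init(volatile struct virtio_pci_mmio *mmio, u64 want)
{
	u64 offered;

	mmio->cfg.device_status = 0;	/* reset */
	mmio->cfg.device_status = VIRTIO_CONFIG_S_ACKNOWLEDGE;
	mmio->cfg.device_status |= VIRTIO_CONFIG_S_DRIVER;

	/* Read the offered 64-bit mask through the two 32-bit windows. */
	mmio->cfg.device_feature_select = 0;
	offered = mmio->cfg.device_feature;
	mmio->cfg.device_feature_select = 1;
	offered |= (u64)mmio->cfg.device_feature << 32;

	/* 2.2.1: never accept a feature which wasn't offered. */
	want &= offered;
	mmio->cfg.guest_feature_select = 0;
	mmio->cfg.guest_feature = (u32)want;
	mmio->cfg.guest_feature_select = 1;
	mmio->cfg.guest_feature = (u32)(want >> 32);

	mmio->cfg.device_status |= VIRTIO_CONFIG_S_FEATURES_OK;
	/* Re-read: if FEATURES_OK vanished, our subset was refused. */
	if (!(mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
		return false;

	/* ... discover and enable virtqueues here ... */
	mmio->cfg.device_status |= VIRTIO_CONFIG_S_DRIVER_OK;
	return true;
}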
static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
{
	u8 isr;
	u32 val = 0;

	switch (off) {
	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
	case offsetof(struct virtio_pci_mmio, cfg.device_feature):
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
		/*
		 * 3.1.1:
		 *
		 *  The driver MUST follow this sequence to initialize a device:
		 *  ...
		 *  - Set the DRIVER status bit: the guest OS knows how
		 *    to drive the device.
		 *  - Read device feature bits, and write the subset
		 *    of feature bits understood by the OS and driver
		 *    to the device.
		 */
		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
			bad_driver(d,
				   "feature read before VIRTIO_CONFIG_S_DRIVER");
		goto read_through32;
	case offsetof(struct virtio_pci_mmio, cfg.msix_config):
		bad_driver(d, "read of msix_config");
	case offsetof(struct virtio_pci_mmio, cfg.num_queues):
		goto read_through16;
	case offsetof(struct virtio_pci_mmio, cfg.device_status):
		/* As they did read, any write of FEATURES_OK is now fine. */
		d->wrote_features_ok = false;
		goto read_through8;
	case offsetof(struct virtio_pci_mmio, cfg.config_generation):
		/*
		 * 4.1.4.3.1:
		 *
		 *  The device MUST present a changed config_generation after
		 *  the driver has read a device-specific configuration value
		 *  which has changed since any part of the device-specific
		 *  configuration was last read.
		 *
		 * This is simple: none of our devices change config, so this
		 * is always 0.
		 */
		goto read_through8;
	case offsetof(struct virtio_pci_mmio, notify):
		/*
		 * 3.1.1:
		 *
		 *  The driver MUST NOT notify the device before setting
		 *  DRIVER_OK.
		 */
		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
			bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
		goto read_through16;
	case offsetof(struct virtio_pci_mmio, isr):
		if (mask != 0xFF)
			bad_driver(d, "non-8-bit read from offset %u (%#x)",
				   off, getreg(eip));
		isr = d->mmio->isr;
		/*
		 * 4.1.4.5.1:
		 *
		 *  The device MUST reset ISR status to 0 on driver read.
		 */
		d->mmio->isr = 0;
		return isr;
	case offsetof(struct virtio_pci_mmio, padding):
		bad_driver(d, "read from padding (%#x)", getreg(eip));
	default:
		/* Read from device config space, beware unaligned overflow */
		if (off > d->mmio_size - 4)
			bad_driver(d, "read past end (%#x)", getreg(eip));

		/*
		 * 3.1.1:
		 *  The driver MUST follow this sequence to initialize a device:
		 *  ...
		 *  3. Set the DRIVER status bit: the guest OS knows how to
		 *     drive the device.
		 *  4. Read device feature bits, and write the subset of
		 *     feature bits understood by the OS and driver to the
		 *     device.  During this step the driver MAY read (but MUST
		 *     NOT write) the device-specific configuration fields to
		 *     check that it can support the device before accepting it.
		 */
		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
			bad_driver(d,
				   "config read before VIRTIO_CONFIG_S_DRIVER");

		if (mask == 0xFFFFFFFF)
			goto read_through32;
		else if (mask == 0xFFFF)
			goto read_through16;
		else
			goto read_through8;
	}

	/*
	 * 4.1.3.1:
	 *
	 *  The driver MUST access each field using the "natural" access
	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
	 *  16-bit fields and 8-bit accesses for 8-bit fields.
	 */
read_through32:
	if (mask != 0xFFFFFFFF)
		bad_driver(d, "non-32-bit read from offset %u (%#x)",
			   off, getreg(eip));
	memcpy(&val, (char *)d->mmio + off, 4);
	return val;

read_through16:
	if (mask != 0xFFFF)
		bad_driver(d, "non-16-bit read from offset %u (%#x)",
			   off, getreg(eip));
	memcpy(&val, (char *)d->mmio + off, 2);
	return val;

read_through8:
	if (mask != 0xFF)
		bad_driver(d, "non-8-bit read from offset %u (%#x)",
			   off, getreg(eip));
	memcpy(&val, (char *)d->mmio + off, 1);
	return val;
}
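/*
 * The read-to-acknowledge ISR above, seen from the driver: a minimal sketch
 * of an interrupt handler, again assuming a mapped mmio pointer.  One 8-bit
 * read both fetches the cause and clears it, so there is no separate
 * acknowledge step to forget.
 */
static bool example_irq_handler(volatile struct virtio_pci_mmio *mmio)
{
	u8 isr = mmio->isr;	/* the device resets it to 0 on this read */

	/* Bit 0: a virtqueue had activity; bit 1: the config changed. */
	return isr != 0;
}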
static void emulate_mmio(unsigned long paddr, const u8 *insn)
{
	u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
	struct device *d = find_mmio_region(paddr, &off);
	unsigned long args[] = { LHREQ_TRAP, 14 };

	if (!d) {
		warnx("MMIO touching %#08lx (not a device)", paddr);
		goto reinject;
	}

	/* Prefix makes it a 16 bit op */
	if (insn[0] == 0x66) {
		mask = 0xFFFF;
		insnlen++;
	}

	/* iowrite */
	if (insn[insnlen] == 0x89) {
		/* Next byte is r/m byte: bits 3-5 are register. */
		val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
		emulate_mmio_write(d, off, val, mask);
		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
	} else if (insn[insnlen] == 0x8b) { /* ioread */
		/* Next byte is r/m byte: bits 3-5 are register. */
		val = emulate_mmio_read(d, off, mask);
		setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
	} else if (insn[0] == 0x88) { /* 8-bit iowrite */
		mask = 0xff;
		/* Next byte is r/m byte: bits 3-5 are register. */
		val = getreg_num((insn[1] >> 3) & 0x7, mask);
		emulate_mmio_write(d, off, val, mask);
		insnlen = 2 + insn_displacement_len(insn[1]);
	} else if (insn[0] == 0x8a) { /* 8-bit ioread */
		mask = 0xff;
		val = emulate_mmio_read(d, off, mask);
		setreg_num((insn[1] >> 3) & 0x7, val, mask);
		insnlen = 2 + insn_displacement_len(insn[1]);
	} else {
		warnx("Unknown MMIO instruction touching %#08lx:"
		      " %02x %02x %02x %02x at %u",
		      paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
	reinject:
		/* Inject trap into Guest. */
		if (write(lguest_fd, args, sizeof(args)) < 0)
			err(1, "Reinjecting trap 14 for fault at %#x",
			    getreg(eip));
		return;
	}

	/* Finally, we've "done" the instruction, so move past it. */
	setreg(eip, getreg(eip) + insnlen);
}
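/*
 * A worked example of that decoding (an illustration, not Launcher code):
 * "mov %ebx, 0x30(%eax)" assembles to 89 58 30.  Opcode 0x89 is a 32-bit
 * iowrite; the ModRM byte 0x58 is 01 011 000 in binary, so bits 3-5 give
 * register 3 (%ebx) and the mod bits 01 mean an 8-bit displacement follows.
 * insn_displacement_len() therefore returns 1, and we skip
 * insnlen = 2 + 1 = 3 bytes to move past the instruction.
 */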
/*L:190
 * Device Setup
 *
 * All devices need a descriptor so the Guest knows it exists, and a "struct
 * device" so the Launcher can keep track of it.  We have common helper
 * routines to allocate and manage them.
 */
static void add_pci_virtqueue(struct device *dev,
			      void (*service)(struct virtqueue *),
			      const char *name)
{
	struct virtqueue **i, *vq = malloc(sizeof(*vq));

	/* Initialize the virtqueue */
	vq->next = NULL;
	vq->last_avail_idx = 0;
	vq->dev = dev;
	vq->name = name;

	/*
	 * This is the routine the service thread will run, and its Process ID
	 * once it's running.
	 */
	vq->service = service;
	vq->thread = (pid_t)-1;

	/* Initialize the configuration. */
	reset_vq_pci_config(vq);
	vq->pci_config.queue_notify_off = 0;

	/* Add one to the number of queues */
	vq->dev->mmio->cfg.num_queues++;

	/*
	 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
	 * second.
	 */
	for (i = &dev->vq; *i; i = &(*i)->next);
	*i = vq;
}

/* The Guest accesses the feature bits via the PCI common config MMIO region */
static void add_pci_feature(struct device *dev, unsigned bit)
{
	dev->features |= (1ULL << bit);
}

/* For devices with no config. */
static void no_device_config(struct device *dev)
{
	dev->mmio_addr = get_mmio_region(dev->mmio_size);

	dev->config.bar[0] = dev->mmio_addr;
	/* Bottom 4 bits must be zero */
	assert(!(dev->config.bar[0] & 0xF));
}

/* This puts the device config into BAR0 */
static void set_device_config(struct device *dev, const void *conf, size_t len)
{
	/* Set up BAR 0 */
	dev->mmio_size += len;
	dev->mmio = realloc(dev->mmio, dev->mmio_size);
	memcpy(dev->mmio + 1, conf, len);

	/*
	 * 4.1.4.6:
	 *
	 *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
	 *  capability for any device type which has a device-specific
	 *  configuration.
	 */
	/* Hook up device cfg */
	dev->config.cfg_access.cap.cap_next
		= offsetof(struct pci_config, device);

	/*
	 * 4.1.4.6.1:
	 *
	 *  The offset for the device-specific configuration MUST be 4-byte
	 *  aligned.
	 */
	assert(dev->config.cfg_access.cap.cap_next % 4 == 0);

	/* Fix up device cfg field length. */
	dev->config.device.length = len;

	/* The rest is the same as the no-config case */
	no_device_config(dev);
}

static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
		     size_t bar_offset, size_t bar_bytes, u8 next)
{
	cap->cap_vndr = PCI_CAP_ID_VNDR;
	cap->cap_next = next;
	cap->cap_len = caplen;
	cap->cfg_type = type;
	cap->bar = 0;
	memset(cap->padding, 0, sizeof(cap->padding));
	cap->offset = bar_offset;
	cap->length = bar_bytes;
}
/*
 * This sets up the pci_config structure, as defined in the virtio 1.0
 * standard (and PCI standard).
 */
static void init_pci_config(struct pci_config *pci, u16 type,
			    u8 class, u8 subclass)
{
	size_t bar_offset, bar_len;

	/*
	 * 4.1.4.4.1:
	 *
	 *  The device MUST either present notify_off_multiplier as an even
	 *  power of 2, or present notify_off_multiplier as 0.
	 *
	 * 2.1.2:
	 *
	 *  The device MUST initialize device status to 0 upon reset.
	 */
	memset(pci, 0, sizeof(*pci));

	/* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
	pci->vendor_id = 0x1AF4;
	/* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
	pci->device_id = 0x1040 + type;

	/*
	 * PCI has specific codes for different types of devices.
	 * Linux doesn't care, but it's a good clue for people looking
	 * at the device.
	 */
	pci->class = class;
	pci->subclass = subclass;

	/*
	 * 4.1.2.1:
	 *
	 *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
	 *  higher
	 */
	pci->revid = 1;

	/*
	 * 4.1.2.1:
	 *
	 *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
	 *  0x40 or higher.
	 */
	pci->subsystem_device_id = 0x40;

	/* We use our dummy interrupt controller, and irq_line is the irq */
	pci->irq_line = devices.next_irq++;
	pci->irq_pin = 0;

	/* Support for extended capabilities. */
	pci->status = (1 << 4);

	/* Link them in. */
	/*
	 * 4.1.4.3.1:
	 *
	 *  The device MUST present at least one common configuration
	 *  capability.
	 */
	pci->capabilities = offsetof(struct pci_config, common);

	/* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
	assert(pci->capabilities % 4 == 0);

	bar_offset = offsetof(struct virtio_pci_mmio, cfg);
	bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
	init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
		 bar_offset, bar_len,
		 offsetof(struct pci_config, notify));

	/*
	 * 4.1.4.4.1:
	 *
	 *  The device MUST present at least one notification capability.
	 */
	bar_offset += bar_len;
	bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);

	/*
	 * 4.1.4.4.1:
	 *
	 *  The cap.offset MUST be 2-byte aligned.
	 */
	assert(pci->common.cap_next % 2 == 0);

	/* FIXME: Use a non-zero notify_off, for per-queue notification? */
	/*
	 * 4.1.4.4.1:
	 *
	 *  The value cap.length presented by the device MUST be at least 2 and
	 *  MUST be large enough to support queue notification offsets for all
	 *  supported queues in all possible configurations.
	 */
	assert(bar_len >= 2);

	init_cap(&pci->notify.cap, sizeof(pci->notify),
		 VIRTIO_PCI_CAP_NOTIFY_CFG,
		 bar_offset, bar_len,
		 offsetof(struct pci_config, isr));

	bar_offset += bar_len;
	bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
	/*
	 * 4.1.4.5.1:
	 *
	 *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
	 *  capability.
	 */
	init_cap(&pci->isr, sizeof(pci->isr),
		 VIRTIO_PCI_CAP_ISR_CFG,
		 bar_offset, bar_len,
		 offsetof(struct pci_config, cfg_access));

	/*
	 * 4.1.4.7.1:
	 *
	 *  The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
	 *  capability.
	 */
	/* This doesn't have any presence in the BAR */
	init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
		 VIRTIO_PCI_CAP_PCI_CFG,
		 0, 0, 0);

	bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
	assert(bar_offset == sizeof(struct virtio_pci_mmio));

	/*
	 * This gets sewn in and length set in set_device_config().
	 * Some devices don't have a device configuration interface, so
	 * we never expose this if we don't call set_device_config().
	 */
	init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
		 bar_offset, 0, 0);
}

/*
 * This routine does all the creation and setup of a new device, but we don't
 * actually place the MMIO region until we know the size (if any) of the
 * device-specific config.  And we don't actually start the service threads
 * until later.
 *
 * See what I mean about userspace being boring?
 */
static struct device *new_pci_device(const char *name, u16 type,
				     u8 class, u8 subclass)
{
	struct device *dev = malloc(sizeof(*dev));

	/* Now we populate the fields one at a time. */
	dev->name = name;
	dev->vq = NULL;
	dev->running = false;
	dev->wrote_features_ok = false;
	dev->mmio_size = sizeof(struct virtio_pci_mmio);
	dev->mmio = calloc(1, dev->mmio_size);
	dev->features = (u64)1 << VIRTIO_F_VERSION_1;
	dev->features_accepted = 0;

	if (devices.device_num + 1 >= MAX_PCI_DEVICES)
		errx(1, "Can only handle 31 PCI devices");

	init_pci_config(&dev->config, type, class, subclass);
	assert(!devices.pci[devices.device_num+1]);
	devices.pci[++devices.device_num] = dev;

	return dev;
}
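/*
 * To see the result of all that, here is a sketch of how a guest's PCI core
 * would walk the vendor capability chain we just built (an illustration
 * only: it cheats by reading our struct directly instead of issuing
 * config-space accesses, and assumes a little-endian host, as lguest does).
 */
static void example_walk_caps(const struct pci_config *cfg)
{
	const u8 *bytes = (const void *)cfg;
	u8 cap_off = cfg->capabilities;

	while (cap_off) {
		const struct virtio_pci_cap *cap
			= (const void *)(bytes + cap_off);

		/* Every virtio capability has cap_vndr == PCI_CAP_ID_VNDR. */
		printf("type %u: BAR%u offset %u, %u bytes\n",
		       cap->cfg_type, cap->bar,
		       (unsigned)cap->offset, (unsigned)cap->length);
		cap_off = cap->cap_next;
	}
}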
/*
 * Our first setup routine is the console.  It's a fairly simple device, but
 * UNIX tty handling makes it uglier than it could be.
 */
static void setup_console(void)
{
	struct device *dev;
	struct virtio_console_config conf;

	/* If we can save the initial standard input settings... */
	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
		struct termios term = orig_term;
		/*
		 * Then we turn off echo, line buffering and ^C etc: We want a
		 * raw input stream to the Guest.
		 */
		term.c_lflag &= ~(ISIG|ICANON|ECHO);
		tcsetattr(STDIN_FILENO, TCSANOW, &term);
	}

	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);

	/* We store the console state in dev->priv, and initialize it. */
	dev->priv = malloc(sizeof(struct console_abort));
	((struct console_abort *)dev->priv)->count = 0;

	/*
	 * The console needs two virtqueues: the input then the output.  When
	 * they put something in the input queue, we make sure we're listening
	 * to stdin.  When they put something in the output queue, we write it
	 * to stdout.
	 */
	add_pci_virtqueue(dev, console_input, "input");
	add_pci_virtqueue(dev, console_output, "output");

	/* We need a configuration area for the emerg_wr early writes. */
	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
	/* Don't hand uninitialized stack bytes to the Guest as config. */
	memset(&conf, 0, sizeof(conf));
	set_device_config(dev, &conf, sizeof(conf));

	verbose("device %u: console\n", devices.device_num);
}
/*:*/
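/*
 * That emerg_wr register is the one the "weird corner case" in
 * emulate_mmio_write() catches.  A sketch of its use from the Guest side,
 * assuming a mapped BAR0: the device-specific config sits right after
 * struct virtio_pci_mmio, so a panicking guest can print a character with a
 * single 32-bit write, before any virtqueue exists.
 */
static void example_emerg_putc(volatile struct virtio_pci_mmio *mmio, char c)
{
	volatile struct virtio_console_config *conf
		= (volatile void *)(mmio + 1);

	conf->emerg_wr = c;	/* the Launcher writes it straight to stdout */
}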
/*M:010
 * Inter-guest networking is an interesting area.  Simplest is to have a
 * --sharenet=<name> option which opens or creates a named pipe.  This can be
 * used to send packets to another guest in a 1:1 manner.
 *
 * More sophisticated is to use one of the tools developed for projects like
 * UML to do networking.
 *
 * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
 * completely generic ("here's my vring, attach to your vring") and would work
 * for any traffic.  Of course, namespace and permissions issues need to be
 * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
 * multiple inter-guest channels behind one interface, although it would
 * require some manner of hotplugging new virtio channels.
 *
 * Finally, we could use a virtio network switch in the kernel, ie. vhost.
:*/

static u32 str2ip(const char *ipaddr)
{
	unsigned int b[4];

	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
		errx(1, "Failed to parse IP address '%s'", ipaddr);
	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
}

static void str2mac(const char *macaddr, unsigned char mac[6])
{
	unsigned int m[6];
	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
		errx(1, "Failed to parse mac address '%s'", macaddr);
	mac[0] = m[0];
	mac[1] = m[1];
	mac[2] = m[2];
	mac[3] = m[3];
	mac[4] = m[4];
	mac[5] = m[5];
}

/*
 * This code is "adapted" from libbridge: it attaches the Host end of the
 * network device to the bridge device specified by the command line.
 *
 * This is yet another James Morris contribution (I'm an IP-level guy, so I
 * dislike bridging), and I just try not to break it.
 */
static void add_to_bridge(int fd, const char *if_name, const char *br_name)
{
	int ifidx;
	struct ifreq ifr;

	if (!*br_name)
		errx(1, "must specify bridge name");

	ifidx = if_nametoindex(if_name);
	if (!ifidx)
		errx(1, "interface %s does not exist!", if_name);

	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
	ifr.ifr_name[IFNAMSIZ-1] = '\0';
	ifr.ifr_ifindex = ifidx;
	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
		err(1, "can't add %s to bridge %s", if_name, br_name);
}

/*
 * This sets up the Host end of the network device with an IP address, then
 * brings it up so packets will flow.
 */
static void configure_device(int fd, const char *tapif, u32 ipaddr)
{
	struct ifreq ifr;
	struct sockaddr_in sin;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, tapif);

	/* Don't read these incantations.  Just cut & paste them like I did! */
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(ipaddr);
	memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
		err(1, "Setting %s interface address", tapif);
	ifr.ifr_flags = IFF_UP;
	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
		err(1, "Bringing interface %s up", tapif);
}

static int get_tun_device(char tapif[IFNAMSIZ])
{
	struct ifreq ifr;
	int vnet_hdr_sz;
	int netfd;

	/* Start with this zeroed.  Messy but sure. */
	memset(&ifr, 0, sizeof(ifr));

	/*
	 * We open the /dev/net/tun device and tell it we want a tap device.  A
	 * tap device is like a tun device, only somehow different.  To tell
	 * the truth, I completely blundered my way through this code, but it
	 * works now!
	 */
	netfd = open_or_die("/dev/net/tun", O_RDWR);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
	strcpy(ifr.ifr_name, "tap%d");
	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
		err(1, "configuring /dev/net/tun");

	if (ioctl(netfd, TUNSETOFFLOAD,
		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
		err(1, "Could not set features for tun device");

	/*
	 * We don't need checksums calculated for packets coming in this
	 * device: trust us!
	 */
	ioctl(netfd, TUNSETNOCSUM, 1);

	/*
	 * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
	 * field at the end of the network header iff
	 * VIRTIO_NET_F_MRG_RXBUF was negotiated.  For virtio 1.0,
	 * that became the norm, but we need to tell the tun device
	 * about our expanded header (which is called
	 * virtio_net_hdr_mrg_rxbuf in the legacy system).
	 */
	vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
	if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
		err(1, "Setting tun header size to %u", vnet_hdr_sz);

	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
	return netfd;
}
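/*
 * Because of that TUNSETVNETHDRSZ, every frame crossing the tap fd is
 * prefixed by a struct virtio_net_hdr_v1.  A minimal sketch of pulling one
 * frame off the fd (the real Launcher reads straight into the Guest's
 * receive buffers instead of a local buffer like this):
 */
static ssize_t example_read_frame(int tunfd, void *buf, size_t len)
{
	struct virtio_net_hdr_v1 hdr;
	struct iovec iov[2] = {
		{ .iov_base = &hdr, .iov_len = sizeof(hdr) },
		{ .iov_base = buf,  .iov_len = len },
	};

	/* Returns the total length: header plus frame, or -1 on error. */
	return readv(tunfd, iov, 2);
}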
/*L:195
 * Our network is a Host<->Guest network.  This can either use bridging or
 * routing, but the principle is the same: it uses the "tun" device to inject
 * packets into the Host as if they came in from a normal network card.  We
 * just shunt packets between the Guest and the tun device.
 */
static void setup_tun_net(char *arg)
{
	struct device *dev;
	struct net_info *net_info = malloc(sizeof(*net_info));
	int ipfd;
	u32 ip = INADDR_ANY;
	bool bridging = false;
	char tapif[IFNAMSIZ], *p;
	struct virtio_net_config conf;

	net_info->tunfd = get_tun_device(tapif);

	/* First we create a new network device. */
	dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
	dev->priv = net_info;

	/* Network devices need a recv and a send queue, just like console. */
	add_pci_virtqueue(dev, net_input, "rx");
	add_pci_virtqueue(dev, net_output, "tx");

	/*
	 * We need a socket to perform the magic network ioctls to bring up the
	 * tap interface, connect to the bridge etc.  Any socket will do!
	 */
	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (ipfd < 0)
		err(1, "opening IP socket");

	/* If the command line was --tunnet=bridge:<name> do bridging. */
	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
		arg += strlen(BRIDGE_PFX);
		bridging = true;
	}

	/* Don't hand uninitialized stack bytes to the Guest as config. */
	memset(&conf, 0, sizeof(conf));

	/* A mac address may follow the bridge name or IP address */
	p = strchr(arg, ':');
	if (p) {
		str2mac(p+1, conf.mac);
		add_pci_feature(dev, VIRTIO_NET_F_MAC);
		*p = '\0';
	}

	/* arg is now either an IP address or a bridge name */
	if (bridging)
		add_to_bridge(ipfd, tapif, arg);
	else
		ip = str2ip(arg);

	/* Set up the tun device. */
	configure_device(ipfd, tapif, ip);

	/* Expect Guest to handle everything except UFO */
	add_pci_feature(dev, VIRTIO_NET_F_CSUM);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
	add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
	add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
	/* We handle indirect ring entries */
	add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
	set_device_config(dev, &conf, sizeof(conf));

	/* We don't need the socket any more; setup is done. */
	close(ipfd);

	if (bridging)
		verbose("device %u: tun %s attached to bridge: %s\n",
			devices.device_num, tapif, arg);
	else
		verbose("device %u: tun %s: %s\n",
			devices.device_num, tapif, arg);
}
/*:*/
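/*
 * Two example invocations, to make the parsing above concrete (the address,
 * bridge name and MAC are made up, and you may need root for the tun and
 * bridge ioctls):
 *
 *   lguest --tunnet=192.168.19.1 256 vmlinux
 *   lguest --tunnet=bridge:br0:00:11:22:33:44:55 256 vmlinux
 *
 * The first gives the Host end of the tap device the address 192.168.19.1;
 * the second attaches the tap device to the existing bridge br0 and also
 * offers the Guest a fixed MAC address.
 */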
/* This hangs off device->priv. */
struct vblk_info {
	/* The size of the file. */
	off64_t len;

	/* The file descriptor for the file. */
	int fd;

};

/*L:210
 * The Disk
 *
 * The disk only has one virtqueue, so it only has one thread.  It is really
 * simple: the Guest asks for a block number and we read or write that position
 * in the file.
 *
 * Before we serviced each virtqueue in a separate thread, this was
 * unacceptably slow: the Guest waits until the read is finished before
 * running anything else, even if it could have been doing useful work.
 *
 * We could have used async I/O, except it's reputed to suck so hard that
 * characters actually go missing from your code when you try to use it.
 */
static void blk_request(struct virtqueue *vq)
{
	struct vblk_info *vblk = vq->dev->priv;
	unsigned int head, out_num, in_num, wlen;
	int ret, i;
	u8 *in;
	struct virtio_blk_outhdr out;
	struct iovec iov[vq->vring.num];
	off64_t off;

	/*
	 * Get the next request, where we normally wait.  It triggers the
	 * interrupt to acknowledge previously serviced requests (if any).
	 */
	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);

	/* Copy the output header from the front of the iov (adjusts iov) */
	iov_consume(vq->dev, iov, out_num, &out, sizeof(out));

	/* Find and trim end of iov input array, for our status byte. */
	in = NULL;
	for (i = out_num + in_num - 1; i >= out_num; i--) {
		if (iov[i].iov_len > 0) {
			in = iov[i].iov_base + iov[i].iov_len - 1;
			iov[i].iov_len--;
			break;
		}
	}
	if (!in)
		bad_driver_vq(vq, "Bad virtblk cmd with no room for status");

	/*
	 * For historical reasons, block operations are expressed in 512 byte
	 * "sectors".
	 */
	off = out.sector * 512;

	if (out.type & VIRTIO_BLK_T_OUT) {
		/*
		 * Write
		 *
		 * Move to the right location in the block file.  This can fail
		 * if they try to write past end.
		 */
		if (lseek64(vblk->fd, off, SEEK_SET) != off)
			err(1, "Bad seek to sector %llu", out.sector);

		ret = writev(vblk->fd, iov, out_num);
		verbose("WRITE to sector %llu: %i\n", out.sector, ret);

		/*
		 * Grr... Now we know how long the descriptor they sent was, we
		 * make sure they didn't try to write over the end of the block
		 * file (possibly extending it).
		 */
		if (ret > 0 && off + ret > vblk->len) {
			/* Trim it back to the correct length */
			ftruncate64(vblk->fd, vblk->len);
			/* Die, bad Guest, die. */
			bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
		}

		wlen = sizeof(*in);
		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
	} else if (out.type & VIRTIO_BLK_T_FLUSH) {
		/* Flush */
		ret = fdatasync(vblk->fd);
		verbose("FLUSH fdatasync: %i\n", ret);
		wlen = sizeof(*in);
		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
	} else {
		/*
		 * Read
		 *
		 * Move to the right location in the block file.  This can fail
		 * if they try to read past end.
		 */
		if (lseek64(vblk->fd, off, SEEK_SET) != off)
			err(1, "Bad seek to sector %llu", out.sector);

		ret = readv(vblk->fd, iov + out_num, in_num);
		if (ret >= 0) {
			wlen = sizeof(*in) + ret;
			*in = VIRTIO_BLK_S_OK;
		} else {
			wlen = sizeof(*in);
			*in = VIRTIO_BLK_S_IOERR;
		}
	}

	/* Finished that request. */
	add_used(vq, head, wlen);
}
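/*
 * A worked request, to make the iov gymnastics above concrete (the numbers
 * are illustrative): to read 8 sectors starting at sector 100, the Guest
 * queues three descriptors: a 16-byte out buffer holding
 * { type = VIRTIO_BLK_T_IN, sector = 100 }, a 4096-byte in buffer for the
 * data, and a 1-byte in buffer for the status.  We consume the header
 * (off = 100 * 512 = 51200), steal the last byte of the last in buffer for
 * the status, readv() the data, and report wlen = 1 + 4096 = 4097 bytes used.
 */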
/*L:198 This actually sets up a virtual block device. */
static void setup_block_file(const char *filename)
{
	struct device *dev;
	struct vblk_info *vblk;
	struct virtio_blk_config conf;

	/* Create the device. */
	dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);

	/* The device has one virtqueue, where the Guest places requests. */
	add_pci_virtqueue(dev, blk_request, "request");

	/* Allocate the room for our own bookkeeping */
	vblk = dev->priv = malloc(sizeof(*vblk));

	/* First we open the file and store the length. */
	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
	vblk->len = lseek64(vblk->fd, 0, SEEK_END);

	/* Don't hand uninitialized stack bytes to the Guest as config. */
	memset(&conf, 0, sizeof(conf));

	/* Tell Guest how many sectors this device has. */
	conf.capacity = cpu_to_le64(vblk->len / 512);

	/*
	 * Tell Guest not to put in too many descriptors at once: two are used
	 * for the in and out elements.
	 */
	add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);

	set_device_config(dev, &conf, sizeof(struct virtio_blk_config));

	verbose("device %u: virtblock %llu sectors\n",
		devices.device_num, le64_to_cpu(conf.capacity));
}

/*L:211
 * Our random number generator device reads from /dev/urandom into the Guest's
 * input buffers.  The usual case is that the Guest doesn't want random numbers
 * and so has no buffers although /dev/urandom is still readable, whereas
 * console is the reverse.
 *
 * The same logic applies, however.
 */
struct rng_info {
	int rfd;
};

static void rng_input(struct virtqueue *vq)
{
	int len;
	unsigned int head, in_num, out_num, totlen = 0;
	struct rng_info *rng_info = vq->dev->priv;
	struct iovec iov[vq->vring.num];

	/* First we need a buffer from the Guest's virtqueue. */
	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
	if (out_num)
		bad_driver_vq(vq, "Output buffers in rng?");

	/*
	 * Just like the console write, we loop to cover the whole iovec.
	 * In this case, short reads actually happen quite a bit.
	 */
	while (!iov_empty(iov, in_num)) {
		len = readv(rng_info->rfd, iov, in_num);
		if (len <= 0)
			err(1, "Read from /dev/urandom gave %i", len);
		iov_consume(vq->dev, iov, in_num, NULL, len);
		totlen += len;
	}

	/* Tell the Guest about the new input. */
	add_used(vq, head, totlen);
}

/*L:199
 * This creates a "hardware" random number device for the Guest.
 */
static void setup_rng(void)
{
	struct device *dev;
	struct rng_info *rng_info = malloc(sizeof(*rng_info));

	/* Our device's private info simply contains the /dev/urandom fd. */
	rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);

	/* Create the new device. */
	dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
	dev->priv = rng_info;

	/* The device has one virtqueue, where the Guest places inbufs. */
	add_pci_virtqueue(dev, rng_input, "input");

	/* We don't have any configuration space */
	no_device_config(dev);

	verbose("device %u: rng\n", devices.device_num);
}
/* That's the end of device setup. */

/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
static void __attribute__((noreturn)) restart_guest(void)
{
	unsigned int i;

	/*
	 * Since we don't track all open fds, we simply close everything beyond
	 * stderr.
	 */
	for (i = 3; i < FD_SETSIZE; i++)
		close(i);

	/* Reset all the devices (kills all threads). */
	cleanup_devices();

	execv(main_args[0], main_args);
	err(1, "Could not exec %s", main_args[0]);
}
/*L:220
 * Finally we reach the core of the Launcher which runs the Guest, serves
 * its input and output, and finally, lays it to rest.
 */
static void __attribute__((noreturn)) run_guest(void)
{
	for (;;) {
		struct lguest_pending notify;
		int readval;

		/* We read from the /dev/lguest device to run the Guest. */
		readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
		if (readval == sizeof(notify)) {
			if (notify.trap == 13) {
				verbose("Emulating instruction at %#x\n",
					getreg(eip));
				emulate_insn(notify.insn);
			} else if (notify.trap == 14) {
				verbose("Emulating MMIO at %#x\n",
					getreg(eip));
				emulate_mmio(notify.addr, notify.insn);
			} else
				errx(1, "Unknown trap %i addr %#08x",
				     notify.trap, notify.addr);
		/* ENOENT means the Guest died.  Reading tells us why. */
		} else if (errno == ENOENT) {
			char reason[1024] = { 0 };
			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
			errx(1, "%s", reason);
		/* ERESTART means that we need to reboot the guest */
		} else if (errno == ERESTART) {
			restart_guest();
		/* Anything else means a bug or incompatible change. */
		} else
			err(1, "Running guest failed");
	}
}
/*L:240
 * This is the end of the Launcher.  The good news: we are over halfway
 * through!  The bad news: the most fiendish part of the code still lies ahead
 * of us.
 *
 * Are you ready?  Take a deep breath and join me in the core of the Host, in
 * "make Host".
:*/

static struct option opts[] = {
	{ "verbose", 0, NULL, 'v' },
	{ "tunnet", 1, NULL, 't' },
	{ "block", 1, NULL, 'b' },
	{ "rng", 0, NULL, 'r' },
	{ "initrd", 1, NULL, 'i' },
	{ "username", 1, NULL, 'u' },
	{ "chroot", 1, NULL, 'c' },
	{ NULL },
};
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] "
	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
	     "|--block=<filename>|--initrd=<filename>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}

/*L:105 The main routine is where the real work begins: */
int main(int argc, char *argv[])
{
	/* Memory, code startpoint and size of the (optional) initrd. */
	unsigned long mem = 0, start, initrd_size = 0;
	/* Two temporaries. */
	int i, c;
	/* The boot information for the Guest. */
	struct boot_params *boot;
	/* If they specify an initrd file to load. */
	const char *initrd_name = NULL;

	/* Password structure for initgroups/setres[gu]id */
	struct passwd *user_details = NULL;

	/* Directory to chroot to */
	char *chroot_path = NULL;

	/* Save the args: we "reboot" by execing ourselves again. */
	main_args = argv;

	/*
	 * First we initialize the device list.  We remember next interrupt
	 * number to use for devices (1: remember that 0 is used by the timer).
	 */
	devices.next_irq = 1;

	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
	cpu_id = 0;

	/*
	 * We need to know how much memory so we can set up the device
	 * descriptor and memory pages for the devices as we parse the command
	 * line.  So we quickly look through the arguments to find the amount
	 * of memory now.
	 */
	for (i = 1; i < argc; i++) {
		if (argv[i][0] != '-') {
			mem = atoi(argv[i]) * 1024 * 1024;
			/*
			 * We start by mapping anonymous pages over all of
			 * guest-physical memory range.  This fills it with 0,
			 * and ensures that the Guest won't be killed when it
			 * tries to access it.
			 */
			guest_base = map_zeroed_pages(mem / getpagesize()
						      + DEVICE_PAGES);
			guest_limit = mem;
			guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
			break;
		}
	}
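	/*
	 * A worked example, assuming 4096-byte pages: "lguest 256 vmlinux"
	 * gives mem = 256 * 1024 * 1024.  We map 65536 + 256 pages,
	 * guest_limit is 256MB, and guest_max = guest_mmio = 256MB + 1MB,
	 * since DEVICE_PAGES * getpagesize() = 256 * 4096 = 1MB.
	 */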
	/* We always have a console device, and it's always device 1. */
	setup_console();

	/* The options are fairly straightforward */
	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
		switch (c) {
		case 'v':
			verbose = true;
			break;
		case 't':
			setup_tun_net(optarg);
			break;
		case 'b':
			setup_block_file(optarg);
			break;
		case 'r':
			setup_rng();
			break;
		case 'i':
			initrd_name = optarg;
			break;
		case 'u':
			user_details = getpwnam(optarg);
			if (!user_details)
				err(1, "getpwnam failed, incorrect username?");
			break;
		case 'c':
			chroot_path = optarg;
			break;
		default:
			warnx("Unknown argument %s", argv[optind]);
			usage();
		}
	}
	/*
	 * After the other arguments we expect memory and kernel image name,
	 * followed by command line arguments for the kernel.
	 */
	if (optind + 2 > argc)
		usage();

	verbose("Guest base is at %p\n", guest_base);

	/* Initialize the (fake) PCI host bridge device. */
	init_pci_host_bridge();

	/* Now we load the kernel */
	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));

	/* Boot information is stashed at physical address 0 */
	boot = from_guest_phys(0);

	/* Map the initrd image if requested (at top of physical memory) */
	if (initrd_name) {
		initrd_size = load_initrd(initrd_name, mem);
		/*
		 * These are the location in the Linux boot header where the
		 * start and size of the initrd are expected to be found.
		 */
		boot->hdr.ramdisk_image = mem - initrd_size;
		boot->hdr.ramdisk_size = initrd_size;
		/* The bootloader type 0xFF means "unknown"; that's OK. */
		boot->hdr.type_of_loader = 0xFF;
	}

	/*
	 * The Linux boot header contains an "E820" memory map: ours is a
	 * simple, single region.
	 */
	boot->e820_entries = 1;
	boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
	/*
	 * The boot header contains a command line pointer: we put the command
	 * line after the boot header.
	 */
	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
	/* We use a simple helper to copy the arguments separated by spaces. */
	concat((char *)(boot + 1), argv+optind+2);

	/* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
	boot->hdr.kernel_alignment = 0x1000000;

	/* Boot protocol version: 2.07 supports the fields for lguest. */
	boot->hdr.version = 0x207;

	/* The hardware_subarch value of "1" tells the Guest it's an lguest. */
	boot->hdr.hardware_subarch = 1;

	/* Tell the entry path not to try to reload segment registers. */
	boot->hdr.loadflags |= KEEP_SEGMENTS;

	/* We tell the kernel to initialize the Guest. */
	tell_kernel(start);

	/* Ensure that we terminate if a device-servicing child dies. */
	signal(SIGCHLD, kill_launcher);

	/* If we exit via err(), this kills all the threads, restores tty. */
	atexit(cleanup_devices);

	/* If requested, chroot to a directory */
	if (chroot_path) {
		if (chroot(chroot_path) != 0)
			err(1, "chroot(\"%s\") failed", chroot_path);

		if (chdir("/") != 0)
			err(1, "chdir(\"/\") failed");

		verbose("chroot done\n");
	}

	/* If requested, drop privileges */
	if (user_details) {
		uid_t u;
		gid_t g;

		u = user_details->pw_uid;
		g = user_details->pw_gid;

		if (initgroups(user_details->pw_name, g) != 0)
			err(1, "initgroups failed");

		if (setresgid(g, g, g) != 0)
			err(1, "setresgid failed");

		if (setresuid(u, u, u) != 0)
			err(1, "setresuid failed");

		verbose("Dropping privileges completed\n");
	}

	/* Finally, run the Guest.  This doesn't return. */
	run_guest();
}
/*:*/

/*M:999
 * Mastery is done: you now know everything I do.
 *
 * But surely you have seen code, features and bugs in your wanderings which
 * you now yearn to attack?  That is the real game, and I look forward to you
 * patching and forking lguest into the Your-Name-Here-visor.
 *
 * Farewell, and good coding!
 * Rusty Russell.
 */