#define _GNU_SOURCE

/*
 * Micro-benchmark comparing the cost of first-touch page faults on:
 *   - a plain file-backed mmap (with and without MAP_POPULATE), and
 *   - an anonymous region whose missing pages are supplied from the same
 *     file by a userfaultfd handler thread, either with UFFDIO_COPY or by
 *     mremap()-ing a freshly filled page over the faulting address.
 *
 * NOTE(review): the header names were missing from the reviewed copy of this
 * file; the list below is reconstructed from the identifiers the code uses.
 * Confirm against the original.
 */
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <linux/userfaultfd.h>

/* Set by the benchmarking thread to ask the fault handler thread to exit. */
static atomic_int stop;

/* Arguments handed to the fault handler thread. */
struct params {
    int uffd;            /* userfaultfd descriptor to service */
    int filefd;          /* file providing the page contents */
    long page_size;      /* size of one page of the managed region */
    long long startaddr; /* start address of the managed region */
    bool copy;           /* true: UFFDIO_COPY, false: mmap + mremap */
};

/* Returns the current CLOCK_MONOTONIC time in nanoseconds; exits on failure. */
static inline uint64_t getns(void)
{
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        perror("clock_gettime");
        exit(1);
    }
    return (((uint64_t)ts.tv_sec) * 1000000000ULL) + (uint64_t)ts.tv_nsec;
}

/* Returns the system page size in bytes; exits if it cannot be determined. */
static long get_page_size(void)
{
    long ret = sysconf(_SC_PAGESIZE);
    if (ret == -1) {
        perror("sysconf/pagesize");
        exit(1);
    }
    assert(ret > 0);
    return ret;
}

/*
 * Fills dest[0..page_size) with the file contents starting at offset.
 * A short file (EOF before page_size bytes) leaves the tail zero-filled.
 * Exits the process on lseek/read errors.
 */
static void read_page(int fd, off_t offset, char *dest, size_t page_size)
{
    if (lseek(fd, offset, SEEK_SET) < 0) {
        int saved = errno; // fprintf may overwrite errno before perror runs
        fprintf(stderr, "FD %d, offset %lld\n", fd, (long long)offset);
        errno = saved;
        perror("lseek");
        exit(1);
    }

    size_t total = 0;
    while (total < page_size) {
        ssize_t n = read(fd, dest + total, page_size - total);
        if (n < 0) {
            if (errno == EINTR)
                continue;
            perror("read/file");
            exit(1);
        }
        if (n == 0)
            break; // EOF: the last page of the file may be partial
        total += (size_t)n;
    }

    // The copy buffer is reused between faults, so clear whatever the file
    // did not cover instead of leaking the previous page's bytes.
    memset(dest + total, 0, page_size - total);
}

/*
 * Thread entry point servicing page faults reported on p->uffd.
 *
 * arg points to a struct params that must outlive the thread. The loop polls
 * the userfaultfd with a 2 second timeout and returns NULL once `stop` is
 * observed non-zero after a poll. Any unexpected failure terminates the
 * whole process with exit(1).
 */
static void *handler(void *arg)
{
    struct params *p = arg;
    const size_t page_size = (size_t)p->page_size;
    const uint64_t base = (uint64_t)p->startaddr;

    // Heap buffer rather than a VLA: a page may be 2 MiB in the huge-page runs.
    char *buf = NULL;
    if (p->copy) {
        buf = malloc(page_size);
        if (buf == NULL) {
            perror("malloc");
            exit(1);
        }
    }

    for (;;) {
        struct uffd_msg msg;
        struct pollfd pollfd[1];
        pollfd[0].fd = p->uffd;
        pollfd[0].events = POLLIN;
        pollfd[0].revents = 0;

        // wait for a userfaultfd event to occur
        int pollres = poll(pollfd, 1, 2000);
        if (stop)
            break;

        switch (pollres) {
        case -1:
            perror("poll/userfaultfd");
            continue;
        case 0:
            continue;
        case 1:
            break;
        default:
            fprintf(stderr, "unexpected poll result\n");
            exit(1);
        }

        if (pollfd[0].revents & POLLERR) {
            fprintf(stderr, "pollerr\n");
            exit(1);
        }
        if (!(pollfd[0].revents & POLLIN)) {
            continue;
        }

        ssize_t readres = read(p->uffd, &msg, sizeof(msg));
        if (readres == -1) {
            if (errno == EAGAIN)
                continue;
            perror("read/userfaultfd");
            exit(1);
        }
        if ((size_t)readres != sizeof(msg)) {
            fprintf(stderr, "invalid msg size\n");
            exit(1);
        }

        // Event codes are enumerated values, not bit flags.
        if (msg.event != UFFD_EVENT_PAGEFAULT)
            continue;

        // Round the faulting address down to the start of its page; both
        // UFFDIO_COPY and mremap need a page-aligned destination.
        uint64_t rel = (uint64_t)msg.arg.pagefault.address - base;
        rel -= rel % page_size;
        const uint64_t addr = base + rel;

        char *dest = buf;
        if (!p->copy) {
            dest = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                        MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
            if (dest == MAP_FAILED) {
                perror("mmap buf");
                exit(1);
            }
        }

        // handle the page fault by fetching a page worth of bytes
        read_page(p->filefd, (off_t)rel, dest, page_size);

        if (p->copy) {
            struct uffdio_copy copy;
            copy.src = (uint64_t)(uintptr_t)buf;
            copy.dst = addr;
            copy.len = page_size;
            copy.mode = 0;
            copy.copy = 0;
            if (ioctl(p->uffd, UFFDIO_COPY, &copy) == -1) {
                perror("ioctl/copy");
                exit(1);
            }
        } else {
            // Move the freshly filled page over the faulting address, then
            // wake the faulting thread explicitly.
            if (mremap(dest, page_size, page_size,
                       MREMAP_FIXED | MREMAP_MAYMOVE,
                       (void *)(uintptr_t)addr) == MAP_FAILED) {
                int saved = errno;
                if (saved == ENOMEM) {
                    fprintf(stderr, "Run this to increase limit on mappings: sudo sysctl -w 'vm.max_map_count=1048576'\n");
                }
                errno = saved;
                perror("mremap");
                exit(1);
            }

            struct uffdio_range range;
            range.start = addr;
            range.len = page_size;
            if (ioctl(p->uffd, UFFDIO_WAKE, &range) == -1) {
                perror("ioctl/wake");
                exit(1);
            }
        }
    }

    free(buf);
    return NULL;
}

/*
 * Reads one int from the start of each of num_pages pages of region, timing
 * every access, and prints total/average/min/max latency in nanoseconds
 * together with the sum of the values read.
 *
 * name is printed as the heading of the report.
 */
static void touch_all_pages(const char *name, size_t page_size, unsigned long num_pages, char *region)
{
    printf("%s:\nPage size: %zu\nNum pages: %lu\n", name, page_size, num_pages);

    uint64_t sum = 0, min = UINT64_MAX, max = 0;
    unsigned value = 0; // unsigned so that accumulating file contents cannot overflow (UB)
    const char *cur = region;

    uint64_t before = getns();
    for (unsigned long i = 0; i < num_pages; i++) {
        uint64_t start = getns();
        // volatile: the load itself is what is being measured
        int v = *(const volatile int *)cur;
        uint64_t dur = getns() - start;

        value += (unsigned)v;
        cur += page_size;
        sum += dur;
        min = dur < min ? dur : min;
        max = dur > max ? dur : max;
    }
    uint64_t after = getns();

    // Nothing was measured for an empty region: avoid dividing by zero.
    uint64_t avg = num_pages ? sum / num_pages : 0;
    if (num_pages == 0)
        min = 0;

    printf("TOTAL %" PRIu64 " (%" PRIu64 "), AVG %" PRIu64 ", MIN %" PRIu64 ", MAX %" PRIu64 " ... VALUE %d\n",
           after - before, sum, avg, min, max, (int)value);
    printf("--------------------------------------\n");
}

/*
 * Maps num_pages pages of fd read-only (optionally with MAP_POPULATE),
 * reports how long the mmap call took, and runs touch_all_pages over it.
 */
static void test_mmap_file(const char *name, int fd, size_t page_size, unsigned long num_pages, bool populate)
{
    const size_t length = page_size * num_pages;
    const int flags = MAP_PRIVATE | (populate ? MAP_POPULATE : 0);

    uint64_t start = getns();
    void *region = mmap(NULL, length, PROT_READ, flags, fd, 0);
    uint64_t end = getns();
    if (region == MAP_FAILED) {
        perror("mmap fd");
        exit(1);
    }
    printf("mmap call took %" PRIu64 " ns\n", end - start);

    touch_all_pages(name, page_size, num_pages, (char *)region);
    munmap(region, length);
}

/*
 * Creates an anonymous region of num_pages pages (huge pages when `huge`),
 * registers it with a new userfaultfd for missing-page faults, starts the
 * handler thread that fills pages from fd, and runs touch_all_pages over it.
 *
 * copy selects how the handler resolves faults (see struct params).
 */
static void test_userfaultfd(const char *name, int fd, bool huge, bool copy, size_t page_size, unsigned long num_pages)
{
    const size_t length = page_size * num_pages;

    // open the userfault fd
    int uffd = (int)syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (uffd == -1) {
        perror("syscall/userfaultfd");
        exit(1);
    }

    // enable for api version and check features
    struct uffdio_api uffdio_api;
    uffdio_api.api = UFFD_API;
    uffdio_api.features = 0;
    if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
        perror("ioctl/uffdio_api");
        exit(1);
    }
    if (uffdio_api.api != UFFD_API) {
        fprintf(stderr, "unsupported userfaultfd api\n");
        exit(1);
    }

    // allocate a memory region to be managed by userfaultfd
    void *region = mmap(NULL, length, PROT_READ,
                        MAP_PRIVATE | MAP_ANONYMOUS | (huge ? MAP_HUGETLB : 0), -1, 0);
    if (region == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }

    // register the pages in the region for missing callbacks
    struct uffdio_register uffdio_register;
    uffdio_register.range.start = (unsigned long)region;
    uffdio_register.range.len = length;
    uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
        perror("ioctl/uffdio_register");
        exit(1);
    }

    // start the thread that will handle userfaultfd events
    stop = 0;
    struct params p;
    p.uffd = uffd;
    p.filefd = fd;
    p.page_size = (long)page_size;
    p.startaddr = (long long)region;
    p.copy = copy;

    pthread_t uffd_thread;
    int rc = pthread_create(&uffd_thread, NULL, handler, &p);
    if (rc != 0) {
        errno = rc; // pthread functions return the error instead of setting errno
        perror("pthread_create");
        exit(1);
    }

    sleep(1);
    touch_all_pages(name, page_size, num_pages, region);

    stop = 1;
    pthread_join(uffd_thread, NULL);

    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
        fprintf(stderr, "ioctl unregister failure\n");
        exit(1);
    }
    munmap(region, length);
    close(uffd);
}

/*
 * Refactored code originally from https://noahdesu.github.io/2016/10/10/userfaultfd-hello-world.html
 *
 * Usage: <program> <file>
 * Runs every benchmark variant against the given non-empty file.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "Use filename as first argument");
        exit(1);
    }

    struct stat st;
    if (stat(argv[1], &st) != 0) {
        perror("stat");
        exit(1);
    }
    if (st.st_size <= 0) {
        // The page-count computation below needs at least one byte.
        fprintf(stderr, "file is empty\n");
        exit(1);
    }
    printf("Tested file has %llu MB (%llu bytes)\n",
           (unsigned long long)st.st_size >> 20, (unsigned long long)st.st_size);

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        exit(1);
    }

    const size_t file_size = (size_t)st.st_size;

    size_t regular_page_size = (size_t)get_page_size();
    unsigned long num_regular = (file_size - 1) / regular_page_size + 1;
    test_mmap_file("mmapped file", fd, regular_page_size, num_regular, false);
    test_mmap_file("mmapped file, populated", fd, regular_page_size, num_regular, true);
    test_userfaultfd("Userfaultfd, regular pages, copy", fd, false, true, regular_page_size, num_regular);
    test_userfaultfd("Userfaultfd, regular pages, remap", fd, false, false, regular_page_size, num_regular);

    size_t huge_page_size = (size_t)2 << 20; // 2 MiB
    unsigned long num_huge = (file_size - 1) / huge_page_size + 1;
    test_userfaultfd("Userfaultfd, huge pages, copy", fd, true, true, huge_page_size, num_huge);
    test_userfaultfd("Userfaultfd, huge pages, remap", fd, true, false, huge_page_size, num_huge);

    close(fd);
    return 0;
}