libpmem: move mmap alignment to init time
[fio.git] / engines / libpmem.c
CommitLineData
ae0db592
TI
1/*
2 * libpmem: IO engine that uses NVML libpmem to read and write data
3 *
4 * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License,
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 */
16
17/*
18 * libpmem engine
19 *
20 * IO engine that uses libpmem to read and write data
21 *
22 * To use:
23 * ioengine=libpmem
24 *
25 * Other relevant settings:
26 * iodepth=1
27 * direct=1
28 * directory=/mnt/pmem0/
29 * bs=4k
30 *
31 * direct=1 means that pmem_drain() is executed for each write operation.
32 * In contrast, direct=0 means that pmem_drain() is not executed.
33 *
34 * The pmem device must have a DAX-capable filesystem and be mounted
35 * with DAX enabled. directory must point to a mount point of DAX FS.
36 *
37 * Example:
38 * mkfs.xfs /dev/pmem0
39 * mkdir /mnt/pmem0
40 * mount -o dax /dev/pmem0 /mnt/pmem0
41 *
42 *
43 * See examples/libpmem.fio for more.
44 *
45 *
46 * libpmem.so
47 * By default, the libpmem engine will let the system find the libpmem.so
48 * that it uses. You can use an alternative libpmem by setting the
49 * FIO_PMEM_LIB environment variable to the full path to the desired
50 * libpmem.so.
51 */
52
53#include <stdio.h>
54#include <limits.h>
55#include <stdlib.h>
56#include <unistd.h>
57#include <errno.h>
58#include <sys/mman.h>
59#include <sys/stat.h>
60#include <sys/sysmacros.h>
61#include <libgen.h>
62#include <libpmem.h>
63
64#include "../fio.h"
65#include "../verify.h"
66
/*
 * Upper bound, in bytes (1 GiB), on the window that is mapped when the
 * engine falls back to mapping only part of a file.
 */
#define MMAP_TOTAL_SZ	(1024 * 1024 * 1024UL)
72
/*
 * Per-file engine state, attached to a fio_file through FILE_SET_ENG_DATA().
 */
struct fio_libpmem_data {
	void *libpmem_ptr;	/* base of the current mapping, NULL if none */
	size_t libpmem_sz;	/* length of the current mapping in bytes */
	off_t libpmem_off;	/* offset recorded for the start of the mapping */
};
78
/* Size units, typed as uintptr_t so they combine with address arithmetic. */
#define MEGABYTE	((uintptr_t)1 << 20)
#define GIGABYTE	(MEGABYTE << 10)

/* Maximum expected line length in /proc files. */
#define PROCMAXLEN	2048

/* Round x up to the next multiple of y (y must be non-zero). */
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))
83
/* True once a valid PMEM_MMAP_HINT has been parsed; disables randomized hints. */
static bool Mmap_no_random;

/* User-supplied lower bound for mapping addresses (from PMEM_MMAP_HINT). */
static void *Mmap_hint;

/* Default mapping alignment, filled in when the engine is registered. */
static unsigned long long Mmap_align;
ae0db592
TI
87
88/*
89 * util_map_hint_align -- choose the desired mapping alignment
90 *
91 * Use 2MB/1GB page alignment only if the mapping length is at least
92 * twice as big as the page size.
93 */
597a6533 94static inline size_t util_map_hint_align(size_t len, size_t req_align)
ae0db592 95{
4f1957b7 96 size_t align = Mmap_align;
ae0db592
TI
97
98 dprint(FD_IO, "DEBUG util_map_hint_align\n" );
ae0db592
TI
99
100 if (req_align)
101 align = req_align;
102 else if (len >= 2 * GIGABYTE)
103 align = GIGABYTE;
104 else if (len >= 4 * MEGABYTE)
105 align = 2 * MEGABYTE;
106
107 dprint(FD_IO, "align=%d\n", (int)align);
108 return align;
109}
110
/*
 * Per-OS location of the process memory-map listing and the sscanf()
 * pattern that extracts the start/end address of one mapped range.
 */
#if defined(__FreeBSD__)
#define OS_MAPFILE "/proc/curproc/map"
#define MAP_NORESERVE 0
static const char *sscanf_os = "%p %p";
#else
#define OS_MAPFILE "/proc/self/maps"
static const char *sscanf_os = "%p-%p";
#endif
119
120/*
121 * util_map_hint_unused -- use /proc to determine a hint address for mmap()
122 *
123 * This is a helper function for util_map_hint().
124 * It opens up /proc/self/maps and looks for the first unused address
125 * in the process address space that is:
126 * - greater or equal 'minaddr' argument,
127 * - large enough to hold range of given length,
128 * - aligned to the specified unit.
129 *
130 * Asking for aligned address like this will allow the DAX code to use large
131 * mappings. It is not an error if mmap() ignores the hint and chooses
132 * different address.
133 */
597a6533 134static char *util_map_hint_unused(void *minaddr, size_t len, size_t align)
ae0db592
TI
135{
136 char *lo = NULL; /* beginning of current range in maps file */
137 char *hi = NULL; /* end of current range in maps file */
138 char *raddr = minaddr; /* ignore regions below 'minaddr' */
139
140#ifdef WIN32
141 MEMORY_BASIC_INFORMATION mi;
142#else
143 FILE *fp;
144 char line[PROCMAXLEN]; /* for fgets() */
145#endif
146
147 dprint(FD_IO, "DEBUG util_map_hint_unused\n");
148 assert(align > 0);
149
ae0db592 150 if (raddr == NULL)
4f1957b7 151 raddr += page_size;
ae0db592
TI
152
153 raddr = (char *)roundup((uintptr_t)raddr, align);
154
155#ifdef WIN32
156 while ((uintptr_t)raddr < UINTPTR_MAX - len) {
157 size_t ret = VirtualQuery(raddr, &mi, sizeof(mi));
158 if (ret == 0) {
159 ERR("VirtualQuery %p", raddr);
160 return MAP_FAILED;
161 }
162 dprint(FD_IO, "addr %p len %zu state %d",
163 mi.BaseAddress, mi.RegionSize, mi.State);
164
165 if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) {
166 raddr = (char *)mi.BaseAddress + mi.RegionSize;
167 raddr = (char *)roundup((uintptr_t)raddr, align);
168 dprint(FD_IO, "nearest aligned addr %p", raddr);
169 } else {
170 dprint(FD_IO, "unused region of size %zu found at %p",
171 mi.RegionSize, mi.BaseAddress);
172 return mi.BaseAddress;
173 }
174 }
175
176 dprint(FD_IO, "end of address space reached");
177 return MAP_FAILED;
178#else
597a6533
JA
179 fp = fopen(OS_MAPFILE, "r");
180 if (!fp) {
ae0db592
TI
181 log_err("!%s\n", OS_MAPFILE);
182 return MAP_FAILED;
183 }
184
185 while (fgets(line, PROCMAXLEN, fp) != NULL) {
186 /* check for range line */
187 if (sscanf(line, sscanf_os, &lo, &hi) == 2) {
188 dprint(FD_IO, "%p-%p\n", lo, hi);
189 if (lo > raddr) {
190 if ((uintptr_t)(lo - raddr) >= len) {
191 dprint(FD_IO, "unused region of size "
192 "%zu found at %p\n",
193 lo - raddr, raddr);
194 break;
195 } else {
196 dprint(FD_IO, "region is too small: "
197 "%zu < %zu\n",
198 lo - raddr, len);
199 }
200 }
201
202 if (hi > raddr) {
203 raddr = (char *)roundup((uintptr_t)hi, align);
204 dprint(FD_IO, "nearest aligned addr %p\n",
205 raddr);
206 }
207
208 if (raddr == 0) {
209 dprint(FD_IO, "end of address space reached\n");
210 break;
211 }
212 }
213 }
214
215 /*
216 * Check for a case when this is the last unused range in the address
217 * space, but is not large enough. (very unlikely)
218 */
219 if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) {
220 dprint(FD_IO, "end of address space reached");
221 raddr = MAP_FAILED;
222 }
223
224 fclose(fp);
225
226 dprint(FD_IO, "returning %p", raddr);
227 return raddr;
228#endif
229}
230
231/*
232 * util_map_hint -- determine hint address for mmap()
233 *
234 * If PMEM_MMAP_HINT environment variable is not set, we let the system to pick
235 * the randomized mapping address. Otherwise, a user-defined hint address
236 * is used.
237 *
238 * Windows Environment:
239 * XXX - Windows doesn't support large DAX pages yet, so there is
240 * no point in aligning for the same.
241 *
242 * Except for Windows Environment:
243 * ALSR in 64-bit Linux kernel uses 28-bit of randomness for mmap
244 * (bit positions 12-39), which means the base mapping address is randomized
245 * within [0..1024GB] range, with 4KB granularity. Assuming additional
246 * 1GB alignment, it results in 1024 possible locations.
247 *
248 * Configuring the hint address via PMEM_MMAP_HINT environment variable
249 * disables address randomization. In such case, the function will search for
250 * the first unused, properly aligned region of given size, above the
251 * specified address.
252 */
597a6533 253static char *util_map_hint(size_t len, size_t req_align)
ae0db592
TI
254{
255 char *addr;
256 size_t align = 0;
257 char *e = NULL;
258
259 dprint(FD_IO, "DEBUG util_map_hint\n");
260 dprint(FD_IO, "len %zu req_align %zu\n", len, req_align);
261
262 /* choose the desired alignment based on the requested length */
263 align = util_map_hint_align(len, req_align);
264
265 e = getenv("PMEM_MMAP_HINT");
266 if (e) {
267 char *endp;
268 unsigned long long val = 0;
269
270 errno = 0;
271
272 val = strtoull(e, &endp, 16);
273 if (errno || endp == e) {
274 dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n");
275 } else {
276 Mmap_hint = (void *)val;
4f1957b7 277 Mmap_no_random = true;
ae0db592
TI
278 dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint);
279 }
280 }
281
282 if (Mmap_no_random) {
283 dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint);
284 addr = util_map_hint_unused((void *)Mmap_hint, len, align);
285 } else {
286 /*
287 * Create dummy mapping to find an unused region of given size.
288 * * Request for increased size for later address alignment.
289 *
290 * Windows Environment:
291 * Use MAP_NORESERVE flag to only reserve the range of pages
292 * rather than commit. We don't want the pages to be actually
293 * backed by the operating system paging file, as the swap
294 * file is usually too small to handle terabyte pools.
295 *
296 * Except for Windows Environment:
297 * Use MAP_PRIVATE with read-only access to simulate
298 * zero cost for overcommit accounting. Note: MAP_NORESERVE
299 * flag is ignored if overcommit is disabled (mode 2).
300 */
301#ifndef WIN32
302 addr = mmap(NULL, len + align, PROT_READ,
303 MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
304#else
305 addr = mmap(NULL, len + align, PROT_READ,
306 MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
307#endif
308 if (addr != MAP_FAILED) {
309 dprint(FD_IO, "system choice %p\n", addr);
310 munmap(addr, len + align);
311 addr = (char *)roundup((uintptr_t)addr, align);
312 }
313 }
314
315 dprint(FD_IO, "hint %p\n", addr);
316
317 return addr;
318}
319
320/*
321 * This is the mmap execution function
322 */
323static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
324 size_t length, off_t off)
325{
326 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
327 int flags = 0;
328 void *addr = NULL;
329
330 dprint(FD_IO, "DEBUG fio_libpmem_file\n");
331
332 if (td_rw(td))
333 flags = PROT_READ | PROT_WRITE;
334 else if (td_write(td)) {
335 flags = PROT_WRITE;
336
337 if (td->o.verify != VERIFY_NONE)
338 flags |= PROT_READ;
339 } else
340 flags = PROT_READ;
341
342 dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name,
343 td->o.verify);
344 dprint(FD_IO, "length = %ld flags = %d f->fd = %d off = %ld \n",
345 length, flags, f->fd,off);
346
347 addr = util_map_hint(length, 0);
348
349 fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
350 if (fdd->libpmem_ptr == MAP_FAILED) {
351 fdd->libpmem_ptr = NULL;
352 td_verror(td, errno, "mmap");
353 }
354
355 if (td->error && fdd->libpmem_ptr)
356 munmap(fdd->libpmem_ptr, length);
357
358 return td->error;
359}
360
361/*
362 * XXX Just mmap an appropriate portion, we cannot mmap the full extent
363 */
364static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u)
365{
366 struct fio_file *f = io_u->file;
367 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
368
369 dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n" );
370
371 if (io_u->buflen > f->real_file_size) {
372 log_err("libpmem: bs too big for libpmem engine\n");
373 return EIO;
374 }
375
376 fdd->libpmem_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
377 if (fdd->libpmem_sz > f->io_size)
378 fdd->libpmem_sz = f->io_size;
379
380 fdd->libpmem_off = io_u->offset;
381
382 return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
383}
384
385/*
386 * Attempt to mmap the entire file
387 */
388static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u)
389{
390 struct fio_file *f = io_u->file;
391 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
392 int ret;
393
394 dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n" );
395
396 if (fio_file_partial_mmap(f))
397 return EINVAL;
398
399 dprint(FD_IO," f->io_size %ld : io_u->offset %lld \n",
400 f->io_size, io_u->offset);
401
402 if (io_u->offset != (size_t) io_u->offset ||
597a6533 403 f->io_size != (size_t) f->io_size) {
ae0db592
TI
404 fio_file_set_partial_mmap(f);
405 return EINVAL;
406 }
407 fdd->libpmem_sz = f->io_size;
408 fdd->libpmem_off = 0;
409
410 ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
411 if (ret)
412 fio_file_set_partial_mmap(f);
413
414 return ret;
415}
416
417static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
418{
419 struct fio_file *f = io_u->file;
420 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
421 int ret;
422
423 dprint(FD_IO, "DEBUG fio_libpmem_prep\n" );
424 /*
425 * It fits within existing mapping, use it
426 */
427 dprint(FD_IO," io_u->offset %lld : fdd->libpmem_off %ld : "
428 "io_u->buflen %ld : fdd->libpmem_sz %ld\n",
429 io_u->offset, fdd->libpmem_off,
430 io_u->buflen, fdd->libpmem_sz);
431
432 if (io_u->offset >= fdd->libpmem_off &&
597a6533
JA
433 (io_u->offset + io_u->buflen <
434 fdd->libpmem_off + fdd->libpmem_sz))
ae0db592
TI
435 goto done;
436
437 /*
438 * unmap any existing mapping
439 */
440 if (fdd->libpmem_ptr) {
441 dprint(FD_IO,"munmap \n");
442 if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0)
443 return errno;
444 fdd->libpmem_ptr = NULL;
445 }
446
447 if (fio_libpmem_prep_full(td, io_u)) {
448 td_clear_error(td);
449 ret = fio_libpmem_prep_limited(td, io_u);
450 if (ret)
451 return ret;
452 }
453
454done:
455 io_u->mmap_data = fdd->libpmem_ptr + io_u->offset - fdd->libpmem_off
597a6533 456 - f->file_offset;
ae0db592
TI
457 return 0;
458}
459
460static int fio_libpmem_queue(struct thread_data *td, struct io_u *io_u)
461{
462 fio_ro_check(td, io_u);
463 io_u->error = 0;
464
465 dprint(FD_IO, "DEBUG fio_libpmem_queue\n");
466
467 switch (io_u->ddir) {
597a6533
JA
468 case DDIR_READ:
469 memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
470 break;
471 case DDIR_WRITE:
472 dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
473 io_u->mmap_data, io_u->xfer_buf );
474 dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
475 if (td->o.odirect) {
476 pmem_memcpy_persist(io_u->mmap_data,
ae0db592
TI
477 io_u->xfer_buf,
478 io_u->xfer_buflen);
597a6533
JA
479 } else {
480 pmem_memcpy_nodrain(io_u->mmap_data,
ae0db592
TI
481 io_u->xfer_buf,
482 io_u->xfer_buflen);
597a6533
JA
483 }
484 break;
485 case DDIR_SYNC:
486 case DDIR_DATASYNC:
487 case DDIR_SYNC_FILE_RANGE:
488 break;
489 default:
490 io_u->error = EINVAL;
491 break;
ae0db592
TI
492 }
493
494 return FIO_Q_COMPLETED;
495}
496
497static int fio_libpmem_init(struct thread_data *td)
498{
499 struct thread_options *o = &td->o;
500
501 dprint(FD_IO,"o->rw_min_bs %d \n o->fsync_blocks %d \n o->fdatasync_blocks %d \n",
502 o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks);
503 dprint(FD_IO, "DEBUG fio_libpmem_init\n");
504
505 if ((o->rw_min_bs & page_mask) &&
597a6533 506 (o->fsync_blocks || o->fdatasync_blocks)) {
ae0db592
TI
507 log_err("libpmem: mmap options dictate a minimum block size of "
508 "%llu bytes\n", (unsigned long long) page_size);
509 return 1;
510 }
511 return 0;
512}
513
514static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
515{
516 struct fio_libpmem_data *fdd;
517 int ret;
518
519 dprint(FD_IO,"DEBUG fio_libpmem_open_file\n");
520 dprint(FD_IO,"f->io_size=%ld \n",f->io_size);
521 dprint(FD_IO,"td->o.size=%lld \n",td->o.size);
522 dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth);
523 dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch);
524
525 ret = generic_open_file(td, f);
526 if (ret)
527 return ret;
528
529 fdd = calloc(1, sizeof(*fdd));
530 if (!fdd) {
531 int fio_unused __ret;
532 __ret = generic_close_file(td, f);
533 return 1;
534 }
535
536 FILE_SET_ENG_DATA(f, fdd);
537
538 return 0;
539}
540
541static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
542{
543 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
544
545 dprint(FD_IO,"DEBUG fio_libpmem_close_file\n");
546 dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
547
597a6533 548 if (!td->o.odirect) {
ae0db592
TI
549 dprint(FD_IO,"pmem_drain\n");
550 pmem_drain();
551 }
552
553 FILE_SET_ENG_DATA(f, NULL);
554 free(fdd);
555 fio_file_clear_partial_mmap(f);
556
557 return generic_close_file(td, f);
558}
559
560static struct ioengine_ops ioengine = {
597a6533
JA
561 .name = "libpmem",
562 .version = FIO_IOOPS_VERSION,
563 .init = fio_libpmem_init,
564 .prep = fio_libpmem_prep,
565 .queue = fio_libpmem_queue,
566 .open_file = fio_libpmem_open_file,
567 .close_file = fio_libpmem_close_file,
568 .get_file_size = generic_get_file_size,
569 .flags = FIO_SYNCIO |FIO_NOEXTEND,
ae0db592
TI
570};
571
572static void fio_init fio_libpmem_register(void)
573{
4f1957b7
JA
574#ifndef WIN32
575 Mmap_align = page_size;
576#else
577 if (Mmap_align == 0) {
578 SYSTEM_INFO si;
579
580 GetSystemInfo(&si);
581 Mmap_align = si.dwAllocationGranularity;
582 }
583#endif
584
ae0db592
TI
585 register_ioengine(&ioengine);
586}
587
588static void fio_exit fio_libpmem_unregister(void)
589{
590 unregister_ioengine(&ioengine);
591}