libpmem: code cleanups
[fio.git] / engines / libpmem.c
CommitLineData
ae0db592
TI
1/*
2 * libpmem: IO engine that uses NVML libpmem to read and write data
3 *
4 * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 */
16
17/*
18 * libpmem engine
19 *
20 * IO engine that uses libpmem to read and write data
21 *
22 * To use:
23 * ioengine=libpmem
24 *
25 * Other relevant settings:
26 * iodepth=1
27 * direct=1
28 * directory=/mnt/pmem0/
29 * bs=4k
30 *
31 * direct=1 means that pmem_drain() is executed for each write operation.
32 * In contrast, direct=0 means that pmem_drain() is not executed.
33 *
34 * The pmem device must have a DAX-capable filesystem and be mounted
35 * with DAX enabled. directory must point to a mount point of DAX FS.
36 *
37 * Example:
38 * mkfs.xfs /dev/pmem0
39 * mkdir /mnt/pmem0
40 * mount -o dax /dev/pmem0 /mnt/pmem0
41 *
42 *
43 * See examples/libpmem.fio for more.
44 *
45 *
46 * libpmem.so
47 * By default, the libpmem engine will let the system find the libpmem.so
48 * that it uses. You can use an alternative libpmem by setting the
49 * FIO_PMEM_LIB environment variable to the full path to the desired
50 * libpmem.so.
51 */
52
53#include <stdio.h>
54#include <limits.h>
55#include <stdlib.h>
56#include <unistd.h>
57#include <errno.h>
58#include <sys/mman.h>
59#include <sys/stat.h>
60#include <sys/sysmacros.h>
61#include <libgen.h>
62#include <libpmem.h>
63
64#include "../fio.h"
65#include "../verify.h"
66
/*
 * Limits us to 1GiB of mapped files in total to model after
 * libpmem engine behavior
 */
#define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL)

/* Per-file engine state: describes the current mmap()ed window into the file. */
struct fio_libpmem_data {
	void *libpmem_ptr;	/* base address of current mapping, NULL if unmapped */
	size_t libpmem_sz;	/* length of current mapping in bytes */
	off_t libpmem_off;	/* file offset at which the mapping starts */
};

#define MEGABYTE ((uintptr_t)1 << 20)
#define GIGABYTE ((uintptr_t)1 << 30)
#define PROCMAXLEN 2048 /* maximum expected line length in /proc files */
/* round 'x' up to the nearest multiple of 'y'; 'y' must be non-zero */
#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

static int Mmap_no_random;		/* set when PMEM_MMAP_HINT pins the hint address */
static void *Mmap_hint;			/* user-supplied hint from PMEM_MMAP_HINT */
static unsigned long long Mmap_align;	/* mmap() alignment granularity */
static unsigned long long Pagesize = 0;	/* system page size, fetched via sysconf() */
88
89/*
90 * util_map_hint_align -- choose the desired mapping alignment
91 *
92 * Use 2MB/1GB page alignment only if the mapping length is at least
93 * twice as big as the page size.
94 */
597a6533 95static inline size_t util_map_hint_align(size_t len, size_t req_align)
ae0db592
TI
96{
97 size_t align = 0;
98
99 dprint(FD_IO, "DEBUG util_map_hint_align\n" );
100#ifndef WIN32
101 Mmap_align = Pagesize;
102#else
103 if (Mmap_align == 0) {
104 SYSTEM_INFO si;
105 GetSystemInfo(&si);
106 Mmap_align = si.dwAllocationGranularity;
107 }
108#endif
109
110 align = Mmap_align;
111
112 if (req_align)
113 align = req_align;
114 else if (len >= 2 * GIGABYTE)
115 align = GIGABYTE;
116 else if (len >= 4 * MEGABYTE)
117 align = 2 * MEGABYTE;
118
119 dprint(FD_IO, "align=%d\n", (int)align);
120 return align;
121}
122
#ifdef __FreeBSD__
/* FreeBSD's map file separates the range with a space */
static const char *sscanf_os = "%p %p";
#define MAP_NORESERVE 0			/* not supported on FreeBSD; make it a no-op */
#define OS_MAPFILE "/proc/curproc/map"
#else
/* Linux /proc/self/maps uses the "lo-hi" form */
static const char *sscanf_os = "%p-%p";
#define OS_MAPFILE "/proc/self/maps"
#endif
131
/*
 * util_map_hint_unused -- use /proc to determine a hint address for mmap()
 *
 * This is a helper function for util_map_hint().
 * It opens up /proc/self/maps and looks for the first unused address
 * in the process address space that is:
 * - greater or equal 'minaddr' argument,
 * - large enough to hold range of given length,
 * - aligned to the specified unit.
 *
 * Asking for aligned address like this will allow the DAX code to use large
 * mappings. It is not an error if mmap() ignores the hint and chooses
 * different address.
 *
 * Returns the hint address, or MAP_FAILED if no suitable gap was found.
 */
static char *util_map_hint_unused(void *minaddr, size_t len, size_t align)
{
	char *lo = NULL;	/* beginning of current range in maps file */
	char *hi = NULL;	/* end of current range in maps file */
	char *raddr = minaddr;	/* ignore regions below 'minaddr' */

#ifdef WIN32
	MEMORY_BASIC_INFORMATION mi;
#else
	FILE *fp;
	char line[PROCMAXLEN];	/* for fgets() */
#endif

	dprint(FD_IO, "DEBUG util_map_hint_unused\n");
	assert(align > 0);

	/* XXX - replace sysconf() with util_get_sys_xxx() */
	Pagesize = (unsigned long) sysconf(_SC_PAGESIZE);

	/* never propose the zero page; start searching one page up */
	if (raddr == NULL)
		raddr += Pagesize;

	raddr = (char *)roundup((uintptr_t)raddr, align);

#ifdef WIN32
	/* walk the address space with VirtualQuery until a free region */
	/* of at least 'len' bytes is found */
	while ((uintptr_t)raddr < UINTPTR_MAX - len) {
		size_t ret = VirtualQuery(raddr, &mi, sizeof(mi));
		if (ret == 0) {
			ERR("VirtualQuery %p", raddr);
			return MAP_FAILED;
		}
		dprint(FD_IO, "addr %p len %zu state %d",
				mi.BaseAddress, mi.RegionSize, mi.State);

		if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) {
			/* region in use or too small; skip past it */
			raddr = (char *)mi.BaseAddress + mi.RegionSize;
			raddr = (char *)roundup((uintptr_t)raddr, align);
			dprint(FD_IO, "nearest aligned addr %p", raddr);
		} else {
			dprint(FD_IO, "unused region of size %zu found at %p",
					mi.RegionSize, mi.BaseAddress);
			return mi.BaseAddress;
		}
	}

	dprint(FD_IO, "end of address space reached");
	return MAP_FAILED;
#else
	fp = fopen(OS_MAPFILE, "r");
	if (!fp) {
		log_err("!%s\n", OS_MAPFILE);
		return MAP_FAILED;
	}

	/*
	 * Ranges in the maps file are sorted ascending; scan them looking
	 * for a gap of at least 'len' bytes at or above 'raddr'.
	 */
	while (fgets(line, PROCMAXLEN, fp) != NULL) {
		/* check for range line */
		if (sscanf(line, sscanf_os, &lo, &hi) == 2) {
			dprint(FD_IO, "%p-%p\n", lo, hi);
			if (lo > raddr) {
				if ((uintptr_t)(lo - raddr) >= len) {
					dprint(FD_IO, "unused region of size "
							"%zu found at %p\n",
							lo - raddr, raddr);
					break;
				} else {
					dprint(FD_IO, "region is too small: "
							"%zu < %zu\n",
							lo - raddr, len);
				}
			}

			if (hi > raddr) {
				/* candidate moves past this mapping's end */
				raddr = (char *)roundup((uintptr_t)hi, align);
				dprint(FD_IO, "nearest aligned addr %p\n",
						raddr);
			}

			if (raddr == 0) {
				/* roundup wrapped: out of address space */
				dprint(FD_IO, "end of address space reached\n");
				break;
			}
		}
	}

	/*
	 * Check for a case when this is the last unused range in the address
	 * space, but is not large enough. (very unlikely)
	 */
	if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) {
		dprint(FD_IO, "end of address space reached");
		raddr = MAP_FAILED;
	}

	fclose(fp);

	dprint(FD_IO, "returning %p", raddr);
	return raddr;
#endif
}
245
246/*
247 * util_map_hint -- determine hint address for mmap()
248 *
249 * If PMEM_MMAP_HINT environment variable is not set, we let the system to pick
250 * the randomized mapping address. Otherwise, a user-defined hint address
251 * is used.
252 *
253 * Windows Environment:
254 * XXX - Windows doesn't support large DAX pages yet, so there is
255 * no point in aligning for the same.
256 *
257 * Except for Windows Environment:
258 * ALSR in 64-bit Linux kernel uses 28-bit of randomness for mmap
259 * (bit positions 12-39), which means the base mapping address is randomized
260 * within [0..1024GB] range, with 4KB granularity. Assuming additional
261 * 1GB alignment, it results in 1024 possible locations.
262 *
263 * Configuring the hint address via PMEM_MMAP_HINT environment variable
264 * disables address randomization. In such case, the function will search for
265 * the first unused, properly aligned region of given size, above the
266 * specified address.
267 */
597a6533 268static char *util_map_hint(size_t len, size_t req_align)
ae0db592
TI
269{
270 char *addr;
271 size_t align = 0;
272 char *e = NULL;
273
274 dprint(FD_IO, "DEBUG util_map_hint\n");
275 dprint(FD_IO, "len %zu req_align %zu\n", len, req_align);
276
277 /* choose the desired alignment based on the requested length */
278 align = util_map_hint_align(len, req_align);
279
280 e = getenv("PMEM_MMAP_HINT");
281 if (e) {
282 char *endp;
283 unsigned long long val = 0;
284
285 errno = 0;
286
287 val = strtoull(e, &endp, 16);
288 if (errno || endp == e) {
289 dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n");
290 } else {
291 Mmap_hint = (void *)val;
292 Mmap_no_random = 1;
293 dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint);
294 }
295 }
296
297 if (Mmap_no_random) {
298 dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint);
299 addr = util_map_hint_unused((void *)Mmap_hint, len, align);
300 } else {
301 /*
302 * Create dummy mapping to find an unused region of given size.
303 * * Request for increased size for later address alignment.
304 *
305 * Windows Environment:
306 * Use MAP_NORESERVE flag to only reserve the range of pages
307 * rather than commit. We don't want the pages to be actually
308 * backed by the operating system paging file, as the swap
309 * file is usually too small to handle terabyte pools.
310 *
311 * Except for Windows Environment:
312 * Use MAP_PRIVATE with read-only access to simulate
313 * zero cost for overcommit accounting. Note: MAP_NORESERVE
314 * flag is ignored if overcommit is disabled (mode 2).
315 */
316#ifndef WIN32
317 addr = mmap(NULL, len + align, PROT_READ,
318 MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
319#else
320 addr = mmap(NULL, len + align, PROT_READ,
321 MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
322#endif
323 if (addr != MAP_FAILED) {
324 dprint(FD_IO, "system choice %p\n", addr);
325 munmap(addr, len + align);
326 addr = (char *)roundup((uintptr_t)addr, align);
327 }
328 }
329
330 dprint(FD_IO, "hint %p\n", addr);
331
332 return addr;
333}
334
335/*
336 * This is the mmap execution function
337 */
338static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
339 size_t length, off_t off)
340{
341 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
342 int flags = 0;
343 void *addr = NULL;
344
345 dprint(FD_IO, "DEBUG fio_libpmem_file\n");
346
347 if (td_rw(td))
348 flags = PROT_READ | PROT_WRITE;
349 else if (td_write(td)) {
350 flags = PROT_WRITE;
351
352 if (td->o.verify != VERIFY_NONE)
353 flags |= PROT_READ;
354 } else
355 flags = PROT_READ;
356
357 dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name,
358 td->o.verify);
359 dprint(FD_IO, "length = %ld flags = %d f->fd = %d off = %ld \n",
360 length, flags, f->fd,off);
361
362 addr = util_map_hint(length, 0);
363
364 fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
365 if (fdd->libpmem_ptr == MAP_FAILED) {
366 fdd->libpmem_ptr = NULL;
367 td_verror(td, errno, "mmap");
368 }
369
370 if (td->error && fdd->libpmem_ptr)
371 munmap(fdd->libpmem_ptr, length);
372
373 return td->error;
374}
375
/*
 * XXX Just mmap an appropriate portion, we cannot mmap the full extent
 *
 * Maps a window of at most MMAP_TOTAL_SZ bytes starting at the current
 * IO's offset. Returns 0 on success, an errno value on failure.
 */
static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);

	dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n" );

	/* a single IO larger than the whole file can never be mapped */
	if (io_u->buflen > f->real_file_size) {
		log_err("libpmem: bs too big for libpmem engine\n");
		return EIO;
	}

	/* clamp the window to MMAP_TOTAL_SZ, the file size and the io size */
	fdd->libpmem_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
	if (fdd->libpmem_sz > f->io_size)
		fdd->libpmem_sz = f->io_size;

	/* the window starts at the current IO's offset */
	fdd->libpmem_off = io_u->offset;

	return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
}
399
/*
 * Attempt to mmap the entire file
 *
 * Returns 0 on success; EINVAL (and marks the file for partial mapping)
 * when a full map is not possible.
 */
static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int ret;

	dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n" );

	/* a previous failure already told us a full map won't work */
	if (fio_file_partial_mmap(f))
		return EINVAL;

	dprint(FD_IO," f->io_size %ld : io_u->offset %lld \n",
			f->io_size, io_u->offset);

	/* refuse if offset or size don't fit in size_t (32-bit builds) */
	if (io_u->offset != (size_t) io_u->offset ||
	    f->io_size != (size_t) f->io_size) {
		fio_file_set_partial_mmap(f);
		return EINVAL;
	}

	/* map the whole io region, starting at file offset 0 */
	fdd->libpmem_sz = f->io_size;
	fdd->libpmem_off = 0;

	ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
	if (ret)
		fio_file_set_partial_mmap(f);

	return ret;
}
431
432static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
433{
434 struct fio_file *f = io_u->file;
435 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
436 int ret;
437
438 dprint(FD_IO, "DEBUG fio_libpmem_prep\n" );
439 /*
440 * It fits within existing mapping, use it
441 */
442 dprint(FD_IO," io_u->offset %lld : fdd->libpmem_off %ld : "
443 "io_u->buflen %ld : fdd->libpmem_sz %ld\n",
444 io_u->offset, fdd->libpmem_off,
445 io_u->buflen, fdd->libpmem_sz);
446
447 if (io_u->offset >= fdd->libpmem_off &&
597a6533
JA
448 (io_u->offset + io_u->buflen <
449 fdd->libpmem_off + fdd->libpmem_sz))
ae0db592
TI
450 goto done;
451
452 /*
453 * unmap any existing mapping
454 */
455 if (fdd->libpmem_ptr) {
456 dprint(FD_IO,"munmap \n");
457 if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0)
458 return errno;
459 fdd->libpmem_ptr = NULL;
460 }
461
462 if (fio_libpmem_prep_full(td, io_u)) {
463 td_clear_error(td);
464 ret = fio_libpmem_prep_limited(td, io_u);
465 if (ret)
466 return ret;
467 }
468
469done:
470 io_u->mmap_data = fdd->libpmem_ptr + io_u->offset - fdd->libpmem_off
597a6533 471 - f->file_offset;
ae0db592
TI
472 return 0;
473}
474
475static int fio_libpmem_queue(struct thread_data *td, struct io_u *io_u)
476{
477 fio_ro_check(td, io_u);
478 io_u->error = 0;
479
480 dprint(FD_IO, "DEBUG fio_libpmem_queue\n");
481
482 switch (io_u->ddir) {
597a6533
JA
483 case DDIR_READ:
484 memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
485 break;
486 case DDIR_WRITE:
487 dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
488 io_u->mmap_data, io_u->xfer_buf );
489 dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
490 if (td->o.odirect) {
491 pmem_memcpy_persist(io_u->mmap_data,
ae0db592
TI
492 io_u->xfer_buf,
493 io_u->xfer_buflen);
597a6533
JA
494 } else {
495 pmem_memcpy_nodrain(io_u->mmap_data,
ae0db592
TI
496 io_u->xfer_buf,
497 io_u->xfer_buflen);
597a6533
JA
498 }
499 break;
500 case DDIR_SYNC:
501 case DDIR_DATASYNC:
502 case DDIR_SYNC_FILE_RANGE:
503 break;
504 default:
505 io_u->error = EINVAL;
506 break;
ae0db592
TI
507 }
508
509 return FIO_Q_COMPLETED;
510}
511
512static int fio_libpmem_init(struct thread_data *td)
513{
514 struct thread_options *o = &td->o;
515
516 dprint(FD_IO,"o->rw_min_bs %d \n o->fsync_blocks %d \n o->fdatasync_blocks %d \n",
517 o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks);
518 dprint(FD_IO, "DEBUG fio_libpmem_init\n");
519
520 if ((o->rw_min_bs & page_mask) &&
597a6533 521 (o->fsync_blocks || o->fdatasync_blocks)) {
ae0db592
TI
522 log_err("libpmem: mmap options dictate a minimum block size of "
523 "%llu bytes\n", (unsigned long long) page_size);
524 return 1;
525 }
526 return 0;
527}
528
529static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
530{
531 struct fio_libpmem_data *fdd;
532 int ret;
533
534 dprint(FD_IO,"DEBUG fio_libpmem_open_file\n");
535 dprint(FD_IO,"f->io_size=%ld \n",f->io_size);
536 dprint(FD_IO,"td->o.size=%lld \n",td->o.size);
537 dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth);
538 dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch);
539
540 ret = generic_open_file(td, f);
541 if (ret)
542 return ret;
543
544 fdd = calloc(1, sizeof(*fdd));
545 if (!fdd) {
546 int fio_unused __ret;
547 __ret = generic_close_file(td, f);
548 return 1;
549 }
550
551 FILE_SET_ENG_DATA(f, fdd);
552
553 return 0;
554}
555
556static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
557{
558 struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
559
560 dprint(FD_IO,"DEBUG fio_libpmem_close_file\n");
561 dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
562
597a6533 563 if (!td->o.odirect) {
ae0db592
TI
564 dprint(FD_IO,"pmem_drain\n");
565 pmem_drain();
566 }
567
568 FILE_SET_ENG_DATA(f, NULL);
569 free(fdd);
570 fio_file_clear_partial_mmap(f);
571
572 return generic_close_file(td, f);
573}
574
/*
 * Engine entry points. FIO_SYNCIO: queue() completes each IO inline;
 * FIO_NOEXTEND: files cannot be extended by this engine.
 */
static struct ioengine_ops ioengine = {
	.name = "libpmem",
	.version = FIO_IOOPS_VERSION,
	.init = fio_libpmem_init,
	.prep = fio_libpmem_prep,
	.queue = fio_libpmem_queue,
	.open_file = fio_libpmem_open_file,
	.close_file = fio_libpmem_close_file,
	.get_file_size = generic_get_file_size,
	.flags = FIO_SYNCIO |FIO_NOEXTEND,
};
586
/* Constructor: register this engine with fio at program load. */
static void fio_init fio_libpmem_register(void)
{
	register_ioengine(&ioengine);
}
591
/* Destructor: unregister the engine at program exit. */
static void fio_exit fio_libpmem_unregister(void)
{
	unregister_ioengine(&ioengine);
}