From ebac4655dd3624f3296ff83be48e0cdc02852f18 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Dec 2005 15:25:21 +0100 Subject: [PATCH] Move fio to seperate repo --- .gitignore | 3 + COPYING | 340 ++++++ Makefile | 27 + README | 203 ++++ arch-alpha.h | 18 + arch-ia64.h | 29 + arch-ppc.h | 32 + arch-s390.h | 18 + arch-x86.h | 23 + arch-x86_64.h | 24 + arch.h | 42 + crc32.c | 84 ++ crc32.h | 23 + examples/1mbs_clients | 196 ++++ examples/aio-read | 17 + examples/tiobench-example | 24 + fio-ini.c | 934 ++++++++++++++++ fio-io.c | 603 ++++++++++ fio.c | 2192 +++++++++++++++++++++++++++++++++++++ fio.h | 312 ++++++ fio_generate_plots | 61 ++ list.h | 134 +++ md5.c | 118 ++ md5.h | 27 + os-freebsd.h | 23 + os-linux.h | 53 + os.h | 49 + 27 files changed, 5609 insertions(+) create mode 100644 .gitignore create mode 100644 COPYING create mode 100644 Makefile create mode 100644 README create mode 100644 arch-alpha.h create mode 100644 arch-ia64.h create mode 100644 arch-ppc.h create mode 100644 arch-s390.h create mode 100644 arch-x86.h create mode 100644 arch-x86_64.h create mode 100644 arch.h create mode 100644 crc32.c create mode 100644 crc32.h create mode 100644 examples/1mbs_clients create mode 100644 examples/aio-read create mode 100644 examples/tiobench-example create mode 100644 fio-ini.c create mode 100644 fio-io.c create mode 100644 fio.c create mode 100644 fio.h create mode 100755 fio_generate_plots create mode 100644 list.h create mode 100644 md5.c create mode 100644 md5.h create mode 100644 os-freebsd.h create mode 100644 os-linux.h create mode 100644 os.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..e00b2f8d --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +fio +*.o +.depend diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..5b6e7c66 --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..091b8589 --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +CC = gcc +CFLAGS = -Wall -O2 -g -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 +PROGS = fio +SCRIPTS = fio_generate_plots + +all: depend $(PROGS) $(SCRIPTS) + +fio: fio.o fio-io.o fio-ini.o md5.o crc32.o + $(CC) $(CFLAGS) -o $@ $(filter %.o,$^) -lpthread -laio -lm -lrt + +clean: + -rm -f *.o .depend $(PROGS) + +depend: + @$(CC) -MM $(CFLAGS) *.c 1> .depend + +INSTALL = install +prefix = /usr/local +bindir = $(prefix)/bin + +install: $(PROGS) $(SCRIPTS) + $(INSTALL) -m755 -d $(DESTDIR)$(bindir) + $(INSTALL) $(PROGS) $(SCRIPTS) $(DESTDIR)$(bindir) + +ifneq ($(wildcard .depend),) +include .depend +endif diff --git a/README b/README new file mode 100644 index 00000000..afef8ec5 --- /dev/null +++ b/README @@ -0,0 +1,203 @@ +fio +--- + +fio is a tool that will spawn a number of threads doing a particular +type of io action as specified by the user.
fio takes a number of +global parameters, each inherited by a thread unless a parameter +given to that thread overrides the setting. + +Options +------- + +$ fio + -s IO is sequential + -b block size in KiB for each io + -t Runtime in seconds + -r For random io, sequence must be repeatable + -R If one thread fails to meet rate, quit all + -o Use direct IO if 1, buffered if 0 + -l Generate per-job latency logs + -w Generate per-job bandwidth logs + -f Read job descriptions from the given file + -v Print version information and exit + +The job file format is as follows: + + directory=x Use 'x' as the top level directory for storing files + rw=x 'x' may be: read, randread, write, or randwrite + size=x Set file size to x bytes (x string can include k/m/g) + ioengine=x 'x' may be: aio/libaio/linuxaio for Linux aio, + posixaio for POSIX aio, sync for regular read/write io, + mmap for mmap'ed io, or sgio for direct SG_IO io. The + latter only works on Linux on SCSI (or SCSI-like, + such as usb-storage or sata/libata driven) + devices. + iodepth=x For async io, allow 'x' ios in flight + overwrite=x If 'x', lay out a write file first. + prio=x Run io at prio X, 0-7 is the kernel allowed range + prioclass=x Run io at prio class X + bs=x Use 'x' for thread blocksize. May include k/m postfix. + bsrange=x-y Mix thread block sizes randomly between x and y. May + also include k/m postfix. + direct=x 1 for direct IO, 0 for buffered IO + thinktime=x "Think" x usec after each io + rate=x Throttle rate to x KiB/sec + ratemin=x Quit if rate of x KiB/sec can't be met + ratecycle=x ratemin averaged over x msecs + cpumask=x Only allow job to run on CPUs defined by mask.
+ fsync=x If writing, fsync after every x blocks have been written + startdelay=x Start this thread x seconds after startup + timeout=x Terminate x seconds after startup + offset=x Start io at offset x (x string can include k/m/g) + invalidate=x Invalidate page cache for file prior to doing io + sync=x Use sync writes if x and writing + mem=x If x == malloc, use malloc for buffers. If x == shm, + use shm for buffers. If x == mmap, use anon mmap. + exitall When one thread quits, terminate the others + bwavgtime=x Average bandwidth stats over an x msec window. + create_serialize=x If 'x', serialize file creation. + create_fsync=x If 'x', run fsync() after file creation. + loops=x Run the job 'x' number of times. + verify=x If 'x' == md5, use md5 for verifies. If 'x' == crc32, + use crc32 for verifies. md5 is 'safer', but crc32 is + a lot faster. Only makes sense for writing to a file. + stonewall Wait for preceding jobs to end before running. + numjobs=x Create 'x' similar entries for this job + thread Use pthreads instead of forked jobs + + +Examples using a job file +------------------------- + +A sample job file doing the same as above would look like this: + +[read_file] +rw=0 +bs=4096 + +[write_file] +rw=1 +bs=16384 + +And fio would be invoked as: + +$ fio -o1 -s -f file_with_above + +The second example would look like this: + +[rf1] +rw=0 +prio=6 + +[rf2] +rw=0 +prio=3 + +[rf3] +rw=0 +prio=0 +direct=1 + +And fio would be invoked as: + +$ fio -o0 -s -b4096 -f file_with_above + +'global' is a reserved keyword. When used as the filename, it sets the +default options for the threads following that section. It is possible +to have more than one global section in the file, as it only affects +subsequent jobs. + +Also see the examples/ dir for sample job files. + + +Interpreting the output +----------------------- + +fio spits out a lot of output. While running, fio will display the +status of the jobs created.
An example of that would be: + +Threads now running: 2 : [ww] [5.73% done] + +The characters inside the square brackets denote the current status of +each thread. The possible values (in typical life cycle order) are: + +Idle Run +---- --- +P Thread setup, but not started. +C Thread created and running, but not doing anything yet + R Running, doing sequential reads. + r Running, doing random reads. + W Running, doing sequential writes. + w Running, doing random writes. +V Running, doing verification of written data. +E Thread exited, not reaped by main thread yet. +_ Thread reaped. + +The other values are fairly self-explanatory - number of threads currently +running and doing io, and the estimated completion percentage. + +When fio is done (or interrupted by ctrl-c), it will show the data for +each thread, group of threads, and disks in that order. For each data +direction, the output looks like: + +Client1 (g=0): err= 0: + write: io= 32MiB, bw= 666KiB/s, runt= 50320msec + slat (msec): min= 0, max= 136, avg= 0.03, dev= 1.92 + clat (msec): min= 0, max= 631, avg=48.50, dev=86.82 + bw (KiB/s) : min= 0, max= 1196, per=51.00%, avg=664.02, dev=681.68 + cpu : usr=1.49%, sys=0.25%, ctx=7969 + +The client number is printed, along with the group id and error of that +thread. Below are the io statistics, here for writes. In the order listed, +they denote: + +io= Number of megabytes of io performed +bw= Average bandwidth rate +runt= The runtime of that thread + slat= Submission latency (avg being the average, dev being the + standard deviation). This is the time it took to submit + the io. For sync io, the slat is really the completion + latency, since queue/complete is one operation there. + clat= Completion latency. Same names as slat, this denotes the + time from submission to completion of the io pieces.
For + sync io, clat will usually be equal (or very close) to 0, + as the time from submit to complete is basically just + CPU time (io has already been done, see slat explanation). + bw= Bandwidth. Same names as the slat/clat stats, but also includes + an approximate percentage of total aggregate bandwidth + this thread received in this group. This last value is + only really useful if the threads in this group are on the + same disk, since they are then competing for disk access. +cpu= CPU usage. User and system time, along with the number + of context switches this thread went through. + +After each client has been listed, the group statistics are printed. They +will look like this: + +Run status group 0 (all jobs): + READ: io=64MiB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec + WRITE: io=64MiB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec + +For each data direction, it prints: + +io= Number of megabytes of io performed. +aggrb= Aggregate bandwidth of threads in this group. +minb= The minimum average bandwidth a thread saw. +maxb= The maximum average bandwidth a thread saw. +mint= The minimum runtime of a thread. +maxt= The maximum runtime of a thread. + +And finally, the disk statistics are printed. They will look like this: + +Disk stats (read/write): + sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + +Each value is printed for both reads and writes, with reads first. The +numbers denote: + +ios= Number of ios performed by all groups. +merge= Number of merges in the io scheduler. +ticks= Number of ticks we kept the disk busy. +in_queue= Total time spent in the disk queue. +util= The disk utilization. A value of 100% means we kept the disk + busy constantly, 50% would be a disk idling half of the time.
diff --git a/arch-alpha.h b/arch-alpha.h new file mode 100644 index 00000000..09293bd3 --- /dev/null +++ b/arch-alpha.h @@ -0,0 +1,18 @@ +#ifndef ARCH_ALPHA_H +#define ARCH_ALPHA_H + +#define ARCH (arch_alpha) + +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 442 +#define __NR_ioprio_get 443 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 413 +#endif + +#define nop do { } while (0) +#define ffz(v) generic_ffz((v)) + +#endif diff --git a/arch-ia64.h b/arch-ia64.h new file mode 100644 index 00000000..c9c05085 --- /dev/null +++ b/arch-ia64.h @@ -0,0 +1,29 @@ +#ifndef ARCH_IA64_H +#define ARCH_IA64_H + +#define ARCH (arch_ia64) + +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 1274 +#define __NR_ioprio_get 1275 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 1234 +#endif + +#define nop asm volatile ("hint @pause" ::: "memory") + +#define ia64_popcnt(x) \ +({ \ + unsigned long ia64_intri_res; \ + asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x)); \ + ia64_intri_res; \ +}) + +static inline unsigned long ffz(unsigned long bitmask) +{ + return ia64_popcnt(bitmask & (~bitmask - 1)); +} + +#endif diff --git a/arch-ppc.h b/arch-ppc.h new file mode 100644 index 00000000..e16f99ba --- /dev/null +++ b/arch-ppc.h @@ -0,0 +1,32 @@ +#ifndef ARCH_PPC_H +#define ARCH_PPC_H + +#define ARCH (arch_ppc) + +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 273 +#define __NR_ioprio_get 274 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 233 +#endif + +#define nop do { } while (0) + +static inline int __ilog2(unsigned long bitmask) +{ + int lz; + + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (bitmask)); + return 31 - lz; +} + +static inline int ffz(unsigned long bitmask) +{ + if ((bitmask = ~bitmask) == 0) + return 32; + return __ilog2(bitmask & -bitmask); +} + +#endif diff --git a/arch-s390.h b/arch-s390.h new file mode 100644 index 00000000..b7048ad6 --- /dev/null +++ b/arch-s390.h @@ -0,0 +1,18 @@ +#ifndef ARCH_S390_H +#define ARCH_S390_H + +#define ARCH
(arch_s390) + +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 282 +#define __NR_ioprio_get 283 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 253 +#endif + +#define nop asm volatile ("diag 0,0,68" : : : "memory") +#define ffz(v) generic_ffz((v)) + +#endif diff --git a/arch-x86.h b/arch-x86.h new file mode 100644 index 00000000..4e74c0c3 --- /dev/null +++ b/arch-x86.h @@ -0,0 +1,23 @@ +#ifndef ARCH_X86_H +#define ARCH_X86_H + +#define ARCH (arch_i386) + +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 289 +#define __NR_ioprio_get 290 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 250 +#endif + +#define nop __asm__ __volatile__("rep;nop": : :"memory") + +static inline unsigned long ffz(unsigned long bitmask) +{ + __asm__("bsfl %1,%0" :"=r" (bitmask) :"r" (~bitmask)); + return bitmask; +} + +#endif diff --git a/arch-x86_64.h b/arch-x86_64.h new file mode 100644 index 00000000..cca66f68 --- /dev/null +++ b/arch-x86_64.h @@ -0,0 +1,24 @@ +#ifndef ARCH_X86_64_H +#define ARCH_X86_64_H + +#define ARCH (arch_x86_64) + +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 251 +#define __NR_ioprio_get 252 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 221 +#endif + +#define nop __asm__ __volatile__("rep;nop": : :"memory") + +static inline unsigned long ffz(unsigned long bitmask) +{ + __asm__("bsfq %1,%0" :"=r" (bitmask) :"r" (~bitmask)); + return bitmask; +} + + +#endif diff --git a/arch.h b/arch.h new file mode 100644 index 00000000..745bf3a4 --- /dev/null +++ b/arch.h @@ -0,0 +1,42 @@ +#ifndef ARCH_H +#define ARCH_H + +enum { + arch_x86_64, + arch_i386, + arch_ppc, + arch_ia64, + arch_s390, + arch_alpha, +}; + +static inline unsigned long generic_ffz(unsigned long word) +{ + unsigned int i; + + for (i = 0; i < sizeof(word) * 8; i++) + if ((word & (1UL << i)) == 0) + return i; + + return -1; +} + +#if defined(__i386__) +#include "arch-x86.h" +#elif defined(__x86_64__) +#include "arch-x86_64.h" +#elif defined(__powerpc__) ||
defined(__powerpc64__) +#include "arch-ppc.h" +#elif defined(__ia64__) +#include "arch-ia64.h" +#elif defined(__alpha__) +#include "arch-alpha.h" +#elif defined(__s390x__) || defined(__s390__) +#include "arch-s390.h" +#else +#error "Unsupported arch" +#endif + +#define BITS_PER_LONG (__WORDSIZE) + +#endif diff --git a/crc32.c b/crc32.c new file mode 100644 index 00000000..ba6cc065 --- /dev/null +++ b/crc32.c @@ -0,0 +1,84 @@ +/* crc32 -- calculate a POSIX.2 checksum + Copyright (C) 92, 1995-1999 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ + +#include "crc32.h" + +static const unsigned long crctab[256] = { + 0x0, + 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B, + 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, + 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD, + 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC, + 0x5BD4B01B, 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F, + 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A, + 0x745E66CD, 0x9823B6E0, 0x9CE2AB57, 0x91A18D8E, 0x95609039, + 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58, + 0xBAEA46EF, 0xB7A96036, 0xB3687D81, 0xAD2F2D84, 0xA9EE3033, + 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE, + 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, 0xCA753D95, + 0xF23A8028, 0xF6FB9D9F, 0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4, + 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0, + 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5, + 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16, + 0x018AEB13, 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07, + 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C, + 0x6211E6B5, 0x66D0FB02, 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1, + 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA, + 0xACA5C697, 0xA864DB20, 0xA527FDF9, 0xA1E6E04E, 0xBFA1B04B, + 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698, + 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, 0x902B669D, + 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E, + 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F, + 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34, + 0xDC3ABDED, 0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80, + 0x644FC637, 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB, + 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A, + 0x58C1663D, 0x558240E4, 0x51435D53, 0x251D3B9E, 0x21DC2629, + 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C, + 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, 0x0A97ED48, 0x0E56F0FF, 
+  0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E,
+  0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, 0xE6EA3D65,
+  0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8,
+  0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3,
+  0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2,
+  0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71,
+  0x92B45BA8, 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74,
+  0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640,
+  0x4E8EE645, 0x4A4FFBF2, 0x470CDD2B, 0x43CDC09C, 0x7B827D21,
+  0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A,
+  0x61043093, 0x65C52D24, 0x119B4BE9, 0x155A565E, 0x18197087,
+  0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC,
+  0x3793A651, 0x3352BBE6, 0x3E119D3F, 0x3AD08088, 0x2497D08D,
+  0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE,
+  0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB,
+  0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18,
+  0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09,
+  0x8D79E0BE, 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662,
+  0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF,
+  0xA2F33668, 0xBCB4666D, 0xB8757BDA, 0xB5365D03, 0xB1F740B4
+};
+/* table-driven checksum of length bytes at buffer, seed 0; kept to 32 bits so 32- and 64-bit builds agree */
+unsigned long crc32(const void *buffer, unsigned long length)
+{
+	const unsigned char *cp = (const unsigned char *) buffer;
+	unsigned long crc = 0;
+
+	while (length--)
+		crc = ((crc << 8) ^ crctab[((crc >> 24) ^ *(cp++)) & 0xFF]) & 0xFFFFFFFFUL;
+
+	return crc;
+}
diff --git a/crc32.h b/crc32.h
new file mode 100644
index 00000000..b7e5eee1
--- /dev/null
+++ b/crc32.h
@@ -0,0 +1,23 @@
+/* crc32 -- calculate and POSIX.2 checksum
+   Copyright (C) 92, 1995-1999 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#ifndef CRC32_H +#define CRC32_H + +extern unsigned long crc32(const void * const, unsigned long); + +#endif diff --git a/examples/1mbs_clients b/examples/1mbs_clients new file mode 100644 index 00000000..f9a228ee --- /dev/null +++ b/examples/1mbs_clients @@ -0,0 +1,196 @@ +; Keep adding 1024kb/s reading clients at 4 seconds +[global] +size=32m + +[/tmp/file1] +rw=read +rate=1250 +ratemin=1024 +startdelay=0 + +[/tmp/file2] +rw=read +rate=1250 +ratemin=1024 +startdelay=4 + +[/tmp/file3] +rw=read +rate=1250 +ratemin=1024 +startdelay=8 + +[/tmp/file4] +rw=read +rate=1250 +ratemin=1024 +startdelay=12 + +[/tmp/file5] +rw=read +rate=1250 +ratemin=1024 +startdelay=16 + +[/tmp/file6] +rw=read +rate=1250 +ratemin=1024 +startdelay=20 + +[/tmp/file7] +rw=read +rate=1250 +ratemin=1024 +startdelay=24 + +[/tmp/file8] +rw=read +rate=1250 +ratemin=1024 +startdelay=28 + +[/tmp/file9] +rw=read +rate=1250 +ratemin=1024 +startdelay=32 + +[/tmp/file10] +rw=read +rate=1250 +ratemin=1024 +startdelay=36 + +[/tmp/file11] +rw=read +rate=1250 +ratemin=1024 +startdelay=40 + +[/tmp/file12] +rw=read +rate=1250 +ratemin=1024 +startdelay=44 + +[/tmp/file13] +rw=read +rate=1250 +ratemin=1024 +startdelay=48 + +[/tmp/file14] +rw=read +rate=1250 +ratemin=1024 +startdelay=52 + +[/tmp/file15] +rw=read +rate=1250 +ratemin=1024 +startdelay=56 + +[/tmp/file16] +rw=read +rate=1250 +ratemin=1024 +startdelay=60 + +[/tmp/file17] +rw=read +rate=1250 +ratemin=1024 +startdelay=64 + +[/tmp/file18] +rw=read +rate=1250 +ratemin=1024 +startdelay=68 + +[/tmp/file19] 
+rw=read +rate=1250 +ratemin=1024 +startdelay=72 + +[/tmp/file20] +rw=read +rate=1250 +ratemin=1024 +startdelay=76 + +[/tmp/file21] +rw=read +rate=1250 +ratemin=1024 +startdelay=80 + +[/tmp/file22] +rw=read +rate=1250 +ratemin=1024 +startdelay=84 + +[/tmp/file23] +rw=read +rate=1250 +ratemin=1024 +startdelay=88 + +[/tmp/file24] +rw=read +rate=1250 +ratemin=1024 +startdelay=92 + +[/tmp/file25] +rw=read +rate=1250 +ratemin=1024 +startdelay=96 + +[/tmp/file26] +rw=read +rate=1250 +ratemin=1024 +startdelay=100 + +[/tmp/file27] +rw=read +rate=1250 +ratemin=1024 +startdelay=104 + +[/tmp/file28] +rw=read +rate=1250 +ratemin=1024 +startdelay=108 + +[/tmp/file29] +rw=read +rate=1250 +ratemin=1024 +startdelay=112 + +[/tmp/file30] +rw=read +rate=1250 +ratemin=1024 +startdelay=116 + +[/tmp/file31] +rw=read +rate=1250 +ratemin=1024 +startdelay=120 + +[/tmp/file32] +rw=read +rate=1250 +ratemin=1024 +startdelay=124 + diff --git a/examples/aio-read b/examples/aio-read new file mode 100644 index 00000000..832e0060 --- /dev/null +++ b/examples/aio-read @@ -0,0 +1,17 @@ +; Read 4 files with aio at different depths +[global] +ioengine=libaio +rw=randread +bs=128k + +[/data1/file1] +iodepth=4 + +[/data1/file2] +iodepth=32 + +[/data1/file3] +iodepth=8 + +[/data1/file4] +iodepth=16 diff --git a/examples/tiobench-example b/examples/tiobench-example new file mode 100644 index 00000000..5a4493e7 --- /dev/null +++ b/examples/tiobench-example @@ -0,0 +1,24 @@ +; tiobench like setup, add more fX files between the stonewalls to +; create more threads + +[global] +direct=1 +size=512m +bsrange=4k-4k +timeout=60 +numjobs=4 ; 4 simultaneous threads for each job + +[f1] +rw=write + +[f2] +stonewall +rw=randwrite + +[f3] +stonewall +rw=read + +[f4] +stonewall +rw=randread diff --git a/fio-ini.c b/fio-ini.c new file mode 100644 index 00000000..549be0ec --- /dev/null +++ b/fio-ini.c @@ -0,0 +1,934 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include
+
+#include "fio.h"
+
+#define DEF_BS			(4096)
+#define DEF_TIMEOUT		(0)
+#define DEF_RATE_CYCLE		(1000)
+#define DEF_ODIRECT		(1)
+#define DEF_IO_ENGINE		(FIO_SYNCIO)
+#define DEF_IO_ENGINE_NAME	"sync"
+#define DEF_SEQUENTIAL		(1)
+#define DEF_RAND_REPEAT		(1)
+#define DEF_OVERWRITE		(1)
+#define DEF_CREATE		(1)
+#define DEF_INVALIDATE		(1)
+#define DEF_SYNCIO		(0)
+#define DEF_RANDSEED		(0xb1899bedUL)
+#define DEF_BWAVGTIME		(500)
+#define DEF_CREATE_SER		(1)
+#define DEF_CREATE_FSYNC	(1)
+#define DEF_LOOPS		(1)
+#define DEF_VERIFY		(0)
+#define DEF_STONEWALL		(0)
+#define DEF_NUMJOBS		(1)
+#define DEF_USE_THREAD		(0)
+#define DEF_FILE_SIZE		(1024 * 1024 * 1024UL)
+
+static char fio_version_string[] = "fio 1.0";
+
+static int repeatable = DEF_RAND_REPEAT;
+static char *ini_file;
+static int max_jobs = MAX_JOBS;
+
+struct thread_data def_thread;
+struct thread_data *threads = NULL;
+
+int rate_quit = 0;
+int write_lat_log = 0;
+int write_bw_log = 0;
+int exitall_on_terminate = 0;
+
+/*
+ * Derive the per-io pacing interval from td->rate and td->min_bs.
+ * Returns 0 on success (or when no rate is set), -1 if the rate settings
+ * cannot be honoured.
+ */
+static int setup_rate(struct thread_data *td)
+{
+	int nr_reads_per_sec;
+
+	if (!td->rate)
+		return 0;
+
+	if (td->rate < td->ratemin) {
+		fprintf(stderr, "min rate larger than nominal rate\n");
+		return -1;
+	}
+
+	nr_reads_per_sec = (td->rate * 1024) / td->min_bs;
+	/* less than one block per second would divide by zero below */
+	if (!nr_reads_per_sec) {
+		fprintf(stderr, "rate lower than supported\n");
+		return -1;
+	}
+
+	td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
+	td->rate_pending_usleep = 0;
+	return 0;
+}
+
+/*
+ * Allocate an empty sample log with room for 1024 entries and store it
+ * in *log. If an allocation fails, *log is set to NULL.
+ * NOTE(review): assumes users of the log tolerate a NULL pointer -- confirm.
+ */
+static void setup_log(struct io_log **log)
+{
+	struct io_log *l = malloc(sizeof(*l));
+
+	*log = NULL;
+	if (!l) {
+		perror("malloc");
+		return;
+	}
+
+	l->nr_samples = 0;
+	l->max_samples = 1024;
+	l->log = malloc(l->max_samples * sizeof(struct io_sample));
+	if (!l->log) {
+		perror("malloc");
+		free(l);
+		return;
+	}
+
+	*log = l;
+}
+
+/*
+ * Write every sample of log to "client<N>_<name>.log" and release the log.
+ */
+void finish_log(struct thread_data *td, struct io_log *log, const char *name)
+{
+	char file_name[128];
+	FILE *f;
+	unsigned int i;
+
+	sprintf(file_name, "client%d_%s.log", td->thread_number, name);
+	f = fopen(file_name, "w");
+	if (!f) {
+		perror("fopen log");
+		return;
+	}
+
+	for (i = 0; i < log->nr_samples; i++)
+		fprintf(f, "%lu, %lu, %u\n", log->log[i].time, log->log[i].val, log->log[i].ddir);
+
+	fclose(f);
+
free(log->log);
+	free(log);
+}
+
+/*
+ * Hand out the next free slot in the threads array, initialised with the
+ * options of parent. For a [global] section the shared def_thread is
+ * returned instead. Returns NULL when max_jobs slots are already in use.
+ */
+static struct thread_data *get_new_job(int global, struct thread_data *parent)
+{
+	struct thread_data *td;
+
+	if (global)
+		return &def_thread;
+	if (thread_number >= max_jobs)
+		return NULL;
+
+	td = &threads[thread_number++];
+	memset(td, 0, sizeof(*td));
+
+	td->fd = -1;
+	/* thread numbers are 1-based: slot i holds thread i + 1 */
+	td->thread_number = thread_number;
+
+	td->ddir = parent->ddir;
+	td->ioprio = parent->ioprio;
+	td->sequential = parent->sequential;
+	td->bs = parent->bs;
+	td->min_bs = parent->min_bs;
+	td->max_bs = parent->max_bs;
+	td->odirect = parent->odirect;
+	td->thinktime = parent->thinktime;
+	td->fsync_blocks = parent->fsync_blocks;
+	td->start_delay = parent->start_delay;
+	td->timeout = parent->timeout;
+	td->io_engine = parent->io_engine;
+	td->create_file = parent->create_file;
+	td->overwrite = parent->overwrite;
+	td->invalidate_cache = parent->invalidate_cache;
+	td->file_size = parent->file_size;
+	td->file_offset = parent->file_offset;
+	td->rate = parent->rate;
+	td->ratemin = parent->ratemin;
+	td->ratecycle = parent->ratecycle;
+	td->iodepth = parent->iodepth;
+	td->sync_io = parent->sync_io;
+	td->mem_type = parent->mem_type;
+	td->bw_avg_time = parent->bw_avg_time;
+	td->create_serialize = parent->create_serialize;
+	td->create_fsync = parent->create_fsync;
+	td->loops = parent->loops;
+	td->verify = parent->verify;
+	td->stonewall = parent->stonewall;
+	td->numjobs = parent->numjobs;
+	td->use_thread = parent->use_thread;
+	td->do_disk_util = parent->do_disk_util;
+	memcpy(&td->cpumask, &parent->cpumask, sizeof(td->cpumask));
+	strcpy(td->io_engine_name, parent->io_engine_name);
+	/* without this a [global] directory and numjobs clones lose the setting */
+	strcpy(td->directory, parent->directory);
+
+	return td;
+}
+
+/*
+ * Give back the slot of td: clear it and lower the job count.
+ */
+static void put_job(struct thread_data *td)
+{
+	memset(&threads[td->thread_number - 1], 0, sizeof(*td));
+	thread_number--;
+}
+
+/*
+ * Finish setting up td as job jobname and, for numjobs > 1, clone it.
+ */
+static int add_job(struct thread_data *td, const char *jobname, int prioclass,
+		   int prio)
+{
+	char *ddir_str[] = { "read", "write", "randread", "randwrite" };
+	struct stat sb;
+	int numjobs, ddir;
+
+#ifndef
FIO_HAVE_LIBAIO
+	if (td->io_engine == FIO_LIBAIO) {
+		fprintf(stderr, "Linux libaio not available\n");
+		return 1;
+	}
+#endif
+#ifndef FIO_HAVE_POSIXAIO
+	if (td->io_engine == FIO_POSIXAIO) {
+		fprintf(stderr, "posix aio not available\n");
+		return 1;
+	}
+#endif
+#ifdef FIO_HAVE_IOPRIO
+	td->ioprio = (prioclass << IOPRIO_CLASS_SHIFT) | prio;
+#endif
+
+	/*
+	 * the def_thread is just for options, it's not a real job
+	 */
+	if (td == &def_thread)
+		return 0;
+
+	/* sync io always runs at depth 1, other engines default to 1 */
+	if (td->io_engine & FIO_SYNCIO)
+		td->iodepth = 1;
+	else {
+		if (!td->iodepth)
+			td->iodepth = 1;
+	}
+
+	/* an existing block device is used as is, anything else is a file */
+	td->filetype = FIO_TYPE_FILE;
+	if (!stat(jobname, &sb) && S_ISBLK(sb.st_mode))
+		td->filetype = FIO_TYPE_BD;
+
+	/*
+	 * Regular files get a per-thread name. The writes are bounded by the
+	 * size of td->file_name, so an over-long directory or job name is
+	 * truncated instead of overrunning the buffer.
+	 */
+	if (td->filetype == FIO_TYPE_FILE) {
+		if (td->directory[0] != '\0')
+			snprintf(td->file_name, sizeof(td->file_name), "%s/%s.%d", td->directory, jobname, td->thread_number);
+		else
+			snprintf(td->file_name, sizeof(td->file_name), "%s.%d", jobname, td->thread_number);
+	} else
+		snprintf(td->file_name, sizeof(td->file_name), "%s", jobname);
+
+	sem_init(&td->mutex, 0, 0);
+
+	td->clat_stat[0].min_val = td->clat_stat[1].min_val = ULONG_MAX;
+	td->slat_stat[0].min_val = td->slat_stat[1].min_val = ULONG_MAX;
+	td->bw_stat[0].min_val = td->bw_stat[1].min_val = ULONG_MAX;
+
+	/* -1U marks "no bsrange given": fall back to the fixed block size */
+	if (td->min_bs == -1U)
+		td->min_bs = td->bs;
+	if (td->max_bs == -1U)
+		td->max_bs = td->bs;
+	if (td_read(td))
+		td->verify = 0;
+
+	/* a stonewall on any job but the first opens a new group */
+	if (td->stonewall && td->thread_number > 1)
+		groupid++;
+
+	td->groupid = groupid;
+
+	if (setup_rate(td))
+		goto err;
+
+	if (write_lat_log) {
+		setup_log(&td->slat_log);
+		setup_log(&td->clat_log);
+	}
+	if (write_bw_log)
+		setup_log(&td->bw_log);
+
+	/* index into ddir_str: bit 0 is the direction, bit 1 is "random" */
+	ddir = td->ddir + (!td->sequential << 1);
+	printf("Client%d (g=%d): rw=%s, prio=%d/%d, odir=%d, bs=%d-%d, rate=%d, ioengine=%s, iodepth=%d\n", td->thread_number, td->groupid, ddir_str[ddir], prioclass, prio, td->odirect, td->min_bs, td->max_bs, td->rate, td->io_engine_name, td->iodepth);
+
+	/*
+	 * recurse add identical jobs, clear numjobs and stonewall options
+	 * as they don't apply to sub-jobs
+	 */
+	numjobs =
td->numjobs;
+	while (--numjobs) {
+		struct thread_data *td_new = get_new_job(0, td);
+
+		if (!td_new)
+			goto err;
+
+		td_new->numjobs = 1;
+		td_new->stonewall = 0;
+
+		if (add_job(td_new, jobname, prioclass, prio))
+			goto err;
+	}
+	return 0;
+err:
+	put_job(td);
+	return -1;
+}
+
+/*
+ * Seed the random states of td and, for random io, allocate the zeroed
+ * block map. Returns 0 on success, 1 on failure (error stored via
+ * td_verror).
+ */
+int init_random_state(struct thread_data *td)
+{
+	unsigned long seed;
+	int fd, num_maps, blocks;
+
+	fd = open("/dev/random", O_RDONLY);
+	if (fd == -1) {
+		td_verror(td, errno);
+		return 1;
+	}
+
+	if (read(fd, &seed, sizeof(seed)) < (int) sizeof(seed)) {
+		td_verror(td, EIO);
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+
+	srand48_r(seed, &td->bsrange_state);
+	srand48_r(seed, &td->verify_state);
+
+	if (td->sequential)
+		return 0;
+
+	/* repeatable runs use a fixed seed for the offset generator */
+	if (repeatable)
+		seed = DEF_RANDSEED;
+
+	/*
+	 * Round up so that a trailing partial group of blocks still gets a
+	 * map word. NOTE(review): assumes users index the map by
+	 * block / BLOCKS_PER_MAP -- confirm against the map users.
+	 */
+	blocks = (td->io_size + td->min_bs - 1) / td->min_bs;
+	num_maps = (blocks + BLOCKS_PER_MAP - 1) / BLOCKS_PER_MAP;
+	td->file_map = malloc(num_maps * sizeof(long));
+	if (!td->file_map) {
+		td_verror(td, ENOMEM);
+		return 1;
+	}
+	td->num_maps = num_maps;
+	memset(td->file_map, 0, num_maps * sizeof(long));
+
+	srand48_r(seed, &td->random_state);
+	return 0;
+}
+
+/*
+ * Turn the bit pattern cpu into a cpu set.
+ * NOTE(review): cpumask is received by value, so the set built here looks
+ * like it is lost on return. Fixing that needs a pointer parameter and a
+ * matching change in the caller.
+ */
+static void fill_cpu_mask(os_cpu_mask_t cpumask, int cpu)
+{
+#ifdef FIO_HAVE_CPU_AFFINITY
+	unsigned int i;
+
+	CPU_ZERO(&cpumask);
+
+	for (i = 0; i < sizeof(int) * 8; i++) {
+		/* unsigned shift: 1 << 31 overflows a signed int */
+		if ((1U << i) & cpu)
+			CPU_SET(i, &cpumask);
+	}
+#endif
+}
+
+/*
+ * Multiplier for a size suffix: k/K, m/M, g/G; 1 for anything else.
+ */
+static unsigned long get_mult(char c)
+{
+	switch (c) {
+		case 'k':
+		case 'K':
+			return 1024;
+		case 'm':
+		case 'M':
+			return 1024 * 1024;
+		case 'g':
+		case 'G':
+			return 1024 * 1024 * 1024;
+		default:
+			return 1;
+	}
+}
+
+/*
+ * convert string after '=' into decimal value, noting any size suffix
+ *
+ * The suffix is the character right after the digits. Returns 0 with the
+ * scaled value in *val, or 1 if there is no '=', no digits follow it, or
+ * the number is out of range.
+ */
+static int str_cnv(char *p, unsigned long long *val)
+{
+	char *str, *end;
+
+	str = strstr(p, "=");
+	if (!str)
+		return 1;
+
+	str++;
+
+	/* errno must be cleared first, ERANGE is only set on failure */
+	errno = 0;
+	*val = strtoull(str, &end, 10);
+	if (end == str)
+		return 1;
+	if (*val == ULLONG_MAX && errno == ERANGE)
+		return 1;
+
+	*val *= get_mult(*end);
+	return 0;
+}
+
+/*
+ * If name occurs in p, parse the value after '=' into *val.
+ * Returns 0 on success, 1 otherwise.
+ */
+static int check_strcnv(char *p, char *name, unsigned long long *val)
+{
+	if (!strstr(p,
name))
+		return 1;
+
+	return str_cnv(p, val);
+}
+
+/*
+ * Advance *p past any leading white space.
+ */
+static void strip_blank_front(char **p)
+{
+	char *s = *p;
+
+	while (isspace((unsigned char) *s))
+		s++;
+
+	/* store the result; the caller's pointer is what must move */
+	*p = s;
+}
+
+/*
+ * Cut trailing white space (including the newline left by fgets) from a
+ * string. p points at the LAST character of the string. The string must
+ * contain at least one non-space character, which is where the backward
+ * walk stops.
+ */
+static void strip_blank_end(char *p)
+{
+	while (isspace((unsigned char) *p)) {
+		*p = '\0';
+		p--;
+	}
+}
+
+typedef int (str_cb_fn)(struct thread_data *, char *);
+
+/*
+ * If name occurs in p, pass the text after the following '=' (leading
+ * white space skipped) to cb. Returns 1 if name or '=' is missing,
+ * otherwise whatever cb returns.
+ */
+static int check_str(char *p, char *name, str_cb_fn *cb, struct thread_data *td)
+{
+	char *s = strstr(p, name);
+
+	if (!s)
+		return 1;
+
+	s = strstr(s, "=");
+	if (!s)
+		return 1;
+
+	s++;
+	strip_blank_front(&s);
+	return cb(td, s);
+}
+
+/*
+ * If name occurs in p, copy the text after the following '=' into dest
+ * with surrounding white space removed. Returns 0 on success, 1 if name
+ * or '=' is missing. dest must be large enough for the rest of the line.
+ */
+static int check_strstore(char *p, char *name, char *dest)
+{
+	char *s = strstr(p, name);
+
+	if (!s)
+		return 1;
+
+	/* look for '=' after the name, as check_str does */
+	s = strstr(s, "=");
+	if (!s)
+		return 1;
+
+	s++;
+	strip_blank_front(&s);
+
+	strcpy(dest, s);
+
+	/* dest is empty or starts with a non-space character at this point */
+	if (*dest)
+		strip_blank_end(dest + strlen(dest) - 1);
+	return 0;
+}
+
+/*
+ * Parse "name=<a>[suffix]-<b>[suffix]" (with or without blanks around
+ * '=') from the start of p into *s and *e. Returns 0 on match, 1 otherwise.
+ */
+static int check_range(char *p, char *name, unsigned long *s, unsigned long *e)
+{
+	char str[128];
+	char s1, s2;
+
+	sprintf(str, "%s=%%lu%%c-%%lu%%c", name);
+	if (sscanf(p, str, s, &s1, e, &s2) == 4) {
+		*s *= get_mult(s1);
+		*e *= get_mult(s2);
+		return 0;
+	}
+
+	sprintf(str, "%s = %%lu%%c-%%lu%%c", name);
+	if (sscanf(p, str, s, &s1, e, &s2) == 4) {
+		*s *= get_mult(s1);
+		*e *= get_mult(s2);
+		return 0;
+	}
+
+	sprintf(str, "%s=%%lu-%%lu", name);
+	if (sscanf(p, str, s, e) == 2)
+		return 0;
+
+	sprintf(str, "%s = %%lu-%%lu", name);
+	if (sscanf(p, str, s, e) == 2)
+		return 0;
+
+	return 1;
+
+}
+
+/*
+ * Parse "name=<int>" or "name = <int>" from the start of p into *val.
+ * Returns 0 on match, 1 otherwise.
+ */
+static int check_int(char *p, char *name, unsigned int *val)
+{
+	char str[128];
+
+	/* %u matches the unsigned destination */
+	sprintf(str, "%s=%%u", name);
+	if (sscanf(p, str, val) == 1)
+		return 0;
+
+	sprintf(str, "%s = %%u", name);
+	if (sscanf(p, str, val) == 1)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Returns 0 if p starts with name, non-zero otherwise.
+ */
+static int check_strset(char *p, char *name)
+{
+	return strncmp(p, name, strlen(name));
+}
+
+/*
+ * Returns 1 if line holds nothing but white space and control characters
+ * or if its first other character is ';', 0 otherwise.
+ */
+static int is_empty_or_comment(char *line)
+{
+	unsigned int i;
+
+	for (i = 0; line[i] != '\0'; i++) {
+		if (line[i] == ';')
+			return 1;
+		if (!isspace((unsigned char) line[i]) && !iscntrl((unsigned char) line[i]))
+			return 0;
+
}
+
+	return 1;
+}
+
+/*
+ * "rw=" callback: set direction and sequential/random mode of td from
+ * the option text. Returns 0 on success, 1 on an unknown value.
+ */
+static int str_rw_cb(struct thread_data *td, char *mem)
+{
+	/* checked top to bottom, only the key's own length is compared */
+	struct {
+		const char *key;
+		int ddir;
+		int sequential;
+	} modes[] = {
+		{ "read",	DDIR_READ,	1 },
+		{ "0",		DDIR_READ,	1 },
+		{ "randread",	DDIR_READ,	0 },
+		{ "write",	DDIR_WRITE,	1 },
+		{ "1",		DDIR_WRITE,	1 },
+		{ "randwrite",	DDIR_WRITE,	0 },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(modes) / sizeof(modes[0]); idx++) {
+		if (strncmp(mem, modes[idx].key, strlen(modes[idx].key)))
+			continue;
+
+		td->ddir = modes[idx].ddir;
+		td->sequential = modes[idx].sequential;
+		return 0;
+	}
+
+	fprintf(stderr, "bad data direction: %s\n", mem);
+	return 1;
+}
+
+/*
+ * "verify=" callback: pick the verification type of td.
+ * Returns 0 on success, 1 on an unknown value.
+ */
+static int str_verify_cb(struct thread_data *td, char *mem)
+{
+	struct {
+		const char *key;
+		int type;
+	} kinds[] = {
+		{ "0",		VERIFY_NONE },
+		{ "md5",	VERIFY_MD5 },
+		{ "1",		VERIFY_MD5 },
+		{ "crc32",	VERIFY_CRC32 },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(kinds) / sizeof(kinds[0]); idx++) {
+		if (strncmp(mem, kinds[idx].key, strlen(kinds[idx].key)))
+			continue;
+
+		td->verify = kinds[idx].type;
+		return 0;
+	}
+
+	fprintf(stderr, "bad verify type: %s\n", mem);
+	return 1;
+}
+
+/*
+ * "mem=" callback: pick the io buffer memory type of td.
+ * Returns 0 on success, 1 on an unknown value.
+ */
+static int str_mem_cb(struct thread_data *td, char *mem)
+{
+	struct {
+		const char *key;
+		int type;
+	} kinds[] = {
+		{ "malloc",	MEM_MALLOC },
+		{ "shm",	MEM_SHM },
+		{ "mmap",	MEM_MMAP },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(kinds) / sizeof(kinds[0]); idx++) {
+		if (strncmp(mem, kinds[idx].key, strlen(kinds[idx].key)))
+			continue;
+
+		td->mem_type = kinds[idx].type;
+		return 0;
+	}
+
+	fprintf(stderr, "bad mem type: %s\n", mem);
+	return 1;
+}
+
+/*
+ * "ioengine=" callback: record engine id and canonical name in td.
+ * Returns 0 on success, 1 on an unknown value.
+ */
+static int str_ioengine_cb(struct thread_data *td, char *str)
+{
+	if (!strncmp(str, "linuxaio", 8) || !strncmp(str, "aio", 3) ||
+	    !strncmp(str, "libaio", 6)) {
+		strcpy(td->io_engine_name, "libaio");
+		td->io_engine = FIO_LIBAIO;
+		return 0;
+	} else if (!strncmp(str, "posixaio", 8)) {
+		strcpy(td->io_engine_name, "posixaio");
+		td->io_engine = FIO_POSIXAIO;
+		return 0;
+	} else if (!strncmp(str, "sync", 4)) {
+		strcpy(td->io_engine_name, "sync");
+		td->io_engine = FIO_SYNCIO;
+		return 0;
+	} else if (!strncmp(str, "mmap", 4)) {
+		strcpy(td->io_engine_name, "mmap");
+		td->io_engine = FIO_MMAPIO;
+		return 0;
+	} else if
(!strncmp(str, "sgio", 4)) {
+		strcpy(td->io_engine_name, "sgio");
+		td->io_engine = FIO_SGIO;
+		return 0;
+	}
+
+	fprintf(stderr, "bad ioengine type: %s\n", str);
+	return 1;
+}
+
+/*
+ * Try the plain "name=<int>" options that need no extra handling.
+ * Returns 0 if p matched one of them (value stored in td), 1 otherwise.
+ */
+static int check_plain_ints(struct thread_data *td, char *p)
+{
+	struct {
+		char *name;
+		unsigned int *val;
+	} opts[] = {
+		{ "direct",		&td->odirect },
+		{ "rate",		&td->rate },
+		{ "ratemin",		&td->ratemin },
+		{ "ratecycle",		&td->ratecycle },
+		{ "thinktime",		&td->thinktime },
+		{ "fsync",		&td->fsync_blocks },
+		{ "startdelay",		&td->start_delay },
+		{ "timeout",		&td->timeout },
+		{ "invalidate",		&td->invalidate_cache },
+		{ "iodepth",		&td->iodepth },
+		{ "sync",		&td->sync_io },
+		{ "bwavgtime",		&td->bw_avg_time },
+		{ "create_serialize",	&td->create_serialize },
+		{ "create_fsync",	&td->create_fsync },
+		{ "loops",		&td->loops },
+		{ "numjobs",		&td->numjobs },
+		{ "overwrite",		&td->overwrite },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
+		if (!check_int(p, opts[i].name, opts[i].val))
+			return 0;
+
+	return 1;
+}
+
+/*
+ * Read the job file: every "[name]" header starts a job (or the global
+ * defaults for "[global]"), the option lines below it are applied to that
+ * job, and the job is then registered with add_job().
+ * Returns 0 on success, 1 on any error. The file and the line buffers are
+ * released on every path.
+ */
+int parse_jobs_ini(char *file)
+{
+	unsigned int prioclass, prio, cpu, global;
+	unsigned long long ull;
+	unsigned long ul1, ul2;
+	struct thread_data *td;
+	char *string, *name;
+	size_t len;
+	fpos_t off;
+	FILE *f;
+	char *p;
+	int ret = 1;
+
+	f = fopen(file, "r");
+	if (!f) {
+		perror("fopen");
+		return 1;
+	}
+
+	string = malloc(4096);
+	name = malloc(256);
+	if (!string || !name) {
+		perror("malloc");
+		goto out;
+	}
+
+	while ((p = fgets(string, 4096, f)) != NULL) {
+		if (is_empty_or_comment(p))
+			continue;
+		/* width keeps the header inside the 256 byte name buffer */
+		if (sscanf(p, "[%255s]", name) != 1)
+			continue;
+
+		global = !strncmp(name, "global", 6);
+
+		/* %s swallowed the closing bracket, drop it again */
+		len = strlen(name);
+		if (name[len - 1] == ']')
+			name[len - 1] = '\0';
+
+		td = get_new_job(global, &def_thread);
+		if (!td)
+			goto out;
+
+		prioclass = 2;
+		prio = 4;
+
+		/*
+		 * off tracks the position after the last consumed option line,
+		 * so the next section header can be re-read by the outer loop
+		 */
+		fgetpos(f, &off);
+		while ((p = fgets(string, 4096, f)) != NULL) {
+			if (is_empty_or_comment(p))
+				continue;
+			if (strstr(p, "["))
+				break;
+			if (!check_int(p, "prio", &prio)) {
+#ifndef FIO_HAVE_IOPRIO
+				fprintf(stderr, "io priorities not available\n");
+				goto out;
+#endif
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_int(p, "prioclass", &prioclass)) {
+#ifndef FIO_HAVE_IOPRIO
+				fprintf(stderr, "io priorities not available\n");
+				goto out;
+#endif
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_int(p, "cpumask", &cpu)) {
+#ifndef FIO_HAVE_CPU_AFFINITY
+				fprintf(stderr, "cpu affinity not available\n");
+				goto out;
+#endif
+				/* NOTE(review): the mask is passed by value, so td->cpumask looks unchanged afterwards */
+				fill_cpu_mask(td->cpumask, cpu);
+				fgetpos(f, &off);
+				continue;
+			}
+			/*
+			 * the integer options must be tried before the substring
+			 * matches below: "numjobs" contains "bs"
+			 */
+			if (!check_plain_ints(td, p)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_range(p, "bsrange", &ul1, &ul2)) {
+				if (ul1 > ul2) {
+					td->max_bs = ul1;
+					td->min_bs = ul2;
+				} else {
+					td->max_bs = ul2;
+					td->min_bs = ul1;
+				}
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strcnv(p, "bs", &ull)) {
+				td->bs = ull;
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strcnv(p, "size", &td->file_size)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strcnv(p, "offset", &td->file_offset)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strstore(p, "directory", td->directory)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_str(p, "mem", str_mem_cb, td)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_str(p, "verify", str_verify_cb, td)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_str(p, "rw", str_rw_cb, td)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_str(p, "ioengine", str_ioengine_cb, td)) {
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strset(p, "create")) {
+				td->create_file = 1;
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strset(p, "exitall")) {
+				exitall_on_terminate = 1;
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strset(p, "stonewall")) {
+				td->stonewall = 1;
+				fgetpos(f, &off);
+				continue;
+			}
+			if (!check_strset(p, "thread")) {
+				td->use_thread = 1;
+				fgetpos(f, &off);
+				continue;
+			}
+
+			printf("Client%d: bad option %s\n",td->thread_number,p);
+		}
+		fsetpos(f, &off);
+
+		if (add_job(td, name, prioclass, prio))
+			goto out;
+	}
+
+	ret = 0;
+out:
+	free(string);
+	free(name);
+	fclose(f);
+	return ret;
+}
+
+/*
+ * Reset def_thread and fill in the built-in defaults that every job
+ * inherits. Returns 0 on success, 1 if the cpu affinity cannot be read.
+ */
+static int fill_def_thread(void)
+{
+	memset(&def_thread, 0, sizeof(def_thread));
+
+	if (fio_getaffinity(getpid(), &def_thread.cpumask) == -1) {
+		perror("sched_getaffinity");
+		return 1;
+	}
+
+	/*
+	 * fill globals
+	 */
+	def_thread.ddir = DDIR_READ;
+	def_thread.bs = DEF_BS;
+	def_thread.min_bs = -1;
+	def_thread.max_bs = -1;
+	def_thread.io_engine = DEF_IO_ENGINE;
+	strcpy(def_thread.io_engine_name, DEF_IO_ENGINE_NAME);
+	def_thread.odirect = DEF_ODIRECT;
+	def_thread.ratecycle = DEF_RATE_CYCLE;
+	def_thread.sequential = DEF_SEQUENTIAL;
+	def_thread.timeout = DEF_TIMEOUT;
+	def_thread.create_file = DEF_CREATE;
+	def_thread.overwrite = DEF_OVERWRITE;
+	def_thread.invalidate_cache = DEF_INVALIDATE;
+	def_thread.sync_io = DEF_SYNCIO;
+	def_thread.mem_type = MEM_MALLOC;
+	def_thread.bw_avg_time = DEF_BWAVGTIME;
+	def_thread.create_serialize = DEF_CREATE_SER;
+	def_thread.create_fsync = DEF_CREATE_FSYNC;
+	def_thread.loops = DEF_LOOPS;
+	def_thread.verify = DEF_VERIFY;
+	def_thread.stonewall = DEF_STONEWALL;
+	def_thread.numjobs = DEF_NUMJOBS;
+	def_thread.use_thread = DEF_USE_THREAD;
+#ifdef FIO_HAVE_DISK_UTIL
+	def_thread.do_disk_util = 1;
+#endif
+
+	return 0;
+}
+
+/*
+ * Apply the command line switches to def_thread and the file-level flags.
+ */
+static void parse_cmd_line(int argc, char *argv[])
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "s:b:t:r:R:o:f:lwv")) != EOF) {
+		switch (c) {
+			case 's':
+				def_thread.sequential = !!atoi(optarg);
+				break;
+			case 'b':
+				def_thread.bs = atoi(optarg);
+				def_thread.bs <<= 10;
+
if (!def_thread.bs) {
+					printf("bad block size\n");
+					def_thread.bs = DEF_BS;
+				}
+				break;
+			case 't':
+				def_thread.timeout = atoi(optarg);
+				break;
+			case 'r':
+				repeatable = !!atoi(optarg);
+				break;
+			case 'R':
+				rate_quit = !!atoi(optarg);
+				break;
+			case 'o':
+				def_thread.odirect = !!atoi(optarg);
+				break;
+			case 'f':
+				ini_file = strdup(optarg);
+				break;
+			case 'l':
+				write_lat_log = 1;
+				break;
+			case 'w':
+				write_bw_log = 1;
+				break;
+			case 'v':
+				printf("%s\n", fio_version_string);
+				exit(0);
+		}
+	}
+}
+
+/*
+ * atexit handler: detach the threads array and remove its shm segment.
+ */
+static void free_shm(void)
+{
+	struct shmid_ds sbuf;
+
+	if (threads) {
+		shmdt(threads);
+		threads = NULL;
+		shmctl(shm_id, IPC_RMID, &sbuf);
+	}
+}
+
+/*
+ * Create and attach the shared memory segment that holds the threads
+ * array and register its removal at exit.
+ * Returns 0 on success, 1 on failure.
+ */
+static int setup_thread_area(void)
+{
+	/*
+	 * 1024 is too much on some machines, scale max_jobs if
+	 * we get a failure that looks like too large a shm segment
+	 */
+	do {
+		int s = max_jobs * sizeof(struct thread_data);
+
+		shm_id = shmget(0, s, IPC_CREAT | 0600);
+		if (shm_id != -1)
+			break;
+		if (errno != EINVAL) {
+			perror("shmget");
+			break;
+		}
+
+		max_jobs >>= 1;
+	} while (max_jobs);
+
+	if (shm_id == -1)
+		return 1;
+
+	threads = shmat(shm_id, NULL, 0);
+	if (threads == (void *) -1) {
+		perror("shmat");
+		/*
+		 * free_shm is not registered yet, so remove the segment here,
+		 * and do not leave the failure value in threads
+		 */
+		threads = NULL;
+		shmctl(shm_id, IPC_RMID, NULL);
+		return 1;
+	}
+
+	atexit(free_shm);
+	return 0;
+}
+
+/*
+ * Entry point of option handling: set up the shared job array and the
+ * defaults, then read the command line and the job file.
+ * Returns 0 on success, 1 on failure.
+ */
+int parse_options(int argc, char *argv[])
+{
+	if (setup_thread_area())
+		return 1;
+	if (fill_def_thread())
+		return 1;
+
+	parse_cmd_line(argc, argv);
+
+	if (!ini_file) {
+		printf("Need job file\n");
+		return 1;
+	}
+
+	if (parse_jobs_ini(ini_file))
+		return 1;
+
+	return 0;
+}
diff --git a/fio-io.c b/fio-io.c
new file mode 100644
index 00000000..3183e8fd
--- /dev/null
+++ b/fio-io.c
@@ -0,0 +1,603 @@
+/*
+ * The io parts of the fio tool, includes workers for sync and mmap'ed
+ * io, as well as both posix and linux libaio support.
+ *
+ * sync io is implemented on top of aio.
+ *
+ * This is not really specific to fio; if the get_io_u/put_io_u helpers and
+ * structures were pulled into this as well, it would be a perfectly
+ * generic io engine that could be used for other projects.
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "fio.h"
+#include "os.h"
+
+/*
+ * Helpers below are kept outside the per-engine #ifdef blocks: the posix
+ * aio engine uses the timespec helpers as well, so they must not depend
+ * on FIO_HAVE_LIBAIO.
+ */
+
+/*
+ * Flush the file of td to stable storage. Returns the fsync() result.
+ */
+static int fio_io_sync(struct thread_data *td)
+{
+	return fsync(td->fd);
+}
+
+/*
+ * Store the current monotonic time in ts.
+ * Returns 0 on success, 1 if no timer is available or the read failed.
+ */
+static int fill_timespec(struct timespec *ts)
+{
+#ifdef _POSIX_TIMERS
+	if (!clock_gettime(CLOCK_MONOTONIC, ts))
+		return 0;
+
+	perror("clock_gettime");
+#endif
+	return 1;
+}
+
+/*
+ * Microseconds elapsed since t, or 0 if the current time cannot be read.
+ */
+static unsigned long long ts_utime_since_now(struct timespec *t)
+{
+	long long sec, nsec;
+	struct timespec now;
+
+	if (fill_timespec(&now))
+		return 0;
+
+	sec = now.tv_sec - t->tv_sec;
+	nsec = now.tv_nsec - t->tv_nsec;
+	/* borrow a second when the nanosecond part went negative */
+	if (sec > 0 && nsec < 0) {
+		sec--;
+		nsec += 1000000000;
+	}
+
+	sec *= 1000000;
+	nsec /= 1000;
+	return sec + nsec;
+}
+
+#ifdef FIO_HAVE_LIBAIO
+
+/* the io_u is stored in the obj field of the completion event */
+#define ev_to_iou(ev)	((struct io_u *) ((unsigned long) (ev)->obj))
+
+/* per-thread libaio state, kept in td->io_data */
+struct libaio_data {
+	io_context_t aio_ctx;
+	struct io_event *aio_events;
+};
+
+/*
+ * Prepare the iocb of io_u for a read or write on the file of td.
+ */
+static int fio_libaio_io_prep(struct thread_data *td, struct io_u *io_u)
+{
+	if (io_u->ddir == DDIR_READ)
+		io_prep_pread(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
+	else
+		io_prep_pwrite(&io_u->iocb, td->fd, io_u->buf, io_u->buflen, io_u->offset);
+
+	return 0;
+}
+
+/*
+ * Return the io_u behind completion event number event.
+ */
+static struct io_u *fio_libaio_event(struct thread_data *td, int event)
+{
+	struct libaio_data *ld = td->io_data;
+
+	return ev_to_iou(ld->aio_events + event);
+}
+
+/*
+ * Collect between min and max completion events, retrying on -EAGAIN
+ * (after a short sleep) and -EINTR. Returns the io_getevents() result.
+ */
+static int fio_libaio_getevents(struct thread_data *td, int min, int max,
+				struct timespec *t)
+{
+	struct libaio_data *ld = td->io_data;
+	int r;
+
+	do {
+		r = io_getevents(ld->aio_ctx, min, max, ld->aio_events, t);
+		if (r == -EAGAIN) {
+			usleep(100);
+			continue;
+		} else if (r == -EINTR)
+			continue;
+		else
+			break;
+	} while (1);
+
+	return r;
+}
+
+/*
+ * Submit io_u, retrying on -EAGAIN and -EINTR.
+ * Returns 0 on success, otherwise the io_submit() result.
+ */
+static int fio_libaio_queue(struct thread_data *td, struct io_u *io_u)
+{
+
struct libaio_data *ld = td->io_data; + struct iocb *iocb = &io_u->iocb; + int ret; + + do { + ret = io_submit(ld->aio_ctx, 1, &iocb); + if (ret == 1) + return 0; + else if (ret == -EAGAIN) + usleep(100); + else if (ret == -EINTR) + continue; + else + break; + } while (1); + + return ret; + +} + +static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u) +{ + struct libaio_data *ld = td->io_data; + + return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events); +} + +static void fio_libaio_cleanup(struct thread_data *td) +{ + struct libaio_data *ld = td->io_data; + + if (ld) { + io_destroy(ld->aio_ctx); + if (ld->aio_events) + free(ld->aio_events); + + free(ld); + td->io_data = NULL; + } +} + +int fio_libaio_init(struct thread_data *td) +{ + struct libaio_data *ld = malloc(sizeof(*ld)); + + memset(ld, 0, sizeof(*ld)); + if (io_queue_init(td->iodepth, &ld->aio_ctx)) { + td_verror(td, errno); + return 1; + } + + td->io_prep = fio_libaio_io_prep; + td->io_queue = fio_libaio_queue; + td->io_getevents = fio_libaio_getevents; + td->io_event = fio_libaio_event; + td->io_cancel = fio_libaio_cancel; + td->io_cleanup = fio_libaio_cleanup; + td->io_sync = fio_io_sync; + + ld->aio_events = malloc(td->iodepth * sizeof(struct io_event)); + td->io_data = ld; + return 0; +} + +#else /* FIO_HAVE_LIBAIO */ + +int fio_libaio_init(struct thread_data *td) +{ + return EINVAL; +} + +#endif /* FIO_HAVE_LIBAIO */ + +#ifdef FIO_HAVE_POSIXAIO + +struct posixaio_data { + struct io_u **aio_events; +}; + +static int fio_posixaio_cancel(struct thread_data *td, struct io_u *io_u) +{ + int r = aio_cancel(td->fd, &io_u->aiocb); + + if (r == 1 || r == AIO_CANCELED) + return 0; + + return 1; +} + +static int fio_posixaio_prep(struct thread_data *td, struct io_u *io_u) +{ + struct aiocb *aiocb = &io_u->aiocb; + + aiocb->aio_fildes = td->fd; + aiocb->aio_buf = io_u->buf; + aiocb->aio_nbytes = io_u->buflen; + aiocb->aio_offset = io_u->offset; + + io_u->seen = 0; + return 0; +} + +static int 
fio_posixaio_getevents(struct thread_data *td, int min, int max, + struct timespec *t) +{ + struct posixaio_data *pd = td->io_data; + struct list_head *entry; + struct timespec start; + int r, have_timeout = 0; + + if (t && !fill_timespec(&start)) + have_timeout = 1; + + r = 0; +restart: + list_for_each(entry, &td->io_u_busylist) { + struct io_u *io_u = list_entry(entry, struct io_u, list); + int err; + + if (io_u->seen) + continue; + + err = aio_error(&io_u->aiocb); + switch (err) { + default: + io_u->error = err; + case ECANCELED: + case 0: + pd->aio_events[r++] = io_u; + io_u->seen = 1; + break; + case EINPROGRESS: + break; + } + + if (r >= max) + break; + } + + if (r >= min) + return r; + + if (have_timeout) { + unsigned long long usec; + + usec = (t->tv_sec * 1000000) + (t->tv_nsec / 1000); + if (ts_utime_since_now(&start) > usec) + return r; + } + + /* + * hrmpf, we need to wait for more. we should use aio_suspend, for + * now just sleep a little and recheck status of busy-and-not-seen + */ + usleep(1000); + goto restart; +} + +static struct io_u *fio_posixaio_event(struct thread_data *td, int event) +{ + struct posixaio_data *pd = td->io_data; + + return pd->aio_events[event]; +} + +static int fio_posixaio_queue(struct thread_data *td, struct io_u *io_u) +{ + struct aiocb *aiocb = &io_u->aiocb; + int ret; + + if (io_u->ddir == DDIR_READ) + ret = aio_read(aiocb); + else + ret = aio_write(aiocb); + + if (ret) + io_u->error = errno; + + return io_u->error; +} + +static void fio_posixaio_cleanup(struct thread_data *td) +{ + struct posixaio_data *pd = td->io_data; + + if (pd) { + free(pd->aio_events); + free(pd); + td->io_data = NULL; + } +} + +int fio_posixaio_init(struct thread_data *td) +{ + struct posixaio_data *pd = malloc(sizeof(*pd)); + + pd->aio_events = malloc(td->iodepth * sizeof(struct io_u *)); + + td->io_prep = fio_posixaio_prep; + td->io_queue = fio_posixaio_queue; + td->io_getevents = fio_posixaio_getevents; + td->io_event = fio_posixaio_event; + 
td->io_cancel = fio_posixaio_cancel; + td->io_cleanup = fio_posixaio_cleanup; + td->io_sync = fio_io_sync; + + td->io_data = pd; + return 0; +} + +#else /* FIO_HAVE_POSIXAIO */ + +int fio_posixaio_init(struct thread_data *td) +{ + return EINVAL; +} + +#endif /* FIO_HAVE_POSIXAIO */ + +struct syncio_data { + struct io_u *last_io_u; +}; + +static int fio_syncio_getevents(struct thread_data *td, int min, int max, + struct timespec *t) +{ + assert(max <= 1); + + /* + * we can only have one finished io_u for sync io, since the depth + * is always 1 + */ + if (list_empty(&td->io_u_busylist)) + return 0; + + return 1; +} + +static struct io_u *fio_syncio_event(struct thread_data *td, int event) +{ + struct syncio_data *sd = td->io_data; + + assert(event == 0); + + return sd->last_io_u; +} + +static int fio_syncio_prep(struct thread_data *td, struct io_u *io_u) +{ + if (td->cur_off != io_u->offset) { + if (lseek(td->fd, io_u->offset, SEEK_SET) == -1) { + td_verror(td, errno); + return 1; + } + } + + return 0; +} + +static int fio_syncio_queue(struct thread_data *td, struct io_u *io_u) +{ + struct syncio_data *sd = td->io_data; + int ret; + + if (io_u->ddir == DDIR_READ) + ret = read(td->fd, io_u->buf, io_u->buflen); + else + ret = write(td->fd, io_u->buf, io_u->buflen); + + if ((unsigned int) ret != io_u->buflen) { + if (ret > 0) { + io_u->resid = io_u->buflen - ret; + io_u->error = ENODATA; + } else + io_u->error = errno; + } + + if (!io_u->error) + sd->last_io_u = io_u; + + return io_u->error; +} + +static void fio_syncio_cleanup(struct thread_data *td) +{ + if (td->io_data) { + free(td->io_data); + td->io_data = NULL; + } +} + +int fio_syncio_init(struct thread_data *td) +{ + struct syncio_data *sd = malloc(sizeof(*sd)); + + td->io_prep = fio_syncio_prep; + td->io_queue = fio_syncio_queue; + td->io_getevents = fio_syncio_getevents; + td->io_event = fio_syncio_event; + td->io_cancel = NULL; + td->io_cleanup = fio_syncio_cleanup; + td->io_sync = fio_io_sync; + + 
sd->last_io_u = NULL; + td->io_data = sd; + return 0; +} + +static int fio_mmapio_queue(struct thread_data *td, struct io_u *io_u) +{ + unsigned long long real_off = io_u->offset - td->file_offset; + struct syncio_data *sd = td->io_data; + + if (io_u->ddir == DDIR_READ) + memcpy(io_u->buf, td->mmap + real_off, io_u->buflen); + else + memcpy(td->mmap + real_off, io_u->buf, io_u->buflen); + + /* + * not really direct, but should drop the pages from the cache + */ + if (td->odirect) { + if (msync(td->mmap + real_off, io_u->buflen, MS_SYNC) < 0) + io_u->error = errno; + if (madvise(td->mmap + real_off, io_u->buflen, MADV_DONTNEED) < 0) + io_u->error = errno; + } + + if (!io_u->error) + sd->last_io_u = io_u; + + return io_u->error; +} + +static int fio_mmapio_sync(struct thread_data *td) +{ + return msync(td->mmap, td->file_size, MS_SYNC); +} + +int fio_mmapio_init(struct thread_data *td) +{ + struct syncio_data *sd = malloc(sizeof(*sd)); + + td->io_prep = NULL; + td->io_queue = fio_mmapio_queue; + td->io_getevents = fio_syncio_getevents; + td->io_event = fio_syncio_event; + td->io_cancel = NULL; + td->io_cleanup = fio_syncio_cleanup; + td->io_sync = fio_mmapio_sync; + + sd->last_io_u = NULL; + td->io_data = sd; + return 0; +} + +#ifdef FIO_HAVE_SGIO + +struct sgio_data { + struct io_u *last_io_u; + unsigned char cdb[10]; + unsigned int bs; +}; + +static inline void sgio_hdr_init(struct sgio_data *sd, struct sg_io_hdr *hdr, + struct io_u *io_u) +{ + memset(hdr, 0, sizeof(*hdr)); + memset(sd->cdb, 0, sizeof(sd->cdb)); + + hdr->interface_id = 'S'; + hdr->cmdp = sd->cdb; + hdr->cmd_len = sizeof(sd->cdb); + + if (io_u) { + hdr->dxferp = io_u->buf; + hdr->dxfer_len = io_u->buflen; + } +} + +static int fio_sgio_sync(struct thread_data *td) +{ + struct sgio_data *sd = td->io_data; + struct sg_io_hdr hdr; + + sgio_hdr_init(sd, &hdr, NULL); + hdr.dxfer_direction = SG_DXFER_NONE; + + hdr.cmdp[0] = 0x35; + + return ioctl(td->fd, SG_IO, &hdr); +} + +static int fio_sgio_prep(struct 
thread_data *td, struct io_u *io_u) +{ + struct sg_io_hdr *hdr = &io_u->hdr; + struct sgio_data *sd = td->io_data; + int nr_blocks, lba; + + if (io_u->buflen & (sd->bs - 1)) { + fprintf(stderr, "read/write not sector aligned\n"); + return EINVAL; + } + + sgio_hdr_init(sd, hdr, io_u); + + if (io_u->ddir == DDIR_READ) { + hdr->dxfer_direction = SG_DXFER_FROM_DEV; + hdr->cmdp[0] = 0x28; + } else { + hdr->dxfer_direction = SG_DXFER_TO_DEV; + hdr->cmdp[0] = 0x2a; + } + + nr_blocks = io_u->buflen / sd->bs; + lba = io_u->offset / sd->bs; + hdr->cmdp[2] = (lba >> 24) & 0xff; + hdr->cmdp[3] = (lba >> 16) & 0xff; + hdr->cmdp[4] = (lba >> 8) & 0xff; + hdr->cmdp[5] = lba & 0xff; + hdr->cmdp[7] = (nr_blocks >> 8) & 0xff; + hdr->cmdp[8] = nr_blocks & 0xff; + return 0; +} + +static int fio_sgio_queue(struct thread_data *td, struct io_u *io_u) +{ + struct sg_io_hdr *hdr = &io_u->hdr; + struct sgio_data *sd = td->io_data; + int ret; + + ret = ioctl(td->fd, SG_IO, hdr); + if (ret < 0) + io_u->error = errno; + else if (hdr->status) { + io_u->resid = hdr->resid; + io_u->error = EIO; + } + + if (!io_u->error) + sd->last_io_u = io_u; + + return io_u->error; +} + +static struct io_u *fio_sgio_event(struct thread_data *td, int event) +{ + struct sgio_data *sd = td->io_data; + + assert(event == 0); + + return sd->last_io_u; +} + +int fio_sgio_init(struct thread_data *td) +{ + struct sgio_data *sd; + int bs; + + if (td->filetype != FIO_TYPE_BD) { + fprintf(stderr, "ioengine sgio only works on block devices\n"); + return 1; + } + + if (ioctl(td->fd, BLKSSZGET, &bs) < 0) { + td_verror(td, errno); + return 1; + } + + sd = malloc(sizeof(*sd)); + sd->bs = bs; + + td->io_prep = fio_sgio_prep; + td->io_queue = fio_sgio_queue; + td->io_getevents = fio_syncio_getevents; + td->io_event = fio_sgio_event; + td->io_cancel = NULL; + td->io_cleanup = fio_syncio_cleanup; + td->io_sync = fio_sgio_sync; + + /* + * we want to do it, regardless of whether odirect is set or not + */ + td->override_sync = 1; + + 
sd->last_io_u = NULL; + td->io_data = sd; + return 0; +} + +#else /* FIO_HAVE_SGIO */ + +int fio_sgio_init(struct thread_data *td) +{ + return EINVAL; +} + +#endif /* FIO_HAVE_SGIO */ diff --git a/fio.c b/fio.c new file mode 100644 index 00000000..6f122c9b --- /dev/null +++ b/fio.c @@ -0,0 +1,2192 @@ +/* + * fio - the flexible io tester + * + * Copyright (C) 2005 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fio.h" +#include "os.h" + +#define MASK (4095) + +#define ALIGN(buf) (char *) (((unsigned long) (buf) + MASK) & ~(MASK)) + +int groupid = 0; +int thread_number = 0; +static char run_str[MAX_JOBS + 1]; +int shm_id = 0; +static LIST_HEAD(disk_list); +static struct itimerval itimer; + +static void update_io_ticks(void); +static void disk_util_timer_arm(void); +static void print_thread_status(void); + +/* + * thread life cycle + */ +enum { + TD_NOT_CREATED = 0, + TD_CREATED, + TD_RUNNING, + TD_VERIFYING, + TD_EXITED, + TD_REAPED, +}; + +#define should_fsync(td) (td_write(td) && (!(td)->odirect || (td)->override_sync)) + +static sem_t startup_sem; + +#define TERMINATE_ALL (-1) + 
+static void terminate_threads(int group_id) +{ + int i; + + for (i = 0; i < thread_number; i++) { + struct thread_data *td = &threads[i]; + + if (group_id == TERMINATE_ALL || groupid == td->groupid) { + td->terminate = 1; + td->start_delay = 0; + } + } +} + +static void sig_handler(int sig) +{ + switch (sig) { + case SIGALRM: + update_io_ticks(); + disk_util_timer_arm(); + print_thread_status(); + break; + default: + printf("\nfio: terminating on signal\n"); + fflush(stdout); + terminate_threads(TERMINATE_ALL); + break; + } +} + +static unsigned long utime_since(struct timeval *s, struct timeval *e) +{ + double sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = e->tv_usec - s->tv_usec; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + sec *= (double) 1000000; + + return sec + usec; +} + +static unsigned long utime_since_now(struct timeval *s) +{ + struct timeval t; + + gettimeofday(&t, NULL); + return utime_since(s, &t); +} + +static unsigned long mtime_since(struct timeval *s, struct timeval *e) +{ + double sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = e->tv_usec - s->tv_usec; + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + sec *= (double) 1000; + usec /= (double) 1000; + + return sec + usec; +} + +static unsigned long mtime_since_now(struct timeval *s) +{ + struct timeval t; + + gettimeofday(&t, NULL); + return mtime_since(s, &t); +} + +static inline unsigned long msec_now(struct timeval *s) +{ + return s->tv_sec * 1000 + s->tv_usec / 1000; +} + +static int random_map_free(struct thread_data *td, unsigned long long block) +{ + unsigned int idx = RAND_MAP_IDX(td, block); + unsigned int bit = RAND_MAP_BIT(td, block); + + return (td->file_map[idx] & (1UL << bit)) == 0; +} + +static int get_next_free_block(struct thread_data *td, unsigned long long *b) +{ + int i; + + *b = 0; + i = 0; + while ((*b) * td->min_bs < td->io_size) { + if (td->file_map[i] != -1UL) { + *b += ffz(td->file_map[i]); + return 0; + } + + *b += 
BLOCKS_PER_MAP; + i++; + } + + return 1; +} + +static void mark_random_map(struct thread_data *td, struct io_u *io_u) +{ + unsigned long block = io_u->offset / td->min_bs; + unsigned int blocks = 0; + + while (blocks < (io_u->buflen / td->min_bs)) { + unsigned int idx, bit; + + if (!random_map_free(td, block)) + break; + + idx = RAND_MAP_IDX(td, block); + bit = RAND_MAP_BIT(td, block); + + assert(idx < td->num_maps); + + td->file_map[idx] |= (1UL << bit); + block++; + blocks++; + } + + if ((blocks * td->min_bs) < io_u->buflen) + io_u->buflen = blocks * td->min_bs; +} + +static int get_next_offset(struct thread_data *td, unsigned long long *offset) +{ + unsigned long long b, rb; + long r; + + if (!td->sequential) { + unsigned long max_blocks = td->io_size / td->min_bs; + int loops = 50; + + do { + lrand48_r(&td->random_state, &r); + b = ((max_blocks - 1) * r / (RAND_MAX+1.0)); + rb = b + (td->file_offset / td->min_bs); + loops--; + } while (!random_map_free(td, rb) && loops); + + if (!loops) { + if (get_next_free_block(td, &b)) + return 1; + } + } else + b = td->last_bytes / td->min_bs; + + *offset = (b * td->min_bs) + td->file_offset; + if (*offset > td->file_size) + return 1; + + return 0; +} + +static unsigned int get_next_buflen(struct thread_data *td) +{ + unsigned int buflen; + long r; + + if (td->min_bs == td->max_bs) + buflen = td->min_bs; + else { + lrand48_r(&td->bsrange_state, &r); + buflen = (1 + (double) (td->max_bs - 1) * r / (RAND_MAX + 1.0)); + buflen = (buflen + td->min_bs - 1) & ~(td->min_bs - 1); + } + + if (buflen > td->io_size - td->this_io_bytes[td->ddir]) + buflen = td->io_size - td->this_io_bytes[td->ddir]; + + return buflen; +} + +static inline void add_stat_sample(struct io_stat *is, unsigned long val) +{ + if (val > is->max_val) + is->max_val = val; + if (val < is->min_val) + is->min_val = val; + + is->val += val; + is->val_sq += val * val; + is->samples++; +} + +static void add_log_sample(struct thread_data *td, struct io_log *iolog, + 
unsigned long val, int ddir) +{ + if (iolog->nr_samples == iolog->max_samples) { + int new_size = sizeof(struct io_sample) * iolog->max_samples*2; + + iolog->log = realloc(iolog->log, new_size); + iolog->max_samples <<= 1; + } + + iolog->log[iolog->nr_samples].val = val; + iolog->log[iolog->nr_samples].time = mtime_since_now(&td->epoch); + iolog->log[iolog->nr_samples].ddir = ddir; + iolog->nr_samples++; +} + +static void add_clat_sample(struct thread_data *td, int ddir,unsigned long msec) +{ + add_stat_sample(&td->clat_stat[ddir], msec); + + if (td->clat_log) + add_log_sample(td, td->clat_log, msec, ddir); +} + +static void add_slat_sample(struct thread_data *td, int ddir,unsigned long msec) +{ + add_stat_sample(&td->slat_stat[ddir], msec); + + if (td->slat_log) + add_log_sample(td, td->slat_log, msec, ddir); +} + +static void add_bw_sample(struct thread_data *td, int ddir) +{ + unsigned long spent = mtime_since_now(&td->stat_sample_time[ddir]); + unsigned long rate; + + if (spent < td->bw_avg_time) + return; + + rate = (td->this_io_bytes[ddir] - td->stat_io_bytes[ddir]) / spent; + add_stat_sample(&td->bw_stat[ddir], rate); + + if (td->bw_log) + add_log_sample(td, td->bw_log, rate, ddir); + + gettimeofday(&td->stat_sample_time[ddir], NULL); + td->stat_io_bytes[ddir] = td->this_io_bytes[ddir]; +} + +/* + * busy looping version for the last few usec + */ +static void __usec_sleep(unsigned int usec) +{ + struct timeval start; + + gettimeofday(&start, NULL); + while (utime_since_now(&start) < usec) + nop; +} + +static void usec_sleep(struct thread_data *td, unsigned long usec) +{ + struct timespec req, rem; + + req.tv_sec = usec / 1000000; + req.tv_nsec = usec * 1000 - req.tv_sec * 1000000; + + do { + if (usec < 5000) { + __usec_sleep(usec); + break; + } + + rem.tv_sec = rem.tv_nsec = 0; + if (nanosleep(&req, &rem) < 0) + break; + + if ((rem.tv_sec + rem.tv_nsec) == 0) + break; + + req.tv_nsec = rem.tv_nsec; + req.tv_sec = rem.tv_sec; + + usec = rem.tv_sec * 1000000 + 
rem.tv_nsec / 1000; + } while (!td->terminate); +} + +static void rate_throttle(struct thread_data *td, unsigned long time_spent, + unsigned int bytes) +{ + unsigned long usec_cycle; + + if (!td->rate) + return; + + usec_cycle = td->rate_usec_cycle * (bytes / td->min_bs); + + if (time_spent < usec_cycle) { + unsigned long s = usec_cycle - time_spent; + + td->rate_pending_usleep += s; + if (td->rate_pending_usleep >= 100000) { + usec_sleep(td, td->rate_pending_usleep); + td->rate_pending_usleep = 0; + } + } else { + long overtime = time_spent - usec_cycle; + + td->rate_pending_usleep -= overtime; + } +} + +static int check_min_rate(struct thread_data *td, struct timeval *now) +{ + unsigned long spent; + unsigned long rate; + int ddir = td->ddir; + + /* + * allow a 2 second settle period in the beginning + */ + if (mtime_since(&td->start, now) < 2000) + return 0; + + /* + * if rate blocks is set, sample is running + */ + if (td->rate_bytes) { + spent = mtime_since(&td->lastrate, now); + if (spent < td->ratecycle) + return 0; + + rate = (td->this_io_bytes[ddir] - td->rate_bytes) / spent; + if (rate < td->ratemin) { + printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate); + if (rate_quit) + terminate_threads(td->groupid); + return 1; + } + } + + td->rate_bytes = td->this_io_bytes[ddir]; + memcpy(&td->lastrate, now, sizeof(*now)); + return 0; +} + +static inline int runtime_exceeded(struct thread_data *td, struct timeval *t) +{ + if (!td->timeout) + return 0; + if (mtime_since(&td->epoch, t) >= td->timeout * 1000) + return 1; + + return 0; +} + +static void fill_random_bytes(struct thread_data *td, + unsigned char *p, unsigned int len) +{ + unsigned int todo; + double r; + + while (len) { + drand48_r(&td->verify_state, &r); + + /* + * lrand48_r seems to be broken and only fill the bottom + * 32-bits, even on 64-bit archs with 64-bit longs + */ + todo = sizeof(r); + if (todo > len) + todo = len; + + memcpy(p, &r, todo); + + len 
-= todo; + p += todo; + } +} + +static void hexdump(void *buffer, int len) +{ + unsigned char *p = buffer; + int i; + + for (i = 0; i < len; i++) + printf("%02x", p[i]); + printf("\n"); +} + +static int verify_io_u_crc32(struct verify_header *hdr, struct io_u *io_u) +{ + unsigned char *p = (unsigned char *) io_u->buf; + unsigned long c; + int ret; + + p += sizeof(*hdr); + c = crc32(p, hdr->len - sizeof(*hdr)); + ret = c != hdr->crc32; + + if (ret) { + fprintf(stderr, "crc32: verify failed at %llu/%u\n", io_u->offset, io_u->buflen); + fprintf(stderr, "crc32: wanted %lx, got %lx\n", hdr->crc32, c); + } + + return ret; +} + +static int verify_io_u_md5(struct verify_header *hdr, struct io_u *io_u) +{ + unsigned char *p = (unsigned char *) io_u->buf; + struct md5_ctx md5_ctx; + int ret; + + memset(&md5_ctx, 0, sizeof(md5_ctx)); + p += sizeof(*hdr); + md5_update(&md5_ctx, p, hdr->len - sizeof(*hdr)); + + ret = memcmp(hdr->md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash)); + if (ret) { + fprintf(stderr, "md5: verify failed at %llu/%u\n", io_u->offset, io_u->buflen); + hexdump(hdr->md5_digest, sizeof(hdr->md5_digest)); + hexdump(md5_ctx.hash, sizeof(md5_ctx.hash)); + } + + return ret; +} + +static int verify_io_u(struct io_u *io_u) +{ + struct verify_header *hdr = (struct verify_header *) io_u->buf; + int ret; + + if (hdr->fio_magic != FIO_HDR_MAGIC) + return 1; + + if (hdr->verify_type == VERIFY_MD5) + ret = verify_io_u_md5(hdr, io_u); + else if (hdr->verify_type == VERIFY_CRC32) + ret = verify_io_u_crc32(hdr, io_u); + else { + fprintf(stderr, "Bad verify type %d\n", hdr->verify_type); + ret = 1; + } + + return ret; +} + +static void fill_crc32(struct verify_header *hdr, void *p, unsigned int len) +{ + hdr->crc32 = crc32(p, len); +} + +static void fill_md5(struct verify_header *hdr, void *p, unsigned int len) +{ + struct md5_ctx md5_ctx; + + memset(&md5_ctx, 0, sizeof(md5_ctx)); + md5_update(&md5_ctx, p, len); + memcpy(hdr->md5_digest, md5_ctx.hash, sizeof(md5_ctx.hash)); 
+} + +/* + * fill body of io_u->buf with random data and add a header with the + * (eg) sha1sum of that data. + */ +static void populate_io_u(struct thread_data *td, struct io_u *io_u) +{ + unsigned char *p = (unsigned char *) io_u->buf; + struct verify_header hdr; + + hdr.fio_magic = FIO_HDR_MAGIC; + hdr.len = io_u->buflen; + p += sizeof(hdr); + fill_random_bytes(td, p, io_u->buflen - sizeof(hdr)); + + if (td->verify == VERIFY_MD5) { + fill_md5(&hdr, p, io_u->buflen - sizeof(hdr)); + hdr.verify_type = VERIFY_MD5; + } else { + fill_crc32(&hdr, p, io_u->buflen - sizeof(hdr)); + hdr.verify_type = VERIFY_CRC32; + } + + memcpy(io_u->buf, &hdr, sizeof(hdr)); +} + +static void put_io_u(struct thread_data *td, struct io_u *io_u) +{ + list_del(&io_u->list); + list_add(&io_u->list, &td->io_u_freelist); + td->cur_depth--; +} + +#define queue_full(td) (list_empty(&(td)->io_u_freelist)) + +static struct io_u *__get_io_u(struct thread_data *td) +{ + struct io_u *io_u; + + if (queue_full(td)) + return NULL; + + io_u = list_entry(td->io_u_freelist.next, struct io_u, list); + io_u->error = 0; + io_u->resid = 0; + list_del(&io_u->list); + list_add(&io_u->list, &td->io_u_busylist); + td->cur_depth++; + return io_u; +} + +static int td_io_prep(struct thread_data *td, struct io_u *io_u, int read) +{ + if (read) + io_u->ddir = DDIR_READ; + else + io_u->ddir = DDIR_WRITE; + + if (td->io_prep && td->io_prep(td, io_u)) + return 1; + + return 0; +} + +static struct io_u *get_io_u(struct thread_data *td) +{ + struct io_u *io_u; + + io_u = __get_io_u(td); + if (!io_u) + return NULL; + + if (get_next_offset(td, &io_u->offset)) { + put_io_u(td, io_u); + return NULL; + } + + io_u->buflen = get_next_buflen(td); + if (!io_u->buflen) { + put_io_u(td, io_u); + return NULL; + } + + if (io_u->buflen + io_u->offset > td->file_size) + io_u->buflen = td->file_size - io_u->offset; + + if (!io_u->buflen) { + put_io_u(td, io_u); + return NULL; + } + + if (!td->sequential) + mark_random_map(td, io_u); + + 
td->last_bytes += io_u->buflen; + + if (td->verify != VERIFY_NONE) + populate_io_u(td, io_u); + + if (td_io_prep(td, io_u, td_read(td))) { + put_io_u(td, io_u); + return NULL; + } + + gettimeofday(&io_u->start_time, NULL); + return io_u; +} + +static inline void td_set_runstate(struct thread_data *td, int runstate) +{ + td->old_runstate = td->runstate; + td->runstate = runstate; +} + +static int get_next_verify(struct thread_data *td, + unsigned long long *offset, unsigned int *len) +{ + struct io_piece *ipo; + + if (list_empty(&td->io_hist_list)) + return 1; + + ipo = list_entry(td->io_hist_list.next, struct io_piece, list); + list_del(&ipo->list); + + *offset = ipo->offset; + *len = ipo->len; + free(ipo); + return 0; +} + +static void prune_io_piece_log(struct thread_data *td) +{ + struct io_piece *ipo; + + while (!list_empty(&td->io_hist_list)) { + ipo = list_entry(td->io_hist_list.next, struct io_piece, list); + + list_del(&ipo->list); + free(ipo); + } +} + +/* + * log a succesful write, so we can unwind the log for verify + */ +static void log_io_piece(struct thread_data *td, struct io_u *io_u) +{ + struct io_piece *ipo = malloc(sizeof(struct io_piece)); + struct list_head *entry; + + INIT_LIST_HEAD(&ipo->list); + ipo->offset = io_u->offset; + ipo->len = io_u->buflen; + + /* + * for random io where the writes extend the file, it will typically + * be laid out with the block scattered as written. 
it's faster to + * read them in in that order again, so don't sort + */ + if (td->sequential || !td->overwrite) { + list_add_tail(&ipo->list, &td->io_hist_list); + return; + } + + /* + * for random io, sort the list so verify will run faster + */ + entry = &td->io_hist_list; + while ((entry = entry->prev) != &td->io_hist_list) { + struct io_piece *__ipo = list_entry(entry, struct io_piece, list); + + if (__ipo->offset < ipo->offset) + break; + } + + list_add(&ipo->list, entry); +} + +static int sync_td(struct thread_data *td) +{ + if (td->io_sync) + return td->io_sync(td); + + return 0; +} + +static int io_u_getevents(struct thread_data *td, int min, int max, + struct timespec *t) +{ + return td->io_getevents(td, min, max, t); +} + +static int io_u_queue(struct thread_data *td, struct io_u *io_u) +{ + gettimeofday(&io_u->issue_time, NULL); + + return td->io_queue(td, io_u); +} + +#define iocb_time(iocb) ((unsigned long) (iocb)->data) + +static void io_completed(struct thread_data *td, struct io_u *io_u, + struct io_completion_data *icd) +{ + struct timeval e; + unsigned long msec; + + gettimeofday(&e, NULL); + + if (!io_u->error) { + int idx = io_u->ddir; + + td->io_blocks[idx]++; + td->io_bytes[idx] += (io_u->buflen - io_u->resid); + td->this_io_bytes[idx] += (io_u->buflen - io_u->resid); + + msec = mtime_since(&io_u->issue_time, &e); + + add_clat_sample(td, io_u->ddir, msec); + add_bw_sample(td, io_u->ddir); + + if (td_write(td) && io_u->ddir == DDIR_WRITE) + log_io_piece(td, io_u); + + icd->bytes_done[idx] += (io_u->buflen - io_u->resid); + } else + icd->error = io_u->error; +} + +static void ios_completed(struct thread_data *td,struct io_completion_data *icd) +{ + struct io_u *io_u; + int i; + + icd->error = 0; + icd->bytes_done[0] = icd->bytes_done[1] = 0; + + for (i = 0; i < icd->nr; i++) { + io_u = td->io_event(td, i); + + io_completed(td, io_u, icd); + put_io_u(td, io_u); + } +} + +static void cleanup_pending_aio(struct thread_data *td) +{ + struct timespec 
/*
 * Verify the io_u that *io_u points at, if there is one. The unit is
 * returned to the free list and *io_u is cleared afterwards. Returns the
 * result of verify_io_u(), or 0 when nothing was pending.
 */
static int do_io_u_verify(struct thread_data *td, struct io_u **io_u)
{
	struct io_u *pending = *io_u;
	int failed;

	if (!pending)
		return 0;

	failed = verify_io_u(pending);
	put_io_u(td, pending);
	*io_u = NULL;

	return failed;
}
/*
 * Main io loop of a thread. Keeps fetching, queueing and reaping io
 * units until io_size bytes have been done in the thread's direction,
 * the thread is told to terminate, no further io_u can be obtained, an
 * error occurs, the minimum rate is not met or the runtime is exceeded.
 * Pending io is cleaned up and a final sync is issued on the way out.
 */
static void do_io(struct thread_data *td)
{
	struct io_completion_data icd;
	struct timeval s, e;
	unsigned long usec;

	while (td->this_io_bytes[td->ddir] < td->io_size) {
		/* zero timeout, used when we only want to poll for events */
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
		struct timespec *timeout;
		int ret, min_evts = 0;
		struct io_u *io_u;

		if (td->terminate)
			break;

		io_u = get_io_u(td);
		if (!io_u)
			break;

		/* remember when this unit was set up, for rate throttling */
		memcpy(&s, &io_u->start_time, sizeof(s));

		ret = io_u_queue(td, io_u);
		if (ret) {
			put_io_u(td, io_u);
			td_verror(td, ret);
			break;
		}

		/* submission latency: setup time to issue time */
		add_slat_sample(td, io_u->ddir, mtime_since(&io_u->start_time, &io_u->issue_time));

		/*
		 * while the queue has room, just poll for completions;
		 * once it is full, wait for at least one event
		 */
		if (td->cur_depth < td->iodepth) {
			timeout = &ts;
			min_evts = 0;
		} else {
			timeout = NULL;
			min_evts = 1;
		}

		ret = io_u_getevents(td, min_evts, td->cur_depth, timeout);
		if (ret < 0) {
			td_verror(td, ret);
			break;
		} else if (!ret)
			continue;

		icd.nr = ret;
		ios_completed(td, &icd);
		if (icd.error) {
			td_verror(td, icd.error);
			break;
		}

		/*
		 * the rate is batched for now, it should work for batches
		 * of completions except the very first one which may look
		 * a little bursty
		 */
		gettimeofday(&e, NULL);
		usec = utime_since(&s, &e);

		rate_throttle(td, usec, icd.bytes_done[td->ddir]);

		/* NOTE(review): a missed minimum rate is reported as ENOMEM */
		if (check_min_rate(td, &e)) {
			td_verror(td, ENOMEM);
			break;
		}

		if (runtime_exceeded(td, &e))
			break;

		if (td->thinktime)
			usec_sleep(td, td->thinktime);

		/* periodic sync, every fsync_blocks written blocks */
		if (should_fsync(td) && td->fsync_blocks &&
		    (td->io_blocks[DDIR_WRITE] % td->fsync_blocks) == 0)
			sync_td(td);
	}

	/* reap or cancel whatever is still in flight */
	if (td->cur_depth)
		cleanup_pending_aio(td);

	if (should_fsync(td))
		sync_td(td);
}
thread_data *td) +{ + if (td->io_cleanup) + td->io_cleanup(td); +} + +static int init_io(struct thread_data *td) +{ + if (td->io_engine == FIO_SYNCIO) + return fio_syncio_init(td); + else if (td->io_engine == FIO_MMAPIO) + return fio_mmapio_init(td); + else if (td->io_engine == FIO_LIBAIO) + return fio_libaio_init(td); + else if (td->io_engine == FIO_POSIXAIO) + return fio_posixaio_init(td); + else if (td->io_engine == FIO_SGIO) + return fio_sgio_init(td); + else { + fprintf(stderr, "bad io_engine %d\n", td->io_engine); + return 1; + } +} + +static void cleanup_io_u(struct thread_data *td) +{ + struct list_head *entry, *n; + struct io_u *io_u; + + list_for_each_safe(entry, n, &td->io_u_freelist) { + io_u = list_entry(entry, struct io_u, list); + + list_del(&io_u->list); + free(io_u); + } + + if (td->mem_type == MEM_MALLOC) + free(td->orig_buffer); + else if (td->mem_type == MEM_SHM) { + struct shmid_ds sbuf; + + shmdt(td->orig_buffer); + shmctl(td->shm_id, IPC_RMID, &sbuf); + } else if (td->mem_type == MEM_MMAP) + munmap(td->orig_buffer, td->orig_buffer_size); + else + fprintf(stderr, "Bad memory type %d\n", td->mem_type); + + td->orig_buffer = NULL; +} + +static int init_io_u(struct thread_data *td) +{ + struct io_u *io_u; + int i, max_units; + char *p; + + if (td->io_engine & FIO_SYNCIO) + max_units = 1; + else + max_units = td->iodepth; + + td->orig_buffer_size = td->max_bs * max_units + MASK; + + if (td->mem_type == MEM_MALLOC) + td->orig_buffer = malloc(td->orig_buffer_size); + else if (td->mem_type == MEM_SHM) { + td->shm_id = shmget(IPC_PRIVATE, td->orig_buffer_size, IPC_CREAT | 0600); + if (td->shm_id < 0) { + td_verror(td, errno); + perror("shmget"); + return 1; + } + + td->orig_buffer = shmat(td->shm_id, NULL, 0); + if (td->orig_buffer == (void *) -1) { + td_verror(td, errno); + perror("shmat"); + td->orig_buffer = NULL; + return 1; + } + } else if (td->mem_type == MEM_MMAP) { + td->orig_buffer = mmap(NULL, td->orig_buffer_size, PROT_READ | PROT_WRITE, 
MAP_PRIVATE | OS_MAP_ANON, 0, 0);
+		if (td->orig_buffer == MAP_FAILED) {
+			td_verror(td, errno);
+			perror("mmap");
+			td->orig_buffer = NULL;
+			return 1;
+		}
+	}
+
+	INIT_LIST_HEAD(&td->io_u_freelist);
+	INIT_LIST_HEAD(&td->io_u_busylist);
+	INIT_LIST_HEAD(&td->io_hist_list);
+
+	p = ALIGN(td->orig_buffer);
+	for (i = 0; i < max_units; i++) {
+		io_u = malloc(sizeof(*io_u));
+		memset(io_u, 0, sizeof(*io_u));
+		INIT_LIST_HEAD(&io_u->list);
+
+		io_u->buf = p + td->max_bs * i;
+		list_add(&io_u->list, &td->io_u_freelist);
+	}
+
+	return 0;
+}
+
+/*
+ * Lay out (extend == 0) or extend (extend != 0) the io file of a job by
+ * writing 'size' bytes of zeroes to it, at most td->max_bs bytes per
+ * write. Write jobs that did not ask for overwrite are skipped, normal io
+ * extends the file for them.
+ *
+ * The file is opened and closed again in here, td->fd is -1 on return
+ * once the open has succeeded. Returns 0 on success or when there was
+ * nothing to do, 1 on failure with the error recorded through td_verror().
+ */
+static int create_file(struct thread_data *td, unsigned long long size,
+		       int extend)
+{
+	unsigned long long left;
+	unsigned int bs;
+	int r, oflags, ret = 1;
+	char *b;
+
+	/*
+	 * unless specifically asked for overwrite, let normal io extend it
+	 */
+	if (td_write(td) && !td->overwrite)
+		return 0;
+
+	if (!size) {
+		fprintf(stderr, "Need size for create\n");
+		td_verror(td, EINVAL);
+		return 1;
+	}
+
+	if (!extend) {
+		oflags = O_CREAT | O_TRUNC;
+		printf("Client%d: Laying out IO file (%LuMiB)\n", td->thread_number, size >> 20);
+	} else {
+		oflags = O_APPEND;
+		printf("Client%d: Extending IO file (%Lu -> %LuMiB)\n", td->thread_number, (td->file_size - size) >> 20, td->file_size >> 20);
+	}
+
+	td->fd = open(td->file_name, O_WRONLY | oflags, 0644);
+	if (td->fd < 0) {
+		td_verror(td, errno);
+		return 1;
+	}
+
+	/*
+	 * from here on every exit goes through out_close, so the
+	 * descriptor is never left open behind a failed layout
+	 */
+	if (!extend && ftruncate(td->fd, td->file_size) == -1) {
+		td_verror(td, errno);
+		goto out_close;
+	}
+
+	td->io_size = td->file_size;
+	b = malloc(td->max_bs);
+	if (!b) {
+		td_verror(td, ENOMEM);
+		goto out_close;
+	}
+	memset(b, 0, td->max_bs);
+
+	ret = 0;
+	left = size;
+	while (left && !td->terminate) {
+		bs = td->max_bs;
+		if (bs > left)
+			bs = left;
+
+		r = write(td->fd, b, bs);
+		if (r != (int) bs) {
+			/*
+			 * a short write is reported as EIO. either way
+			 * the layout is incomplete, so tell the caller
+			 * instead of returning success
+			 */
+			if (r < 0)
+				td_verror(td, errno);
+			else
+				td_verror(td, EIO);
+
+			ret = 1;
+			break;
+		}
+
+		left -= bs;
+	}
+
+	if (td->terminate)
+		unlink(td->file_name);
+	else if (td->create_fsync)
+		fsync(td->fd);
+
+	free(b);
+out_close:
+	close(td->fd);
+	td->fd = -1;
+	return ret;
+}
+
+static int
file_size(struct thread_data *td) +{ + struct stat st; + + if (fstat(td->fd, &st) == -1) { + td_verror(td, errno); + return 1; + } + + if (!td->file_size) + td->file_size = st.st_size; + + return 0; +} + +static int bdev_size(struct thread_data *td) +{ + size_t bytes; + int r; + + r = blockdev_size(td->fd, &bytes); + if (r) { + td_verror(td, r); + return 1; + } + + /* + * no extend possibilities, so limit size to device size if too large + */ + if (!td->file_size || td->file_size > bytes) + td->file_size = bytes; + + return 0; +} + +static int get_file_size(struct thread_data *td) +{ + int ret; + + if (td->filetype == FIO_TYPE_FILE) + ret = file_size(td); + else + ret = bdev_size(td); + + if (ret) + return ret; + + if (td->file_offset > td->file_size) { + fprintf(stderr, "Client%d: offset larger than length (%Lu > %Lu)\n", td->thread_number, td->file_offset, td->file_size); + return 1; + } + + td->io_size = td->file_size - td->file_offset; + if (td->io_size == 0) { + fprintf(stderr, "Client%d: no io blocks\n", td->thread_number); + td_verror(td, EINVAL); + return 1; + } + + td->total_io_size = td->io_size * td->loops; + return 0; +} + +static int setup_file_mmap(struct thread_data *td) +{ + int flags; + + if (td_read(td)) + flags = PROT_READ; + else { + flags = PROT_WRITE; + + if (td->verify != VERIFY_NONE) + flags |= PROT_READ; + } + + td->mmap = mmap(NULL, td->file_size, flags, MAP_SHARED, td->fd, td->file_offset); + if (td->mmap == MAP_FAILED) { + td->mmap = NULL; + td_verror(td, errno); + return 1; + } + + if (td->invalidate_cache) { + if (madvise(td->mmap, td->file_size, MADV_DONTNEED) < 0) { + td_verror(td, errno); + return 1; + } + } + + if (td->sequential) { + if (madvise(td->mmap, td->file_size, MADV_SEQUENTIAL) < 0) { + td_verror(td, errno); + return 1; + } + } else { + if (madvise(td->mmap, td->file_size, MADV_RANDOM) < 0) { + td_verror(td, errno); + return 1; + } + } + + return 0; +} + +static int setup_file_plain(struct thread_data *td) +{ + if 
(td->invalidate_cache) { + if (fadvise(td->fd, td->file_offset, td->file_size, POSIX_FADV_DONTNEED) < 0) { + td_verror(td, errno); + return 1; + } + } + + if (td->sequential) { + if (fadvise(td->fd, td->file_offset, td->file_size, POSIX_FADV_SEQUENTIAL) < 0) { + td_verror(td, errno); + return 1; + } + } else { + if (fadvise(td->fd, td->file_offset, td->file_size, POSIX_FADV_RANDOM) < 0) { + td_verror(td, errno); + return 1; + } + } + + return 0; +} + +static int setup_file(struct thread_data *td) +{ + struct stat st; + int flags = 0; + + if (stat(td->file_name, &st) == -1) { + if (errno != ENOENT) { + td_verror(td, errno); + return 1; + } + if (!td->create_file) { + td_verror(td, ENOENT); + return 1; + } + if (create_file(td, td->file_size, 0)) + return 1; + } else if (td->filetype == FIO_TYPE_FILE) { + if (st.st_size < td->file_size) { + if (create_file(td, td->file_size - st.st_size, 1)) + return 1; + } + } + + if (td->odirect) + flags |= O_DIRECT; + + if (td_read(td)) + td->fd = open(td->file_name, flags | O_RDONLY); + else { + if (td->filetype == FIO_TYPE_FILE) { + if (!td->overwrite) + flags |= O_TRUNC; + + flags |= O_CREAT; + } + if (td->sync_io) + flags |= O_SYNC; + + flags |= O_RDWR; + + td->fd = open(td->file_name, flags, 0600); + } + + if (td->fd == -1) { + td_verror(td, errno); + return 1; + } + + if (get_file_size(td)) + return 1; + + if (td->io_engine != FIO_MMAPIO) + return setup_file_plain(td); + else + return setup_file_mmap(td); +} + +static int check_dev_match(dev_t dev, char *path) +{ + unsigned int major, minor; + char line[256], *p; + FILE *f; + + f = fopen(path, "r"); + if (!f) { + perror("open path"); + return 1; + } + + p = fgets(line, sizeof(line), f); + if (!p) { + fclose(f); + return 1; + } + + if (sscanf(p, "%u:%u", &major, &minor) != 2) { + fclose(f); + return 1; + } + + if (((major << 8) | minor) == dev) { + fclose(f); + return 0; + } + + fclose(f); + return 1; +} + +static int find_block_dir(dev_t dev, char *path) +{ + struct dirent 
*dir; + struct stat st; + int found = 0; + DIR *D; + + D = opendir(path); + if (!D) + return 0; + + while ((dir = readdir(D)) != NULL) { + char full_path[256]; + + if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, "..")) + continue; + if (!strcmp(dir->d_name, "device")) + continue; + + sprintf(full_path, "%s/%s", path, dir->d_name); + + if (!strcmp(dir->d_name, "dev")) { + if (!check_dev_match(dev, full_path)) { + found = 1; + break; + } + } + + if (stat(full_path, &st) == -1) { + perror("stat"); + break; + } + + if (!S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode)) + continue; + + found = find_block_dir(dev, full_path); + if (found) { + strcpy(path, full_path); + break; + } + } + + closedir(D); + return found; +} + +static int get_io_ticks(struct disk_util *du, struct disk_util_stat *dus) +{ + unsigned in_flight; + char line[256]; + FILE *f; + char *p; + + f = fopen(du->path, "r"); + if (!f) + return 1; + + p = fgets(line, sizeof(line), f); + if (!p) { + fclose(f); + return 1; + } + + if (sscanf(p, "%u %u %llu %u %u %u %llu %u %u %u %u\n", &dus->ios[0], &dus->merges[0], &dus->sectors[0], &dus->ticks[0], &dus->ios[1], &dus->merges[1], &dus->sectors[1], &dus->ticks[1], &in_flight, &dus->io_ticks, &dus->time_in_queue) != 11) { + fclose(f); + return 1; + } + + fclose(f); + return 0; +} + +static void update_io_tick_disk(struct disk_util *du) +{ + struct disk_util_stat __dus, *dus, *ldus; + struct timeval t; + + if (get_io_ticks(du, &__dus)) + return; + + dus = &du->dus; + ldus = &du->last_dus; + + dus->sectors[0] += (__dus.sectors[0] - ldus->sectors[0]); + dus->sectors[1] += (__dus.sectors[1] - ldus->sectors[1]); + dus->ios[0] += (__dus.ios[0] - ldus->ios[0]); + dus->ios[1] += (__dus.ios[1] - ldus->ios[1]); + dus->merges[0] += (__dus.merges[0] - ldus->merges[0]); + dus->merges[1] += (__dus.merges[1] - ldus->merges[1]); + dus->ticks[0] += (__dus.ticks[0] - ldus->ticks[0]); + dus->ticks[1] += (__dus.ticks[1] - ldus->ticks[1]); + dus->io_ticks += (__dus.io_ticks - 
ldus->io_ticks);
+	dus->time_in_queue += (__dus.time_in_queue - ldus->time_in_queue);
+
+	gettimeofday(&t, NULL);
+	du->msec += mtime_since(&du->time, &t);
+	memcpy(&du->time, &t, sizeof(t));
+	memcpy(ldus, &__dus, sizeof(__dus));
+}
+
+/*
+ * Refresh the accumulated statistics of every disk on disk_list.
+ */
+static void update_io_ticks(void)
+{
+	struct list_head *entry;
+	struct disk_util *du;
+
+	list_for_each(entry, &disk_list) {
+		du = list_entry(entry, struct disk_util, list);
+		update_io_tick_disk(du);
+	}
+}
+
+/*
+ * Returns 1 if a disk with device number 'dev' is already on disk_list,
+ * 0 if not.
+ */
+static int disk_util_exists(dev_t dev)
+{
+	struct list_head *entry;
+	struct disk_util *du;
+
+	list_for_each(entry, &disk_list) {
+		du = list_entry(entry, struct disk_util, list);
+
+		if (du->dev == dev)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Start tracking the disk with device number 'dev'. 'path' is the
+ * directory holding its "stat" file, the last component of 'path' becomes
+ * the disk name. The current counters are read into last_dus as the
+ * baseline for later updates, then the entry is appended to disk_list.
+ * If no memory is available the disk is simply not tracked.
+ */
+static void disk_util_add(dev_t dev, char *path)
+{
+	struct disk_util *du = malloc(sizeof(*du));
+
+	if (!du) {
+		perror("malloc");
+		return;
+	}
+
+	memset(du, 0, sizeof(*du));
+	INIT_LIST_HEAD(&du->list);
+	/*
+	 * callers hand in a buffer as large as du->path, so appending
+	 * "/stat" must be bounded
+	 */
+	snprintf(du->path, sizeof(du->path), "%s/stat", path);
+	du->name = strdup(basename(path));
+	du->dev = dev;
+
+	gettimeofday(&du->time, NULL);
+	get_io_ticks(du, &du->last_dus);
+
+	list_add_tail(&du->list, &disk_list);
+}
+
+/*
+ * If the job asked for disk utilization stats, find the device backing
+ * its file and add it to disk_list unless it is tracked already.
+ */
+static void init_disk_util(struct thread_data *td)
+{
+	struct stat st;
+	char foo[256], tmp[256];
+	dev_t dev;
+	char *p;
+
+	if (!td->do_disk_util)
+		return;
+
+	if (!stat(td->file_name, &st)) {
+		if (S_ISBLK(st.st_mode))
+			dev = st.st_rdev;
+		else
+			dev = st.st_dev;
+	} else {
+		/*
+		 * must be a file, open "."
in that path
+		 */
+		strcpy(foo, td->file_name);
+		p = dirname(foo);
+		if (stat(p, &st)) {
+			perror("disk util stat");
+			return;
+		}
+
+		dev = st.st_dev;
+	}
+
+	if (disk_util_exists(dev))
+		return;
+
+	sprintf(foo, "/sys/block");
+	if (!find_block_dir(dev, foo))
+		return;
+
+	/*
+	 * if this is inside a partition dir, jump back to parent
+	 */
+	sprintf(tmp, "%s/queue", foo);
+	if (stat(tmp, &st)) {
+		p = dirname(foo);
+		sprintf(tmp, "%s/queue", p);
+		if (stat(tmp, &st)) {
+			fprintf(stderr, "unknown sysfs layout\n");
+			return;
+		}
+		sprintf(foo, "%s", p);
+	}
+
+	disk_util_add(dev, foo);
+}
+
+/*
+ * Arm the real-time interval timer to expire DISK_UTIL_MSEC milliseconds
+ * from now.
+ */
+static void disk_util_timer_arm(void)
+{
+	itimer.it_value.tv_sec = 0;
+	itimer.it_value.tv_usec = DISK_UTIL_MSEC * 1000;
+	setitimer(ITIMER_REAL, &itimer, NULL);
+}
+
+/*
+ * Reset the per-loop io state of a job: rewind the file for the sync io
+ * engine, zero the current offset and the byte counters of this loop, and
+ * clear the map of blocks already touched, if there is one.
+ */
+static void clear_io_state(struct thread_data *td)
+{
+	/*
+	 * lseek() takes (fd, offset, whence). the arguments used to be
+	 * swapped here, which only worked because SEEK_SET is 0
+	 */
+	if (td->io_engine == FIO_SYNCIO)
+		lseek(td->fd, 0, SEEK_SET);
+
+	td->cur_off = 0;
+	td->last_bytes = 0;
+	td->stat_io_bytes[0] = td->stat_io_bytes[1] = 0;
+	td->this_io_bytes[0] = td->this_io_bytes[1] = 0;
+
+	if (td->file_map)
+		memset(td->file_map, 0, td->num_maps * sizeof(long));
+}
+
+/*
+ * Add the user time, system time and context switches used since
+ * td->ru_start to the totals of the job, then make the current usage the
+ * new starting point. Does nothing as long as the job has no runtime.
+ */
+static void update_rusage_stat(struct thread_data *td)
+{
+	if (!(td->runtime[0] + td->runtime[1]))
+		return;
+
+	getrusage(RUSAGE_SELF, &td->ru_end);
+
+	td->usr_time += mtime_since(&td->ru_start.ru_utime, &td->ru_end.ru_utime);
+	td->sys_time += mtime_since(&td->ru_start.ru_stime, &td->ru_end.ru_stime);
+	td->ctx += td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw - (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw);
+
+	memcpy(&td->ru_start, &td->ru_end, sizeof(td->ru_end));
+}
+
+/*
+ * Body of one job. 'data' is the struct thread_data of the job.
+ */
+static void *thread_main(void *data)
+{
+	struct thread_data *td = data;
+	int ret = 1;
+
+	if (!td->use_thread)
+		setsid();
+
+	td->pid = getpid();
+
+	if (init_io_u(td))
+		goto err;
+
+	if (fio_setaffinity(td) == -1) {
+		td_verror(td, errno);
+		goto err;
+	}
+
+	if (init_io(td))
+		goto err;
+
+	if (td->ioprio) {
+		if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
+			td_verror(td, errno);
+			goto err;
+		}
+	}
+
+	sem_post(&startup_sem);
+	sem_wait(&td->mutex);
+
+	if (!td->create_serialize && setup_file(td))
+		goto err;
+
+	if (init_random_state(td))
+		goto err;
+
+	gettimeofday(&td->epoch, NULL);
+
+	while (td->loops--) {
+		getrusage(RUSAGE_SELF, &td->ru_start);
+		gettimeofday(&td->start, NULL);
+		memcpy(&td->stat_sample_time, &td->start, sizeof(td->start));
+
+		if (td->ratemin)
+			memcpy(&td->lastrate, &td->stat_sample_time, sizeof(td->lastrate));
+
+		clear_io_state(td);
+		prune_io_piece_log(td);
+
+		do_io(td);
+
+		td->runtime[td->ddir] += mtime_since_now(&td->start);
+		update_rusage_stat(td);
+
+		if (td->error || td->terminate)
+			break;
+
+		if (td->verify == VERIFY_NONE)
+			continue;
+
+		clear_io_state(td);
+		gettimeofday(&td->start, NULL);
+
+		do_verify(td);
+
+		td->runtime[DDIR_READ] += mtime_since_now(&td->start);
+
+		if (td->error || td->terminate)
+			break;
+	}
+
+	ret = 0;
+
+	if (td->bw_log)
+		finish_log(td, td->bw_log, "bw");
+	if (td->slat_log)
+		finish_log(td, td->slat_log, "slat");
+	if (td->clat_log)
+		finish_log(td, td->clat_log, "clat");
+
+	if (exitall_on_terminate)
+		terminate_threads(td->groupid);
+
+err:
+	if (td->fd != -1) {
+		close(td->fd);
+		td->fd = -1;
+	}
+	if (td->mmap)
+		munmap(td->mmap, td->file_size);
+	cleanup_io(td);
+	cleanup_io_u(td);
+	if (ret) {
+		sem_post(&startup_sem);
+		sem_wait(&td->mutex);
+	}
+	td_set_runstate(td, TD_EXITED);
+	return NULL;
+
+}
+
+/*
+ * Entry point of a forked job: attach the shared memory segment 'shmid',
+ * run thread_main() on entry number 'offset' of the struct thread_data
+ * array stored in it, and detach again. Always returns NULL.
+ */
+static void *fork_main(int shmid, int offset)
+{
+	struct thread_data *td;
+	void *data;
+
+	data = shmat(shmid, NULL, 0);
+	if (data == (void *) -1) {
+		perror("shmat");
+		return NULL;
+	}
+
+	/*
+	 * index through a typed pointer, arithmetic on void * is a
+	 * compiler extension
+	 */
+	td = (struct thread_data *) data + offset;
+	thread_main(td);
+	shmdt(data);
+	return NULL;
+}
+
+/*
+ * Derive min, max, mean and standard deviation from the samples
+ * accumulated in 'is'. Returns 0 without a usable result if there are no
+ * samples or if everything comes out as zero, 1 otherwise.
+ */
+static int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
+		    double *mean, double *dev)
+{
+	double n, var;
+
+	if (is->samples == 0)
+		return 0;
+
+	*min = is->min_val;
+	*max = is->max_val;
+
+	n = (double) is->samples;
+	*mean = (double) is->val / n;
+
+	/*
+	 * sample standard deviation from the running sums:
+	 * sqrt((sum_sq - n * mean^2) / (n - 1)). this assumes val_sq is
+	 * the sum of the squared samples, the accumulation is not done
+	 * in this function. a single sample has no spread, and rounding
+	 * can push the numerator below zero, so report 0 in both cases
+	 * instead of inf or NaN.
+	 */
+	var = 0.0;
+	if (n > 1.0)
+		var = ((double) is->val_sq - n * *mean * *mean) / (n - 1.0);
+	*dev = var > 0.0 ? sqrt(var) : 0.0;
+
+	if (!(*min + *max) && !(*mean + *dev))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Print the io, bandwidth and latency summary of job 'td' for data
+ * direction 'ddir'. 'rs' holds the stats of the group the job belongs to.
+ * Prints nothing if the job has no runtime in that direction.
+ */
+static void show_ddir_status(struct thread_data *td, struct group_run_stats *rs,
+			     int ddir)
+{
+	char *ddir_str[] = { "read ", "write" };
+	unsigned long min, max, bw;
+	double mean, dev;
+
+	if (!td->runtime[ddir])
+		return;
+
+	bw = td->io_bytes[ddir] / td->runtime[ddir];
+	printf(" %s: io=%6luMiB, bw=%6luKiB/s, runt=%6lumsec\n", ddir_str[ddir], td->io_bytes[ddir] >> 20, bw, td->runtime[ddir]);
+
+	if (calc_lat(&td->slat_stat[ddir], &min, &max, &mean, &dev))
+		printf(" slat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
+
+	if (calc_lat(&td->clat_stat[ddir], &min, &max, &mean, &dev))
+		printf(" clat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
+
+	if (calc_lat(&td->bw_stat[ddir], &min, &max, &mean, &dev)) {
+		double p_of_agg = 0.0;
+
+		/*
+		 * the group aggregate can be zero, don't divide by it
+		 */
+		if (rs->agg[ddir])
+			p_of_agg = mean * 100 / (double) rs->agg[ddir];
+
+		printf(" bw (KiB/s) : min=%5lu, max=%5lu, per=%3.2f%%, avg=%5.02f, dev=%5.02f\n", min, max, p_of_agg, mean, dev);
+	}
+}
+
+/*
+ * Print the summary of job 'td': both data directions, the one the job
+ * runs in first, followed by cpu usage relative to its total runtime.
+ * Jobs that moved no data and hit no error are left out.
+ */
+static void show_thread_status(struct thread_data *td,
+			       struct group_run_stats *rs)
+{
+	double usr_cpu, sys_cpu;
+
+	if (!(td->io_bytes[0] + td->io_bytes[1]) && !td->error)
+		return;
+
+	printf("Client%d (groupid=%d): err=%2d:\n", td->thread_number, td->groupid, td->error);
+
+	show_ddir_status(td, rs, td->ddir);
+	show_ddir_status(td, rs, td->ddir ^ 1);
+
+	if (td->runtime[0] + td->runtime[1]) {
+		double runt = td->runtime[0] + td->runtime[1];
+
+		usr_cpu = (double) td->usr_time * 100 / runt;
+		sys_cpu = (double) td->sys_time * 100 / runt;
+	} else {
+		usr_cpu = 0;
+		sys_cpu = 0;
+	}
+
+	printf(" cpu : usr=%3.2f%%, sys=%3.2f%%, ctx=%lu\n", usr_cpu, sys_cpu, td->ctx);
+}
+
+static void check_str_update(struct thread_data *td)
+{
+	char c = run_str[td->thread_number - 1];
+
+	if (td->runstate == td->old_runstate)
+		return;
+
+	switch (td->runstate) {
+		case
TD_REAPED: + c = '_'; + break; + case TD_EXITED: + c = 'E'; + break; + case TD_RUNNING: + if (td_read(td)) { + if (td->sequential) + c = 'R'; + else + c = 'r'; + } else { + if (td->sequential) + c = 'W'; + else + c = 'w'; + } + break; + case TD_VERIFYING: + c = 'V'; + break; + case TD_CREATED: + c = 'C'; + break; + case TD_NOT_CREATED: + c = 'P'; + break; + default: + printf("state %d\n", td->runstate); + } + + run_str[td->thread_number - 1] = c; + td->old_runstate = td->runstate; +} + +static void print_thread_status(void) +{ + unsigned long long bytes_done, bytes_total; + int i, nr_running, t_rate, m_rate; + double perc; + + bytes_done = bytes_total = 0; + nr_running = t_rate = m_rate = 0; + for (i = 0; i < thread_number; i++) { + struct thread_data *td = &threads[i]; + + if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING){ + nr_running++; + t_rate += td->rate; + m_rate += td->ratemin; + } + + bytes_total += td->total_io_size; + if (td->verify) + bytes_total += td->total_io_size; + + bytes_done += td->io_bytes[DDIR_READ] +td->io_bytes[DDIR_WRITE]; + + check_str_update(td); + } + + perc = 0; + if (bytes_total && bytes_done) { + perc = (double) 100 * bytes_done / (double) bytes_total; + if (perc > 100.0) + perc = 100.0; + } + + printf("Threads now running: %d", nr_running); + if (m_rate || t_rate) + printf(", commitrate %d/%dKiB/sec", t_rate, m_rate); + printf(" : [%s] [%3.2f%% done]\r", run_str, perc); + fflush(stdout); +} + +static void reap_threads(int *nr_running, int *t_rate, int *m_rate) +{ + int i; + + /* + * reap exited threads (TD_EXITED -> TD_REAPED) + */ + for (i = 0; i < thread_number; i++) { + struct thread_data *td = &threads[i]; + + if (td->runstate != TD_EXITED) + continue; + + td_set_runstate(td, TD_REAPED); + + if (td->use_thread) { + long ret; + + if (pthread_join(td->thread, (void *) &ret)) + perror("thread_join"); + } else + waitpid(td->pid, NULL, 0); + + (*nr_running)--; + (*m_rate) -= td->ratemin; + (*t_rate) -= td->rate; + } +} + 
+static void run_threads(void) +{ + struct timeval genesis; + struct thread_data *td; + unsigned long spent; + int i, todo, nr_running, m_rate, t_rate, nr_started; + + printf("Starting %d thread%s\n", thread_number, thread_number > 1 ? "s" : ""); + fflush(stdout); + + signal(SIGINT, sig_handler); + signal(SIGALRM, sig_handler); + + todo = thread_number; + nr_running = 0; + nr_started = 0; + m_rate = t_rate = 0; + + for (i = 0; i < thread_number; i++) { + td = &threads[i]; + + run_str[td->thread_number - 1] = 'P'; + + init_disk_util(td); + + if (!td->create_serialize) + continue; + + /* + * do file setup here so it happens sequentially, + * we don't want X number of threads getting their + * client data interspersed on disk + */ + if (setup_file(td)) { + td_set_runstate(td, TD_REAPED); + todo--; + } + } + + gettimeofday(&genesis, NULL); + + while (todo) { + /* + * create threads (TD_NOT_CREATED -> TD_CREATED) + */ + for (i = 0; i < thread_number; i++) { + td = &threads[i]; + + if (td->runstate != TD_NOT_CREATED) + continue; + + /* + * never got a chance to start, killed by other + * thread for some reason + */ + if (td->terminate) { + todo--; + continue; + } + + if (td->start_delay) { + spent = mtime_since_now(&genesis); + + if (td->start_delay * 1000 > spent) + continue; + } + + if (td->stonewall && (nr_started || nr_running)) + break; + + td_set_runstate(td, TD_CREATED); + sem_init(&startup_sem, 0, 1); + todo--; + nr_started++; + + if (td->use_thread) { + if (pthread_create(&td->thread, NULL, thread_main, td)) { + perror("thread_create"); + nr_started--; + } + } else { + if (fork()) + sem_wait(&startup_sem); + else { + fork_main(shm_id, i); + exit(0); + } + } + } + + /* + * start created threads (TD_CREATED -> TD_RUNNING) + */ + for (i = 0; i < thread_number; i++) { + td = &threads[i]; + + if (td->runstate != TD_CREATED) + continue; + + td_set_runstate(td, TD_RUNNING); + nr_running++; + nr_started--; + m_rate += td->ratemin; + t_rate += td->rate; + 
sem_post(&td->mutex);
+		}
+
+		reap_threads(&nr_running, &t_rate, &m_rate);
+
+		if (todo)
+			usleep(100000);
+	}
+
+	while (nr_running) {
+		reap_threads(&nr_running, &t_rate, &m_rate);
+		usleep(10000);
+	}
+
+	update_io_ticks();
+}
+
+/*
+ * Print the aggregate read and write lines for group 'id'. A direction
+ * is only printed if some job of the group has runtime in it.
+ */
+static void show_group_stats(struct group_run_stats *rs, int id)
+{
+	printf("\nRun status group %d (all jobs):\n", id);
+
+	if (rs->max_run[DDIR_READ])
+		printf(" READ: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", rs->io_mb[0], rs->agg[0], rs->min_bw[0], rs->max_bw[0], rs->min_run[0], rs->max_run[0]);
+	if (rs->max_run[DDIR_WRITE])
+		printf(" WRITE: io=%luMiB, aggrb=%lu, minb=%lu, maxb=%lu, mint=%lumsec, maxt=%lumsec\n", rs->io_mb[1], rs->agg[1], rs->min_bw[1], rs->max_bw[1], rs->min_run[1], rs->max_run[1]);
+}
+
+/*
+ * Print the accumulated io counters and the utilization of every disk on
+ * disk_list. Utilization is io_ticks as a percentage of the time the disk
+ * has been sampled, capped at 100.
+ */
+static void show_disk_util(void)
+{
+	struct disk_util_stat *dus;
+	struct list_head *entry;
+	struct disk_util *du;
+	double util;
+
+	printf("\nDisk stats (read/write):\n");
+
+	list_for_each(entry, &disk_list) {
+		du = list_entry(entry, struct disk_util, list);
+		dus = &du->dus;
+
+		/*
+		 * a disk with no sampled time yet has no utilization,
+		 * dividing by zero would print inf or nan
+		 */
+		util = 0.0;
+		if (du->msec)
+			util = (double) 100 * du->dus.io_ticks / (double) du->msec;
+		if (util > 100.0)
+			util = 100.0;
+
+		printf(" %s: ios=%u/%u, merge=%u/%u, ticks=%u/%u, in_queue=%u, util=%3.2f%%\n", du->name, dus->ios[0], dus->ios[1], dus->merges[0], dus->merges[1], dus->ticks[0], dus->ticks[1], dus->time_in_queue, util);
+	}
+}
+
+static void show_run_stats(void)
+{
+	struct group_run_stats *runstats, *rs;
+	struct thread_data *td;
+	int i;
+
+	runstats = malloc(sizeof(struct group_run_stats) * (groupid + 1));
+
+	for (i = 0; i < groupid + 1; i++) {
+		rs = &runstats[i];
+
+		memset(rs, 0, sizeof(*rs));
+		rs->min_bw[0] = rs->min_run[0] = ~0UL;
+		rs->min_bw[1] = rs->min_run[1] = ~0UL;
+	}
+
+	for (i = 0; i < thread_number; i++) {
+		unsigned long rbw, wbw;
+
+		td = &threads[i];
+
+		if (td->error) {
+			printf("Client%d: %s\n", td->thread_number, td->verror);
+			continue;
+		}
+
+		rs = &runstats[td->groupid];
+
+		if (td->runtime[0] <
rs->min_run[0] || !rs->min_run[0]) + rs->min_run[0] = td->runtime[0]; + if (td->runtime[0] > rs->max_run[0]) + rs->max_run[0] = td->runtime[0]; + if (td->runtime[1] < rs->min_run[1] || !rs->min_run[1]) + rs->min_run[1] = td->runtime[1]; + if (td->runtime[1] > rs->max_run[1]) + rs->max_run[1] = td->runtime[1]; + + rbw = wbw = 0; + if (td->runtime[0]) + rbw = td->io_bytes[0] / td->runtime[0]; + if (td->runtime[1]) + wbw = td->io_bytes[1] / td->runtime[1]; + + if (rbw < rs->min_bw[0]) + rs->min_bw[0] = rbw; + if (wbw < rs->min_bw[1]) + rs->min_bw[1] = wbw; + if (rbw > rs->max_bw[0]) + rs->max_bw[0] = rbw; + if (wbw > rs->max_bw[1]) + rs->max_bw[1] = wbw; + + rs->io_mb[0] += td->io_bytes[0] >> 20; + rs->io_mb[1] += td->io_bytes[1] >> 20; + } + + for (i = 0; i < groupid + 1; i++) { + rs = &runstats[i]; + + if (rs->max_run[0]) + rs->agg[0] = (rs->io_mb[0]*1024*1000) / rs->max_run[0]; + if (rs->max_run[1]) + rs->agg[1] = (rs->io_mb[1]*1024*1000) / rs->max_run[1]; + } + + /* + * don't overwrite last signal output + */ + printf("\n"); + + for (i = 0; i < thread_number; i++) { + td = &threads[i]; + rs = &runstats[td->groupid]; + + show_thread_status(td, rs); + } + + for (i = 0; i < groupid + 1; i++) + show_group_stats(&runstats[i], i); + + show_disk_util(); +} + +int main(int argc, char *argv[]) +{ + if (parse_options(argc, argv)) + return 1; + + if (!thread_number) { + printf("Nothing to do\n"); + return 1; + } + + disk_util_timer_arm(); + + run_threads(); + show_run_stats(); + + return 0; +} diff --git a/fio.h b/fio.h new file mode 100644 index 00000000..dbce4af3 --- /dev/null +++ b/fio.h @@ -0,0 +1,312 @@ +#ifndef FIO_H +#define FIO_H + +#include +#include +#include +#include +#include +#include + +#include "list.h" +#include "md5.h" +#include "crc32.h" +#include "arch.h" +#include "os.h" + +struct io_stat { + unsigned long val; + unsigned long val_sq; + unsigned long max_val; + unsigned long min_val; + unsigned long samples; +}; + +struct io_sample { + unsigned long 
time; + unsigned long val; + unsigned int ddir; +}; + +struct io_log { + unsigned long nr_samples; + unsigned long max_samples; + struct io_sample *log; +}; + +struct io_piece { + struct list_head list; + unsigned long long offset; + unsigned int len; +}; + +/* + * The io unit + */ +struct io_u { + union { +#ifdef FIO_HAVE_LIBAIO + struct iocb iocb; +#endif +#ifdef FIO_HAVE_POSIXAIO + struct aiocb aiocb; +#endif +#ifdef FIO_HAVE_SGIO + struct sg_io_hdr hdr; +#endif + }; + struct timeval start_time; + struct timeval issue_time; + + char *buf; + unsigned int buflen; + unsigned long long offset; + + unsigned int resid; + unsigned int error; + + unsigned char seen; + unsigned char ddir; + + struct list_head list; +}; + +#define FIO_HDR_MAGIC 0xf00baaef + +enum { + VERIFY_NONE = 0, + VERIFY_MD5, + VERIFY_CRC32, +}; + +struct verify_header { + unsigned int fio_magic; + unsigned int len; + unsigned int verify_type; + union { + char md5_digest[MD5_HASH_WORDS * 4]; + unsigned long crc32; + }; +}; + +struct group_run_stats { + unsigned long max_run[2], min_run[2]; + unsigned long max_bw[2], min_bw[2]; + unsigned long io_mb[2]; + unsigned long agg[2]; +}; + +struct thread_data { + char file_name[256]; + char directory[256]; + char verror[80]; + pthread_t thread; + int thread_number; + int groupid; + int filetype; + int error; + int fd; + void *mmap; + pid_t pid; + char *orig_buffer; + size_t orig_buffer_size; + volatile int terminate; + volatile int runstate; + volatile int old_runstate; + unsigned int ddir; + unsigned int ioprio; + unsigned int sequential; + unsigned int bs; + unsigned int min_bs; + unsigned int max_bs; + unsigned int odirect; + unsigned int thinktime; + unsigned int fsync_blocks; + unsigned int start_delay; + unsigned int timeout; + unsigned int io_engine; + unsigned int create_file; + unsigned int overwrite; + unsigned int invalidate_cache; + unsigned int bw_avg_time; + unsigned int create_serialize; + unsigned int create_fsync; + unsigned int loops; + 
unsigned long long file_size; + unsigned long long file_offset; + unsigned int sync_io; + unsigned int mem_type; + unsigned int verify; + unsigned int stonewall; + unsigned int numjobs; + unsigned int use_thread; + unsigned int iodepth; + os_cpu_mask_t cpumask; + + struct drand48_data bsrange_state; + struct drand48_data verify_state; + + int shm_id; + + unsigned long long cur_off; + + void *io_data; + char io_engine_name[16]; + int (*io_prep)(struct thread_data *, struct io_u *); + int (*io_queue)(struct thread_data *, struct io_u *); + int (*io_getevents)(struct thread_data *, int, int, struct timespec *); + struct io_u *(*io_event)(struct thread_data *, int); + int (*io_cancel)(struct thread_data *, struct io_u *); + void (*io_cleanup)(struct thread_data *); + int (*io_sync)(struct thread_data *); + + unsigned int cur_depth; + struct list_head io_u_freelist; + struct list_head io_u_busylist; + + unsigned int rate; + unsigned int ratemin; + unsigned int ratecycle; + unsigned long rate_usec_cycle; + long rate_pending_usleep; + unsigned long rate_bytes; + struct timeval lastrate; + + unsigned long runtime[2]; /* msec */ + unsigned long long io_size; + unsigned long long total_io_size; + + unsigned long io_blocks[2]; + unsigned long io_bytes[2]; + unsigned long this_io_bytes[2]; + unsigned long last_bytes; + sem_t mutex; + + struct drand48_data random_state; + unsigned long *file_map; + unsigned int num_maps; + + /* + * bandwidth and latency stats + */ + struct io_stat clat_stat[2]; /* completion latency */ + struct io_stat slat_stat[2]; /* submission latency */ + struct io_stat bw_stat[2]; /* bandwidth stats */ + + unsigned long stat_io_bytes[2]; + struct timeval stat_sample_time[2]; + + struct io_log *slat_log; + struct io_log *clat_log; + struct io_log *bw_log; + + struct timeval start; /* start of this loop */ + struct timeval epoch; /* time job was started */ + + struct rusage ru_start; + struct rusage ru_end; + unsigned long usr_time; + unsigned long sys_time; 
+	unsigned long ctx;
+
+	unsigned int do_disk_util;
+	unsigned int override_sync;
+
+	struct list_head io_hist_list;
+};
+
+/*
+ * Record error code 'err' in td->error and format a description of it,
+ * including the file and line of the call site, into td->verror. The
+ * local has a name callers won't use, so an argument expression can never
+ * be captured by it.
+ */
+#define td_verror(td, err)						\
+	do {								\
+		int td_verror_err_ = (err);				\
+		(td)->error = td_verror_err_;				\
+		snprintf((td)->verror, sizeof((td)->verror),		\
+			 "file:%s:%d, error=%s", __FILE__, __LINE__,	\
+			 strerror(td_verror_err_));			\
+	} while (0)
+
+extern int parse_jobs_ini(char *);
+extern int parse_options(int, char **);
+extern void finish_log(struct thread_data *, struct io_log *, const char *);
+extern int init_random_state(struct thread_data *);
+
+extern int rate_quit;
+extern int write_lat_log;
+extern int write_bw_log;
+extern int exitall_on_terminate;
+extern int thread_number;
+extern int shm_id;
+extern int groupid;
+
+extern struct thread_data *threads;
+
+/*
+ * Data direction, also used as index into the per-direction arrays
+ */
+enum {
+	DDIR_READ = 0,
+	DDIR_WRITE,
+};
+
+/*
+ * What type of allocation to use for io buffers
+ */
+enum {
+	MEM_MALLOC,	/* ordinary malloc */
+	MEM_SHM,	/* use shared memory segments */
+	MEM_MMAP,	/* use anonymous mmap */
+};
+
+/*
+ * The type of object we are working on
+ */
+enum {
+	FIO_TYPE_FILE = 1,
+	FIO_TYPE_BD,
+};
+
+/*
+ * io engines. FIO_MMAPIO carries the FIO_SYNCIO bit as well, so a test
+ * with & FIO_SYNCIO matches both of them while == FIO_SYNCIO does not.
+ */
+enum {
+	FIO_SYNCIO = 1 << 0,
+	FIO_MMAPIO = 1 << 1 | FIO_SYNCIO,
+	FIO_LIBAIO = 1 << 2,
+	FIO_POSIXAIO = 1 << 3,
+	FIO_SGIO = 1 << 4,
+};
+
+#define td_read(td) ((td)->ddir == DDIR_READ)
+#define td_write(td) ((td)->ddir == DDIR_WRITE)
+
+/*
+ * Block map helpers: each long of the map holds BLOCKS_PER_MAP bits,
+ * block numbers are taken relative to the file offset of the job in units
+ * of min_bs.
+ */
+#define BLOCKS_PER_MAP (8 * sizeof(long))
+#define TO_MAP_BLOCK(td, b) ((b) - ((td)->file_offset / (td)->min_bs))
+#define RAND_MAP_IDX(td, b) (TO_MAP_BLOCK(td, b) / BLOCKS_PER_MAP)
+#define RAND_MAP_BIT(td, b) (TO_MAP_BLOCK(td, b) & (BLOCKS_PER_MAP - 1))
+
+#define MAX_JOBS (1024)
+
+/*
+ * Per-disk io counters, index 0 is read and 1 is write
+ */
+struct disk_util_stat {
+	unsigned ios[2];
+	unsigned merges[2];
+	unsigned long long sectors[2];
+	unsigned ticks[2];
+	unsigned io_ticks;
+	unsigned time_in_queue;
+};
+
+/*
+ * One tracked disk
+ */
+struct disk_util {
+	struct list_head list;
+
+	char *name;
+	char path[256];
+	dev_t dev;
+
+	struct disk_util_stat dus;
+	struct disk_util_stat last_dus;
+
+	unsigned long msec;
+	struct timeval time;
+};
+
+struct io_completion_data { + int nr; /* input */ + + int error; /* output */ + unsigned long bytes_done[2]; /* output */ +}; + +#define DISK_UTIL_MSEC (250) + +#endif diff --git a/fio_generate_plots b/fio_generate_plots new file mode 100755 index 00000000..f410efe2 --- /dev/null +++ b/fio_generate_plots @@ -0,0 +1,61 @@ +#!/bin/bash + +# Use gnuplot to generate plots from fio run with -l and/or -w + +if [ "$1"x == "x" ]; then + echo Need title as arg + exit 1 +fi + +TITLE=$1 + +PLOT_LINE="" +for i in *bw.log; do + if [ ! -r $i ]; then + continue + fi + if [ "$PLOT_LINE"x != "x" ]; then + PLOT_LINE=$PLOT_LINE", " + fi + + PLOT_LINE=$PLOT_LINE"'$i' with lines" +done + +if [ "$PLOT_LINE"x != "x" ]; then + echo Making bw logs + echo "set title 'Bandwidth - $TITLE'; set xlabel 'time (msec)'; set ylabel 'KiB/sec'; set terminal png; set output '$TITLE-bw.png'; plot " $PLOT_LINE | gnuplot - +fi + +PLOT_LINE="" +for i in *slat.log; do + if [ ! -r $i ]; then + continue + fi + if [ "$PLOT_LINE"x != "x" ]; then + PLOT_LINE=$PLOT_LINE", " + fi + + PLOT_LINE=$PLOT_LINE"'$i' with lines" +done + +if [ "$PLOT_LINE"x != "x" ]; then + echo Making slat logs $PLOT_LINE + echo "set title 'Submission latency - $TITLE'; set xlabel 'time (msec)'; set ylabel 'latency (msec)'; set terminal png; set output '$TITLE-slat.png'; plot " $PLOT_LINE | gnuplot - +fi + +PLOT_LINE="" +for i in *clat.log; do + if [ ! 
-r $i ]; then + continue + fi + if [ "$PLOT_LINE"x != "x" ]; then + PLOT_LINE=$PLOT_LINE", " + fi + + PLOT_LINE=$PLOT_LINE"'$i' with lines" +done + +if [ "$PLOT_LINE"x != "x" ]; then + echo Making clat logs $PLOT_LINE + echo "set title 'Completion latency - $TITLE'; set xlabel 'time (msec)'; set ylabel 'latency (msec)'; set terminal png; set output '$TITLE-clat.png'; plot " $PLOT_LINE | gnuplot - +fi diff --git a/list.h b/list.h new file mode 100644 index 00000000..cedbafaa --- /dev/null +++ b/list.h @@ -0,0 +1,134 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +#undef offsetof +#ifdef __compiler_offsetof +#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER) +#else +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
+ */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = NULL; + entry->prev = NULL; +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#endif diff --git a/md5.c b/md5.c new file mode 100644 index 00000000..cf1f814a --- /dev/null +++ b/md5.c @@ -0,0 +1,118 @@ +/* + * Shamelessly lifted from the 2.6 kernel (crypto/md5.c) + */ +#include <string.h> +#include <stdint.h> +#include "md5.h" + +static void md5_transform(uint32_t *hash, uint32_t const *in) +{ + uint32_t a, b, c, d; + + a = hash[0]; + b = hash[1]; + c = hash[2]; + d = hash[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + 
MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + 
MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + hash[0] += a; + hash[1] += b; + hash[2] += c; + hash[3] += d; +} + +void md5_update(struct md5_ctx *mctx, const uint8_t *data, unsigned int len) +{ + const uint32_t avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f); + + mctx->byte_count += len; + + if (avail > len) { + memcpy((char *)mctx->block + (sizeof(mctx->block) - avail), + data, len); + return; + } + + memcpy((char *)mctx->block + (sizeof(mctx->block) - avail), + data, avail); + + md5_transform(mctx->hash, mctx->block); + data += avail; + len -= avail; + + while (len >= sizeof(mctx->block)) { + memcpy(mctx->block, data, sizeof(mctx->block)); + md5_transform(mctx->hash, mctx->block); + data += sizeof(mctx->block); + len -= sizeof(mctx->block); + } + + memcpy(mctx->block, data, len); +} diff --git a/md5.h b/md5.h new file mode 100644 index 00000000..9d1cf4cf --- /dev/null +++ b/md5.h @@ -0,0 +1,27 @@ +#ifndef MD5_H +#define MD5_H + +#include <stdint.h> + +#define MD5_DIGEST_SIZE 16 +#define MD5_HMAC_BLOCK_SIZE 64 +#define MD5_BLOCK_WORDS 16 +#define MD5_HASH_WORDS 4 + +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x) + +struct md5_ctx { + uint32_t hash[MD5_HASH_WORDS]; + uint32_t block[MD5_BLOCK_WORDS]; + uint64_t byte_count; +}; + +extern void md5_update(struct md5_ctx *, const uint8_t *, unsigned int); + +#endif diff --git a/os-freebsd.h b/os-freebsd.h new file mode 100644 index 00000000..26dba908 --- /dev/null +++ b/os-freebsd.h @@ -0,0 +1,23 @@ +#ifndef FIO_OS_FREEBSD_H +#define FIO_OS_FREEBSD_H + +#undef FIO_HAVE_LIBAIO +#define FIO_HAVE_POSIXAIO +#undef FIO_HAVE_FADVISE +#undef FIO_HAVE_CPU_AFFINITY +#undef FIO_HAVE_DISK_UTIL +#undef FIO_HAVE_SGIO + +#define OS_MAP_ANON 
(MAP_ANON) + +typedef unsigned long os_cpu_mask_t; + +/* + * FIXME + */ +static inline int blockdev_size(int fd, size_t *bytes) +{ + return 1; +} + +#endif diff --git a/os-linux.h b/os-linux.h new file mode 100644 index 00000000..0b1fc00a --- /dev/null +++ b/os-linux.h @@ -0,0 +1,53 @@ +#ifndef FIO_OS_LINUX_H +#define FIO_OS_LINUX_H + +#include <sys/ioctl.h> + +#define FIO_HAVE_LIBAIO +#define FIO_HAVE_POSIXAIO +#define FIO_HAVE_FADVISE +#define FIO_HAVE_CPU_AFFINITY +#define FIO_HAVE_DISK_UTIL +#define FIO_HAVE_SGIO + +#define OS_MAP_ANON (MAP_ANONYMOUS) + +typedef cpu_set_t os_cpu_mask_t; + +/* + * we want fadvise64 really, but it's so tangled... later + */ +#define fadvise(fd, off, len, advice) \ + posix_fadvise((fd), (off_t)(off), (len), (advice)) + +#define fio_setaffinity(td) \ + sched_setaffinity((td)->pid, sizeof((td)->cpumask), &(td)->cpumask) +#define fio_getaffinity(pid, ptr) \ + sched_getaffinity((pid), sizeof(cpu_set_t), (ptr)) + +static inline int ioprio_set(int which, int who, int ioprio) +{ + return syscall(__NR_ioprio_set, which, who, ioprio); +} + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +#define IOPRIO_CLASS_SHIFT 13 + +#ifndef BLKGETSIZE64 +#define BLKGETSIZE64 _IOR(0x12,114,size_t) +#endif + +static inline int blockdev_size(int fd, size_t *bytes) +{ + if (!ioctl(fd, BLKGETSIZE64, bytes)) + return 0; + + return errno; +} + +#endif diff --git a/os.h b/os.h new file mode 100644 index 00000000..3ee13689 --- /dev/null +++ b/os.h @@ -0,0 +1,49 @@ +#ifndef FIO_OS_H +#define FIO_OS_H + +#if defined(__linux__) +#include "os-linux.h" +#elif defined(__FreeBSD__) +#include "os-freebsd.h" +#else +#error "unsupported os" +#endif + +#ifdef FIO_HAVE_LIBAIO +#include <libaio.h> +#endif + +#ifdef FIO_HAVE_POSIXAIO +#include <aio.h> +#endif + +#ifdef FIO_HAVE_SGIO +#include <linux/fs.h> +#include <scsi/sg.h> +#endif + +#ifndef FIO_HAVE_FADVISE +#define fadvise(fd, off, len, advice) (0) + +#define POSIX_FADV_DONTNEED (0) +#define POSIX_FADV_SEQUENTIAL (0) +#define POSIX_FADV_RANDOM 
(0) +#endif /* FIO_HAVE_FADVISE */ + +#ifndef FIO_HAVE_CPU_AFFINITY +#define fio_setaffinity(td) (0) +#define fio_getaffinity(pid, mask) (0) +#endif + +#ifndef FIO_HAVE_IOPRIO +#define ioprio_set(which, who, prio) (0) +#endif + +struct thread_data; +extern int fio_libaio_init(struct thread_data *); +extern int fio_posixaio_init(struct thread_data *); +extern int fio_syncio_init(struct thread_data *); +extern int fio_mmapio_init(struct thread_data *); +extern int fio_sgio_init(struct thread_data *); + +#endif -- 2.25.1