file hash: don't close file fd on lookup/add race
authorJens Axboe <axboe@kernel.dk>
Wed, 12 Dec 2012 07:16:27 +0000 (08:16 +0100)
committerJens Axboe <axboe@kernel.dk>
Wed, 12 Dec 2012 07:16:27 +0000 (08:16 +0100)
On Linux, udev often has a rule that triggers blkid to read data off
the device, if the device has been opened for writing. This triggers
when the device is closed.

Fio has an internal file hash for lookup of files. When threads start,
it can happen that one thread does a hash lookup without finding the
file, but when it has opened the file and attempts to insert it into
the hash, another thread has already completed that operation. When
that race happens, fio closes the file handle and does the lookup
again. That then triggers blkid to read pages off the device. As data
in the cache is invalidated on open of the device, we know have page
cache entries for the device again.

This is a problem for unbuffered workloads, where the existance of
page cache pages slows it down due to having to check for invalidation.
The user observed problem is that fio exhibits bi-modal performance
results, depending on whether the file hash race was hit during setup
or not.

Fix this by NOT closing the file if we hit this race, but instead wait
until the file is closed after the run.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
file.h
filesetup.c

diff --git a/file.h b/file.h
index 3024c5440094f4a711b993e80dc6b81264f42893..11695e2f84a0d54158b6ea76614f693aec005b46 100644 (file)
--- a/file.h
+++ b/file.h
@@ -65,6 +65,7 @@ struct fio_file {
 
        void *file_data;
        int fd;
+       int shadow_fd;
 #ifdef WIN32
        HANDLE hFile;
        HANDLE ioCP;
index f4e1adcb01a10ae22c352cfe75a3a351305ccba1..9d3e06268fffc35786749ea0eb482d0f9bb7ce11 100644 (file)
@@ -434,6 +434,12 @@ int generic_close_file(struct thread_data fio_unused *td, struct fio_file *f)
                ret = errno;
 
        f->fd = -1;
+
+       if (f->shadow_fd != -1) {
+               close(f->shadow_fd);
+               f->shadow_fd = -1;
+       }
+
        return ret;
 }
 
@@ -462,6 +468,24 @@ int file_lookup_open(struct fio_file *f, int flags)
        return from_hash;
 }
 
+static int file_close_shadow_fds(struct thread_data *td)
+{
+       struct fio_file *f;
+       int num_closed = 0;
+       unsigned int i;
+
+       for_each_file(td, f, i) {
+               if (f->shadow_fd == -1)
+                       continue;
+
+               close(f->shadow_fd);
+               f->shadow_fd = -1;
+               num_closed++;
+       }
+
+       return num_closed;
+}
+
 int generic_open_file(struct thread_data *td, struct fio_file *f)
 {
        int is_std = 0;
@@ -536,6 +560,8 @@ open_again:
                        flags &= ~FIO_O_NOATIME;
                        goto open_again;
                }
+               if (__e == EMFILE && file_close_shadow_fds(td))
+                       goto open_again;
 
                snprintf(buf, sizeof(buf) - 1, "open(%s)", f->file_name);
 
@@ -552,9 +578,22 @@ open_again:
                        int fio_unused ret;
 
                        /*
-                        * OK to ignore, we haven't done anything with it
+                        * Stash away descriptor for later close. This is to
+                        * work-around a "feature" on Linux, where a close of
+                        * an fd that has been opened for write will trigger
+                        * udev to call blkid to check partitions, fs id, etc.
+                        * That polutes the device cache, which can slow down
+                        * unbuffered accesses.
                         */
-                       ret = generic_close_file(td, f);
+                       if (f->shadow_fd == -1)
+                               f->shadow_fd = f->fd;
+                       else {
+                               /*
+                                * OK to ignore, we haven't done anything
+                                * with it
+                                */
+                               ret = generic_close_file(td, f);
+                       }
                        goto open_again;
                }
        }
@@ -1029,6 +1068,7 @@ int add_file(struct thread_data *td, const char *fname)
        }
 
        f->fd = -1;
+       f->shadow_fd = -1;
        fio_file_reset(f);
 
        if (td->files_size <= td->files_index) {