From 67897036632676ce1bfdb45992df078df87c1a27 Mon Sep 17 00:00:00 2001 From: Bruce Cran Date: Thu, 12 May 2011 10:06:10 +0200 Subject: [PATCH] windowsaio: fix race condition between IOCP and queue functions Signed-off-by: Jens Axboe --- engines/windowsaio.c | 521 +++++++++++++++++++++---------------------- 1 file changed, 253 insertions(+), 268 deletions(-) diff --git a/engines/windowsaio.c b/engines/windowsaio.c index e389b673..299acc47 100644 --- a/engines/windowsaio.c +++ b/engines/windowsaio.c @@ -14,24 +14,21 @@ typedef BOOL (WINAPI *CANCELIOEX)(HANDLE hFile, LPOVERLAPPED lpOverlapped); -typedef struct { - OVERLAPPED o; - struct io_u *io_u; -} FIO_OVERLAPPED; +struct fio_overlapped { + OVERLAPPED o; + struct io_u *io_u; + BOOL io_complete; + BOOL io_free; +}; struct windowsaio_data { - HANDLE *io_handles; - unsigned int io_index; - FIO_OVERLAPPED *ovls; - - HANDLE iothread; - HANDLE iothread_stopped; - BOOL iothread_running; - + struct fio_overlapped *ovls; struct io_u **aio_events; + HANDLE iothread; HANDLE iocomplete_event; CANCELIOEX pCancelIoEx; - BOOL useIOCP; + BOOL iothread_running; + BOOL use_iocp; }; struct thread_ctx { @@ -42,7 +39,7 @@ struct thread_ctx { static void PrintError(LPCSTR lpszFunction); static int fio_windowsaio_cancel(struct thread_data *td, struct io_u *io_u); -static BOOL TimedOut(DWORD start_count, DWORD end_count); +static BOOL timeout_expired(DWORD start_count, DWORD end_count); static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, unsigned int max, struct timespec *t); static struct io_u *fio_windowsaio_event(struct thread_data *td, int event); @@ -82,228 +79,12 @@ static void PrintError(LPCSTR lpszFunction) LocalFree(lpMsgBuf); } -static int fio_windowsaio_cancel(struct thread_data *td, - struct io_u *io_u) -{ - int rc = 0; - - struct windowsaio_data *wd = td->io_ops->data; - - /* If we're running on Vista or newer, we can cancel individual IO requests */ - if (wd->pCancelIoEx != NULL) { - FIO_OVERLAPPED *ovl = io_u->engine_data; - if (!wd->pCancelIoEx(io_u->file->hFile, &ovl->o)) - rc = 1; - } else - rc = 1; - - return rc; -} - -static BOOL TimedOut(DWORD start_count, DWORD end_count) -{ - BOOL expired = FALSE; - DWORD current_time; - - current_time = GetTickCount(); - - if ((end_count > start_count) && current_time >= end_count) - expired = TRUE; - else if (current_time < start_count && current_time > end_count) - expired = TRUE; - - return expired; -} - -static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, struct timespec *t) -{ - struct windowsaio_data *wd = td->io_ops->data; - struct flist_head *entry; - unsigned int dequeued = 0; - struct io_u *io_u; - DWORD start_count = 0, end_count = 0; - BOOL timedout = FALSE; - unsigned int mswait = 100; - - if (t != NULL) { - mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000); - start_count = GetTickCount(); - end_count = start_count + (t->tv_sec * 1000) + (t->tv_nsec / 1000000); - } - - while (dequeued < min && !timedout) { - flist_for_each(entry, &td->io_u_busylist) { - io_u = flist_entry(entry, struct io_u, list); - - if (io_u->seen == 0) { - io_u->seen = 1; - wd->aio_events[dequeued] = io_u; - dequeued++; - } - - if (dequeued == max) - break; - } - - if (dequeued < min) - WaitForSingleObject(wd->iocomplete_event, mswait); - - if (t != NULL && TimedOut(start_count, end_count)) - timedout = TRUE; - } - - return dequeued; -} - -static struct io_u *fio_windowsaio_event(struct thread_data *td, int event) -{ - struct windowsaio_data *wd = td->io_ops->data; - return wd->aio_events[event]; -} - -static int fio_windowsaio_queue(struct thread_data *td, - struct io_u *io_u) -{ - struct windowsaio_data *wd; - LPOVERLAPPED lpOvl; - DWORD iobytes; - BOOL success = TRUE; - int ind; - int rc; - - fio_ro_check(td, io_u); - - wd = td->io_ops->data; - ind = wd->io_index; - - ResetEvent(wd->io_handles[ind]); - - if (wd->useIOCP) { - lpOvl = &wd->ovls[ind].o; - - lpOvl->Internal = STATUS_PENDING; - lpOvl->InternalHigh = 0; - lpOvl->Offset = io_u->offset & 0xFFFFFFFF; - lpOvl->OffsetHigh = io_u->offset >> 32; - lpOvl->hEvent = wd->io_handles[ind]; - lpOvl->Pointer = NULL; - wd->ovls[ind].io_u = io_u; - } else { - lpOvl = NULL; - } - - io_u->engine_data = &wd->ovls[ind]; - io_u->seen = 0; - - if (io_u->ddir == DDIR_WRITE) { - success = WriteFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl); - } else if (io_u->ddir == DDIR_READ) { - success = ReadFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl); - } else if (io_u->ddir == DDIR_SYNC || - io_u->ddir == DDIR_DATASYNC || - io_u->ddir == DDIR_SYNC_FILE_RANGE) - { - FlushFileBuffers(io_u->file->hFile); - return FIO_Q_COMPLETED; - } else if (io_u->ddir == DDIR_TRIM) { - log_info("Manual TRIM isn't supported on Windows"); - return FIO_Q_COMPLETED; - } else - assert(0); - - if (wd->useIOCP && (success || GetLastError() == ERROR_IO_PENDING)) { - wd->io_index = (wd->io_index + 1) % td->o.iodepth; - rc = FIO_Q_QUEUED; - } else if (success && !wd->useIOCP) { - io_u->resid = io_u->xfer_buflen - iobytes; - io_u->error = 0; - rc = FIO_Q_COMPLETED; - } else { - PrintError(__func__); - io_u->error = GetLastError(); - io_u->resid = io_u->xfer_buflen; - rc = FIO_Q_COMPLETED; - } - - return rc; -} - -static void fio_windowsaio_cleanup(struct thread_data *td) -{ - int i; - struct windowsaio_data *wd; - - wd = td->io_ops->data; - - WaitForSingleObject(wd->iothread_stopped, INFINITE); - - if (wd != NULL) { - CloseHandle(wd->iothread); - CloseHandle(wd->iothread_stopped); - CloseHandle(wd->iocomplete_event); - - for (i = 0; i < td->o.iodepth; i++) { - CloseHandle(wd->io_handles[i]); - } - - free(wd->aio_events); - free(wd->io_handles); - free(wd->ovls); - free(wd); - - td->io_ops->data = NULL; - } -} - -/* Runs as a thread and waits for queued IO to complete */ -static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter) -{ - OVERLAPPED *ovl; - FIO_OVERLAPPED *fov; - struct io_u *io_u; - struct windowsaio_data *wd; - struct thread_ctx *ctx; - ULONG_PTR ulKey = 0; - DWORD bytes; - - ctx = (struct thread_ctx*)lpParameter; - wd = ctx->wd; - - do { - if (!GetQueuedCompletionStatus(ctx->iocp, &bytes, &ulKey, &ovl, 250)) - continue; - - fov = CONTAINING_RECORD(ovl, FIO_OVERLAPPED, o); - io_u = fov->io_u; - - /* We sometimes get an IO request that hasn't completed yet. Ignore it. */ - if (ovl->Internal == STATUS_PENDING) - continue; - - if (ovl->Internal == ERROR_SUCCESS) { - io_u->resid = io_u->xfer_buflen - ovl->InternalHigh; - io_u->error = 0; - } else { - io_u->resid = io_u->xfer_buflen; - io_u->error = ovl->Internal; - } - - SetEvent(wd->iocomplete_event); - } while (ctx->wd->iothread_running); - - CloseHandle(ctx->iocp); - SetEvent(ctx->wd->iothread_stopped); - free(ctx); - - return 0; -} - static int fio_windowsaio_init(struct thread_data *td) { struct windowsaio_data *wd; HANDLE hKernel32Dll; int rc = 0; + int i; wd = malloc(sizeof(struct windowsaio_data)); if (wd != NULL) @@ -318,15 +99,22 @@ static int fio_windowsaio_init(struct thread_data *td) } if (!rc) { - wd->io_handles = malloc(td->o.iodepth * sizeof(HANDLE)); - if (wd->io_handles == NULL) + wd->ovls = malloc(td->o.iodepth * sizeof(struct fio_overlapped)); + if (wd->ovls == NULL) rc = 1; } if (!rc) { - wd->ovls = malloc(td->o.iodepth * sizeof(FIO_OVERLAPPED)); - if (wd->ovls == NULL) - rc = 1; + for (i = 0; i < td->o.iodepth; i++) { + wd->ovls[i].io_free = TRUE; + wd->ovls[i].io_complete = FALSE; + + wd->ovls[i].o.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (wd->ovls[i].o.hEvent == NULL) { + rc = 1; + break; + } + } } if (!rc) { @@ -341,8 +129,6 @@ static int fio_windowsaio_init(struct thread_data *td) if (wd != NULL) { if (wd->ovls != NULL) free(wd->ovls); - if (wd->io_handles != NULL) - free(wd->io_handles); if (wd->aio_events != NULL) free(wd->aio_events); @@ -357,6 +143,33 @@ static int fio_windowsaio_init(struct thread_data *td) return 0; } +static void fio_windowsaio_cleanup(struct thread_data *td) +{ + int i; + struct windowsaio_data *wd; + + wd = td->io_ops->data; + + if (wd != NULL) { + wd->iothread_running = FALSE; + WaitForSingleObject(wd->iothread, INFINITE); + + CloseHandle(wd->iothread); + CloseHandle(wd->iocomplete_event); + + for (i = 0; i < td->o.iodepth; i++) { + CloseHandle(wd->ovls[i].o.hEvent); + } + + free(wd->aio_events); + free(wd->ovls); + free(wd); + + td->io_ops->data = NULL; + } +} + + static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) { int rc = 0; @@ -365,7 +178,6 @@ static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) DWORD sharemode = FILE_SHARE_READ | FILE_SHARE_WRITE; DWORD openmode = OPEN_ALWAYS; DWORD access; - int i; dprint(FD_FILE, "fd open %s\n", f->file_name); @@ -419,33 +231,14 @@ static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) struct windowsaio_data *wd; hFile = CreateIoCompletionPort(f->hFile, NULL, 0, 0); - wd = td->io_ops->data; if (!td->o.odirect && !td->o.sync_io) - wd->useIOCP = 1; + wd->use_iocp = 1; else - wd->useIOCP = 0; + wd->use_iocp = 0; - wd->io_index = 0; wd->iothread_running = TRUE; - /* Create a manual-reset event */ - wd->iothread_stopped = CreateEvent(NULL, TRUE, FALSE, NULL); - - if (wd->iothread_stopped == NULL) - rc = 1; - - if (!rc) { - for (i = 0; i < td->o.iodepth; i++) { - /* Create a manual-reset event for putting in OVERLAPPED */ - wd->io_handles[i] = CreateEvent(NULL, TRUE, FALSE, NULL); - if (wd->io_handles[i] == NULL) { - PrintError(__func__); - rc = 1; - break; - } - } - } if (!rc) { ctx = malloc(sizeof(struct thread_ctx)); @@ -466,16 +259,8 @@ static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct fio_file *f) { - struct windowsaio_data *wd; - dprint(FD_FILE, "fd close %s\n", f->file_name); - if (td->io_ops->data != NULL) { - wd = td->io_ops->data; - wd->iothread_running = FALSE; - WaitForSingleObject(wd->iothread_stopped, INFINITE); - } - if (f->hFile != INVALID_HANDLE_VALUE) { if (!CloseHandle(f->hFile)) PrintError(__func__); @@ -485,6 +270,206 @@ static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct f return 0; } +static BOOL timeout_expired(DWORD start_count, DWORD end_count) +{ + BOOL expired = FALSE; + DWORD current_time; + + current_time = GetTickCount(); + + if ((end_count > start_count) && current_time >= end_count) + expired = TRUE; + else if (current_time < start_count && current_time > end_count) + expired = TRUE; + + return expired; +} + +static struct io_u* fio_windowsaio_event(struct thread_data *td, int event) +{ + struct windowsaio_data *wd = td->io_ops->data; + return wd->aio_events[event]; +} + +static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, struct timespec *t) +{ + struct windowsaio_data *wd = td->io_ops->data; + struct flist_head *entry; + unsigned int dequeued = 0; + struct io_u *io_u; + struct fio_overlapped *fov; + DWORD start_count = 0; + DWORD end_count = 0; + DWORD status; + DWORD mswait = 250; + + if (t != NULL) { + mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000); + start_count = GetTickCount(); + end_count = start_count + (t->tv_sec * 1000) + (t->tv_nsec / 1000000); + } + + do { + flist_for_each(entry, &td->io_u_busylist) { + io_u = flist_entry(entry, struct io_u, list); + fov = (struct fio_overlapped*)io_u->engine_data; + + if (fov->io_complete) { + fov->io_complete = FALSE; + fov->io_free = TRUE; + wd->aio_events[dequeued] = io_u; + dequeued++; + } + + if (dequeued >= min) + break; + } + + if (dequeued < min) { + status = WaitForSingleObject(wd->iocomplete_event, mswait); + if (status != WAIT_OBJECT_0 && dequeued > 0) + break; + } + + if (dequeued >= min || (t != NULL && timeout_expired(start_count, end_count))) + break; + } while (1); + + return dequeued; +} + +static int fio_windowsaio_queue(struct thread_data *td, + struct io_u *io_u) +{ + LPOVERLAPPED lpOvl = NULL; + struct windowsaio_data *wd; + DWORD iobytes; + BOOL success; + int index; + int rc = FIO_Q_COMPLETED; + + fio_ro_check(td, io_u); + + wd = td->io_ops->data; + + if (wd->use_iocp) { + for (index = 0; index < td->o.iodepth; index++) { + if (wd->ovls[index].io_free) { + wd->ovls[index].io_free = FALSE; + ResetEvent(wd->ovls[index].o.hEvent); + break; + } + } + + assert(index < td->o.iodepth); + + lpOvl = &wd->ovls[index].o; + wd->ovls[index].io_u = io_u; + lpOvl->Internal = STATUS_PENDING; + lpOvl->InternalHigh = 0; + lpOvl->Offset = io_u->offset & 0xFFFFFFFF; + lpOvl->OffsetHigh = io_u->offset >> 32; + lpOvl->Pointer = NULL; + io_u->engine_data = &wd->ovls[index]; + } + + switch (io_u->ddir) { + case DDIR_WRITE: + success = WriteFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl); + break; + case DDIR_READ: + success = ReadFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl); + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + success = FlushFileBuffers(io_u->file->hFile); + if (!success) + io_u->error = GetLastError(); + + return FIO_Q_COMPLETED; + break; + case DDIR_TRIM: + log_err("manual TRIM isn't supported on Windows"); + io_u->error = 1; + io_u->resid = io_u->xfer_buflen; + return FIO_Q_COMPLETED; + break; + default: + assert(0); + } + + if (wd->use_iocp && (success || GetLastError() == ERROR_IO_PENDING)) { + rc = FIO_Q_QUEUED; + } else if (success && !wd->use_iocp) { + io_u->resid = io_u->xfer_buflen - iobytes; + io_u->error = 0; + } else { + PrintError(__func__); + io_u->error = GetLastError(); + io_u->resid = io_u->xfer_buflen; + } + + return rc; +} + +/* Runs as a thread and waits for queued IO to complete */ +static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter) +{ + OVERLAPPED *ovl; + struct fio_overlapped *fov; + struct io_u *io_u; + struct windowsaio_data *wd; + struct thread_ctx *ctx; + ULONG_PTR ulKey = 0; + DWORD bytes; + + ctx = (struct thread_ctx*)lpParameter; + wd = ctx->wd; + + do { + if (!GetQueuedCompletionStatus(ctx->iocp, &bytes, &ulKey, &ovl, 250)) + continue; + + fov = CONTAINING_RECORD(ovl, struct fio_overlapped, o); + io_u = fov->io_u; + + if (ovl->Internal == ERROR_SUCCESS) { + io_u->resid = io_u->xfer_buflen - ovl->InternalHigh; + io_u->error = 0; + } else { + io_u->resid = io_u->xfer_buflen; + io_u->error = ovl->Internal; + } + + fov->io_complete = TRUE; + SetEvent(wd->iocomplete_event); + } while (ctx->wd->iothread_running); + + CloseHandle(ctx->iocp); + free(ctx); + return 0; +} + +static int fio_windowsaio_cancel(struct thread_data *td, + struct io_u *io_u) +{ + int rc = 0; + + struct windowsaio_data *wd = td->io_ops->data; + + /* If we're running on Vista or newer, we can cancel individual IO requests */ + if (wd->pCancelIoEx != NULL) { + struct fio_overlapped *ovl = io_u->engine_data; + if (!wd->pCancelIoEx(io_u->file->hFile, &ovl->o)) + rc = 1; + } else + rc = 1; + + return rc; +} + static struct ioengine_ops ioengine = { .name = "windowsaio", .version = FIO_IOOPS_VERSION, -- 2.25.1