From f2a2ce0eedb44eaa8689e4cbfa77c79b1751b216 Mon Sep 17 00:00:00 2001
From: Huadong Liu
Date: Wed, 30 Jan 2013 13:22:24 +0100
Subject: [PATCH] Add support for idletime profiling

Idletime profiling allows a benchmark to run while dedicated threads fill
the idle cycles on the server, giving you a metric of how much pressure the
system is under. This is useful for profiling and comparing changes in a
setup or driver, for instance.

Signed-off-by: Jens Axboe
---
 HOWTO           |  15 ++
 Makefile        |   3 +-
 README          |   3 +
 backend.c       |   8 +
 fio.1           |   3 +
 idletime.c      | 472 ++++++++++++++++++++++++++++++++++++++++++++++++
 idletime.h      |  57 ++++++
 init.c          |  17 ++
 os/os-linux.h   |   7 +
 os/os-windows.h |   8 +
 stat.c          |   5 +
 11 files changed, 597 insertions(+), 1 deletion(-)
 create mode 100644 idletime.c
 create mode 100644 idletime.h

diff --git a/HOWTO b/HOWTO
index 8073240e..c46b8834 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1693,3 +1693,18 @@ write		Write 'length' bytes beginning from 'offset'
 sync		fsync() the file
 datasync	fdatasync() the file
 trim		trim the given file from the given 'offset' for 'length' bytes
+
+
+9.0 CPU idleness profiling
+
+In some cases, we want to understand CPU overhead in a test. For example,
+we test patches for the specific goodness of whether they reduce CPU usage.
+fio implements a balloon approach: it creates one thread per CPU that runs
+at idle priority, meaning that it only runs when nobody else needs the CPU.
+By measuring the amount of work completed by the thread, the idleness of
+each CPU can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters.
+The mean and standard deviation of the time to complete a unit of work are
+reported in the "unit work" section. Options can be chosen to report detailed
+percpu idleness or overall system idleness by aggregating the percpu stats.
diff --git a/Makefile b/Makefile
index 52ec4a2d..a4a478d9 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,8 @@ SOURCE := gettime.c fio.c ioengines.c init.c stat.c log.c time.c filesetup.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
 		memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \
 		json.c lib/zipf.c lib/axmap.c lib/lfsr.c gettime-thread.c \
-		helpers.c lib/flist_sort.c lib/hweight.c lib/getrusage.c
+		helpers.c lib/flist_sort.c lib/hweight.c lib/getrusage.c \
+		idletime.c
 
 ifdef CONFIG_64BIT_LLP64
 CFLAGS += -DBITS_PER_LONG=32
diff --git a/README b/README
index 7c4552d9..c43b795c 100644
--- a/README
+++ b/README
@@ -145,6 +145,9 @@ $ fio
 	--max-jobs		Maximum number of threads/processes to support
 	--server=args		Start backend server. See Client/Server section.
 	--client=host		Connect to specified backend.
+	--idle-prof=option	Report cpu idleness on a system or percpu basis
+				(option=system,percpu) or run unit work
+				calibration only (option=calibrate).
 Any parameters following the options will be assumed to be job files,
diff --git a/backend.c b/backend.c
index 87810e89..218ae254 100644
--- a/backend.c
+++ b/backend.c
@@ -51,6 +51,7 @@
 #include "memalign.h"
 #include "server.h"
 #include "lib/getrusage.h"
+#include "idletime.h"
 
 static pthread_t disk_util_thread;
 static struct fio_mutex *disk_thread_mutex;
@@ -1499,6 +1500,8 @@ static void run_threads(void)
 
 	if (fio_gtod_offload && fio_start_gtod_thread())
 		return;
+
+	fio_idle_prof_init();
 
 	set_sig_handlers();
 
@@ -1556,6 +1559,9 @@ static void run_threads(void)
 		}
 	}
 
+	/* start idle threads before io threads start to run */
+	fio_idle_prof_start();
+
 	set_genesis_time();
 
 	while (todo) {
@@ -1718,6 +1724,8 @@ static void run_threads(void)
 		usleep(10000);
 	}
 
+	fio_idle_prof_stop();
+
 	update_io_ticks();
 	fio_unpin_memory();
 }
diff --git a/fio.1 b/fio.1
index 1f81ea7b..6b7e9459 100644
--- a/fio.1
+++ b/fio.1
@@ -77,6 +77,9 @@ Background a fio server, writing the pid to the given pid file.
 .TP
 .BI \-\-client \fR=\fPhost
 Instead of running the jobs locally, send and run them on the given host.
+.TP
+.BI \-\-idle\-prof \fR=\fPoption
+Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate).
 .SH "JOB FILE FORMAT"
 Job files are in `ini' format. They consist of one or more
 job definitions, which begin with a job name in square brackets and
diff --git a/idletime.c b/idletime.c
new file mode 100644
index 00000000..6100134f
--- /dev/null
+++ b/idletime.c
@@ -0,0 +1,472 @@
+#include <math.h>
+#include "json.h"
+#include "idletime.h"
+
+static volatile struct idle_prof_common ipc;
+
+/* Get the time to complete a unit of work on a particular cpu.
+ * The minimum over CALIBRATE_RUNS runs is returned.
+ */
+static double calibrate_unit(unsigned char *data)
+{
+	unsigned long t, i, j, k;
+	struct timeval tps;
+	double tunit = 0.0;
+
+	for (i = 0; i < CALIBRATE_RUNS; i++) {
+
+		fio_gettime(&tps, NULL);
+
+		/* scale for less variance */
+		for (j = 0; j < CALIBRATE_SCALE; j++) {
+			/* unit of work */
+			for (k=0; k < page_size; k++) {
+				data[(k+j)%page_size] = k%256;
+				/* bail out if profiling was told to stop */
+				if (ipc.status == IDLE_PROF_STATUS_PROF_STOP)
+					return 0.0;
+			}
+		}
+
+		t = utime_since_now(&tps);
+		if (!t)
+			continue;
+
+		/* get the minimum time to complete CALIBRATE_SCALE units */
+		if ((i == 0) || ((double)t < tunit))
+			tunit = (double)t;
+	}
+
+	return tunit / CALIBRATE_SCALE;
+}
+
+static void *idle_prof_thread_fn(void *data)
+{
+	int retval;
+	unsigned long j, k;
+	struct idle_prof_thread *ipt = data;
+
+	/* wait for all threads to be spawned */
+	pthread_mutex_lock(&ipt->init_lock);
+
+	/* exit if any other thread failed to start */
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return NULL;
+
+#if defined(FIO_HAVE_CPU_AFFINITY)
+	os_cpu_mask_t cpu_mask;
+	memset(&cpu_mask, 0, sizeof(cpu_mask));
+	fio_cpu_set(&cpu_mask, ipt->cpu);
+
+	if ((retval=fio_setaffinity(gettid(), cpu_mask)) == -1)
+		log_err("fio: fio_setaffinity failed\n");
+#else
+	retval = -1;
+	log_err("fio: fio_setaffinity not supported\n");
+#endif
+	if (retval == -1) {
+		ipt->state = TD_EXITED;
+		pthread_mutex_unlock(&ipt->init_lock);
+		return NULL;
+	}
+
+	ipt->cali_time = calibrate_unit(ipt->data);
+
+	/* delay setting the IDLE class until now, for better calibration accuracy */
+#if defined(FIO_HAVE_SCHED_IDLE)
+	if ((retval = fio_set_sched_idle()))
+		log_err("fio: fio_set_sched_idle failed\n");
+#else
+	retval = -1;
+	log_err("fio: fio_set_sched_idle not supported\n");
+#endif
+	if (retval == -1) {
+		ipt->state = TD_EXITED;
+		pthread_mutex_unlock(&ipt->init_lock);
+		return NULL;
+	}
+
+	ipt->state = TD_INITIALIZED;
+
+	/* signal the main thread that calibration is done */
+	pthread_cond_signal(&ipt->cond);
+	pthread_mutex_unlock(&ipt->init_lock);
+
+	/* wait for the other threads to finish calibration */
+	pthread_mutex_lock(&ipt->start_lock);
+
+	/* exit if other threads failed to initialize */
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return NULL;
+
+	/* exit if we are doing calibration only */
+	if (ipc.status == IDLE_PROF_STATUS_CALI_STOP)
+		return NULL;
+
+	fio_gettime(&ipt->tps, NULL);
+	ipt->state = TD_RUNNING;
+
+	j = 0;
+	while (1) {
+		for (k=0; k < page_size; k++) {
+			ipt->data[(k+j)%page_size] = k%256;
+			if (ipc.status == IDLE_PROF_STATUS_PROF_STOP) {
+				fio_gettime(&ipt->tpe, NULL);
+				goto idle_prof_done;
+			}
+		}
+		j++;
+	}
+
+idle_prof_done:
+
+	ipt->loops = j + (double)k/page_size;
+	ipt->state = TD_EXITED;
+	pthread_mutex_unlock(&ipt->start_lock);
+
+	return NULL;
+}
+
+/* calculate the mean and standard deviation of the time to complete a unit of work */
+static void calibration_stats(void)
+{
+	int i;
+	double sum=0.0, var=0.0;
+	struct idle_prof_thread *ipt;
+
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		sum += ipt->cali_time;
+	}
+
+	ipc.cali_mean = sum/ipc.nr_cpus;
+
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		var += pow(ipt->cali_time-ipc.cali_mean, 2);
+	}
+
+	ipc.cali_stddev = sqrt(var/(ipc.nr_cpus-1));
+}
+
+void fio_idle_prof_init(void)
+{
+	int i, ret;
+	struct timeval tp;
+	struct timespec ts;
+	pthread_attr_t tattr;
+	struct idle_prof_thread *ipt;
+
+	ipc.nr_cpus = cpus_online();
+	ipc.status = IDLE_PROF_STATUS_OK;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	if ((ret = pthread_attr_init(&tattr))) {
+		log_err("fio: pthread_attr_init %s\n", strerror(ret));
+		return;
+	}
+	if ((ret = pthread_attr_setscope(&tattr, PTHREAD_SCOPE_SYSTEM))) {
+		log_err("fio: pthread_attr_setscope %s\n", strerror(ret));
+		return;
+	}
+
+	ipc.ipts = malloc(ipc.nr_cpus * sizeof(struct idle_prof_thread));
+	if (!ipc.ipts) {
+		log_err("fio: malloc failed\n");
+		return;
+	}
+
+	ipc.buf = malloc(ipc.nr_cpus * page_size);
+	if (!ipc.buf) {
+		log_err("fio: malloc failed\n");
+		free(ipc.ipts);
+		return;
+	}
+
+	/* profiling aborts on any single thread failure, since the
+	 * result won't be accurate if any cpu is not used.
+	 */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+
+		ipt->cpu = i;
+		ipt->state = TD_NOT_CREATED;
+		ipt->data = (unsigned char *)(ipc.buf + page_size * i);
+
+		if ((ret = pthread_mutex_init(&ipt->init_lock, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
+			break;
+		}
+
+		if ((ret = pthread_mutex_init(&ipt->start_lock, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_mutex_init %s\n", strerror(ret));
+			break;
+		}
+
+		if ((ret = pthread_cond_init(&ipt->cond, NULL))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_cond_init %s\n", strerror(ret));
+			break;
+		}
+
+		/* make sure all threads are spawned before they start */
+		pthread_mutex_lock(&ipt->init_lock);
+
+		/* make sure all threads finish init before profiling starts */
+		pthread_mutex_lock(&ipt->start_lock);
+
+		if ((ret = pthread_create(&ipt->thread, &tattr, idle_prof_thread_fn, ipt))) {
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+			log_err("fio: pthread_create %s\n", strerror(ret));
+			break;
+		} else {
+			ipt->state = TD_CREATED;
+		}
+
+		if ((ret = pthread_detach(ipt->thread))) {
+			/* log the error and let the thread spin */
+			log_err("fio: pthread_detach %s\n", strerror(ret));
+		}
+	}
+
+	/* let the good threads continue, so that they can exit
+	 * if errors occurred on other threads previously.
+	 */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_unlock(&ipt->init_lock);
+	}
+
+	if (ipc.status == IDLE_PROF_STATUS_ABORT)
+		return;
+
+	/* wait for calibration to finish */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_lock(&ipt->init_lock);
+		while ((ipt->state!=TD_EXITED) && (ipt->state!=TD_INITIALIZED)) {
+			fio_gettime(&tp, NULL);
+			ts.tv_sec = tp.tv_sec + 1;
+			ts.tv_nsec = tp.tv_usec * 1000;
+			pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts);
+		}
+		pthread_mutex_unlock(&ipt->init_lock);
+
+		/* any thread that failed to initialize will cause the other
+		 * threads to abort later, after fio_idle_prof_start.
+		 */
+		if (ipt->state == TD_EXITED)
+			ipc.status = IDLE_PROF_STATUS_ABORT;
+	}
+
+	if (ipc.status != IDLE_PROF_STATUS_ABORT)
+		calibration_stats();
+	else
+		ipc.cali_mean = ipc.cali_stddev = 0.0;
+
+	if (ipc.opt == IDLE_PROF_OPT_CALI)
+		ipc.status = IDLE_PROF_STATUS_CALI_STOP;
+}
+
+void fio_idle_prof_start(void)
+{
+	int i;
+	struct idle_prof_thread *ipt;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	/* unlock regardless of whether abort is set or not */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_unlock(&ipt->start_lock);
+	}
+}
+
+void fio_idle_prof_stop(void)
+{
+	int i;
+	uint64_t runt;
+	struct timeval tp;
+	struct timespec ts;
+	struct idle_prof_thread *ipt;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return;
+
+	if (ipc.opt == IDLE_PROF_OPT_CALI)
+		return;
+
+	ipc.status = IDLE_PROF_STATUS_PROF_STOP;
+
+	/* wait for all threads to exit from profiling */
+	for (i = 0; i < ipc.nr_cpus; i++) {
+		ipt = &ipc.ipts[i];
+		pthread_mutex_lock(&ipt->start_lock);
+		while ((ipt->state!=TD_EXITED) && (ipt->state!=TD_NOT_CREATED)) {
+			fio_gettime(&tp, NULL);
+			ts.tv_sec = tp.tv_sec + 1;
+			ts.tv_nsec = tp.tv_usec * 1000;
+			/* timed wait in case a signal is not received */
+			pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts);
+		}
+		pthread_mutex_unlock(&ipt->start_lock);
+
+		/* calculate idleness */
+		if (ipc.cali_mean != 0.0) {
+			runt = utime_since(&ipt->tps, &ipt->tpe);
+			ipt->idleness = ipt->loops * ipc.cali_mean / runt;
+		} else
+			ipt->idleness = 0.0;
+	}
+
+	/* memory allocations are freed via the explicit fio_idle_prof_cleanup
+	 * call, after the profiling stats have been collected by the callers.
+	 */
+
+	return;
+}
+
+/* return the system idle percentage when cpu is -1;
+ * return a single cpu's idle percentage otherwise.
+ */
+static double fio_idle_prof_cpu_stat(int cpu)
+{
+	int i, nr_cpus = ipc.nr_cpus;
+	struct idle_prof_thread *ipt;
+	double p = 0.0;
+
+	if (ipc.opt == IDLE_PROF_OPT_NONE)
+		return 0.0;
+
+	if ((cpu >= nr_cpus) || (cpu < -1)) {
+		log_err("fio: idle profiling invalid cpu index\n");
+		return 0.0;
+	}
+
+	if (cpu == -1) {
+		for (i = 0; i < nr_cpus; i++) {
+			ipt = &ipc.ipts[i];
+			p += ipt->idleness;
+		}
+		p /= nr_cpus;
+	} else {
+		ipt = &ipc.ipts[cpu];
+		p = ipt->idleness;
+	}
+
+	return p*100.0;
+}
+
+void fio_idle_prof_cleanup(void)
+{
+	if (ipc.ipts) {
+		free(ipc.ipts);
+		ipc.ipts = NULL;
+	}
+
+	if (ipc.buf) {
+		free(ipc.buf);
+		ipc.buf = NULL;
+	}
+}
+
+int fio_idle_prof_parse_opt(const char *args)
+{
+	ipc.opt = IDLE_PROF_OPT_NONE; /* default */
+
+	if (!args) {
+		log_err("fio: empty idle-prof option string\n");
+		return -1;
+	}
+
+#if defined(FIO_HAVE_CPU_AFFINITY) && defined(FIO_HAVE_SCHED_IDLE)
+	if (strcmp("calibrate", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_CALI;
+		fio_idle_prof_init();
+		fio_idle_prof_start();
+		fio_idle_prof_stop();
+		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL);
+		return 1;
+	} else if (strcmp("system", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_SYSTEM;
+		return 0;
+	} else if (strcmp("percpu", args) == 0) {
+		ipc.opt = IDLE_PROF_OPT_PERCPU;
+		return 0;
+	} else {
+		log_err("fio: incorrect idle-prof option: %s\n", args);
+		return -1;
+	}
+#else
+	log_err("fio: idle-prof not supported on this platform\n");
+	return -1;
+#endif
+}
+
+void show_idle_prof_stats(int output, struct json_object *parent)
+{
+	int i, nr_cpus = ipc.nr_cpus;
+	struct json_object *tmp;
+	char s[MAX_CPU_STR_LEN];
+
+	if (output == FIO_OUTPUT_NORMAL) {
+		if (ipc.opt > IDLE_PROF_OPT_CALI)
+			log_info("\nCPU idleness:\n");
+		else if (ipc.opt == IDLE_PROF_OPT_CALI)
+			log_info("CPU idleness:\n");
+
+		if (ipc.opt >= IDLE_PROF_OPT_SYSTEM)
+			log_info("  system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1));
+
+		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
+			log_info("  percpu: %3.2f%%", fio_idle_prof_cpu_stat(0));
+			for (i = 1; i < nr_cpus; i++)
+				log_info(", %3.2f%%", fio_idle_prof_cpu_stat(i));
+			log_info("\n");
+		}
+
+		if (ipc.opt >= IDLE_PROF_OPT_CALI) {
+			log_info("  unit work: mean=%3.2fus,", ipc.cali_mean);
+			log_info(" stddev=%3.2f\n", ipc.cali_stddev);
+		}
+
+		/* dynamic mem allocations can now be freed */
+		if (ipc.opt != IDLE_PROF_OPT_NONE)
+			fio_idle_prof_cleanup();
+
+		return;
+	}
+
+	if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output == FIO_OUTPUT_JSON)) {
+		if (!parent)
+			return;
+
+		tmp = json_create_object();
+		if (!tmp)
+			return;
+
+		json_object_add_value_object(parent, "cpu_idleness", tmp);
+		json_object_add_value_float(tmp, "system", fio_idle_prof_cpu_stat(-1));
+
+		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
+			for (i = 0; i < nr_cpus; i++) {
+				snprintf(s, MAX_CPU_STR_LEN, "cpu-%d", i);
+				json_object_add_value_float(tmp, s, fio_idle_prof_cpu_stat(i));
+			}
+		}
+	}
+}
diff --git a/idletime.h b/idletime.h
new file mode 100644
--- /dev/null
+++ b/idletime.h
@@ -0,0 +1,57 @@
+#ifndef FIO_IDLETIME_H
+#define FIO_IDLETIME_H
+
+#include "fio.h"
+
+#define CALIBRATE_RUNS		10
+#define CALIBRATE_SCALE		1000
+#define MAX_CPU_STR_LEN		32
+
+enum {
+	IDLE_PROF_OPT_NONE,
+	IDLE_PROF_OPT_CALI,		/* calibration only */
+	IDLE_PROF_OPT_SYSTEM,
+	IDLE_PROF_OPT_PERCPU
+};
+
+enum {
+	IDLE_PROF_STATUS_OK,
+	IDLE_PROF_STATUS_CALI_STOP,
+	IDLE_PROF_STATUS_PROF_STOP,
+	IDLE_PROF_STATUS_ABORT
+};
+
+struct idle_prof_thread {
+	pthread_t thread;
+	int cpu;
+	int state;
+	struct timeval tps;
+	struct timeval tpe;
+	double cali_time;		/* microseconds to finish a unit of work */
+	double loops;
+	double idleness;
+	unsigned char *data;		/* bytes to be touched */
+	pthread_cond_t cond;
+	pthread_mutex_t init_lock;
+	pthread_mutex_t start_lock;
+};
+
+struct idle_prof_common {
+	struct idle_prof_thread *ipts;
+	int nr_cpus;
+	int status;
+	int opt;
+	double cali_mean;
+	double cali_stddev;
+	void *buf;			/* single data allocation for all threads */
+};
+
+extern int fio_idle_prof_parse_opt(const char *);
+extern void fio_idle_prof_init(void);
+extern void fio_idle_prof_start(void);
+extern void fio_idle_prof_stop(void);
+extern void fio_idle_prof_cleanup(void);
+
+extern void show_idle_prof_stats(int, struct json_object *);
+
+#endif
diff --git a/init.c b/init.c
--- a/init.c
+++ b/init.c
@@ -1611,6 +1620,14 @@ int parse_cmd_line(int argc, char *argv[])
 		case 'D':
 			pid_file = strdup(optarg);
 			break;
+		case 'I':
+			if ((ret = fio_idle_prof_parse_opt(optarg))) {
+				/* exit on error and calibration only */
+				do_exit++;
+				if (ret == -1)
+					exit_val = 1;
+			}
+			break;
 		case 'C':
 			if (is_backend) {
 				log_err("fio: can't be both client and server\n");
diff --git a/os/os-linux.h b/os/os-linux.h
index 4e837da9..acc5a202 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -35,6 +35,7 @@
 #define FIO_HAVE_TRIM
 #define FIO_HAVE_BINJECT
 #define FIO_HAVE_GETTID
+#define FIO_HAVE_SCHED_IDLE
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 
 #ifdef MAP_HUGETLB
@@ -246,4 +247,10 @@ static inline int os_trim(int fd, unsigned long long start,
 	return errno;
 }
 
+static inline int fio_set_sched_idle(void)
+{
+	struct sched_param p = { .sched_priority = 0, };
+	return sched_setscheduler(gettid(), SCHED_IDLE, &p);
+}
+
 #endif
diff --git a/os/os-windows.h b/os/os-windows.h
index ef71dd70..2d57891f 100644
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -21,6 +21,7 @@
 #define FIO_HAVE_CPU_AFFINITY
 #define FIO_HAVE_CHARDEV_SIZE
 #define FIO_HAVE_GETTID
+#define FIO_HAVE_SCHED_IDLE
 #define FIO_USE_GENERIC_RAND
 
 #define FIO_PREFERRED_ENGINE	"windowsaio"
@@ -247,4 +248,11 @@ static inline int init_random_state(struct thread_data *td, unsigned long *rand_
 }
 
+static inline int fio_set_sched_idle(void)
+{
+	/* SetThreadPriority returns nonzero for success */
+	return (SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE)) ? 0 : -1;
+}
+
+
 #endif /* FIO_OS_WINDOWS_H */
diff --git a/stat.c b/stat.c
index 5cc6b7b9..7e2feea2 100644
--- a/stat.c
+++ b/stat.c
@@ -12,6 +12,7 @@
 #include "lib/ieee754.h"
 #include "json.h"
 #include "lib/getrusage.h"
+#include "idletime.h"
 
 void update_rusage_stat(struct thread_data *td)
 {
@@ -1335,6 +1336,8 @@ void show_run_stats(void)
 		/* disk util stats, if any */
 		show_disk_util(1, root);
 
+		show_idle_prof_stats(FIO_OUTPUT_JSON, root);
+
 		json_print_object(root);
 		log_info("\n");
 		json_free_object(root);
@@ -1355,6 +1358,8 @@ void show_run_stats(void)
 	else if (output_format == FIO_OUTPUT_NORMAL)
 		show_disk_util(0, NULL);
 
+	show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL);
+
 	free(runstats);
 	free(threadstats);
 }
-- 
2.25.1
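
For readers who want to try the idea outside of fio, the following standalone
sketch (not part of the patch, Linux only) shows the same calibrate-then-count
approach on a single CPU: time one unit of work at normal priority, drop to
SCHED_IDLE, then count how many units complete while other work runs. The
4096-byte page, the 10x1000 calibration loop and the one-second profiling
window are arbitrary values chosen for the example; fio itself uses the real
page size, one pinned thread per CPU, and profiles for the duration of the job.

/*
 * Standalone sketch of idletime profiling on one CPU. Assumes Linux.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <time.h>

#define PAGE 4096

/* volatile so the stores are not optimized away */
static volatile unsigned char buf[PAGE];

/* one "unit of work": touch a full page of unsigned characters */
static void fill_unit(void)
{
	int k;

	for (k = 0; k < PAGE; k++)
		buf[k] = k % 256;
}

static double now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1e6 + ts.tv_nsec / 1e3;
}

int main(void)
{
	struct sched_param p = { .sched_priority = 0 };
	double t0, runt, unit_us = 0.0;
	unsigned long loops = 0;
	int i, j;

	/* calibrate at normal priority: minimum time per unit over 10 runs */
	for (i = 0; i < 10; i++) {
		double per;

		t0 = now_us();
		for (j = 0; j < 1000; j++)
			fill_unit();
		per = (now_us() - t0) / 1000.0;
		if (!i || per < unit_us)
			unit_us = per;
	}

	/* drop to SCHED_IDLE: from here on we only run on otherwise idle cycles */
	if (sched_setscheduler(0, SCHED_IDLE, &p))
		perror("sched_setscheduler");

	t0 = now_us();
	while (now_us() - t0 < 1e6) {	/* profile for ~1 second */
		fill_unit();
		loops++;
	}
	runt = now_us() - t0;

	printf("unit: %.3f us, idleness: %.2f%%\n",
	       unit_us, 100.0 * loops * unit_us / runt);
	return 0;
}

Run next to a CPU-bound job, the reported idleness drops roughly in proportion
to the CPU the competing work consumes, which is the same signal fio reports
per CPU along with the calibration mean and stddev in the "unit work" section.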