#include <libaio.h>
#include <math.h>
#include <limits.h>
+#include <assert.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/shm.h>
#include <asm/unistd.h>
#include <asm/types.h>
+#include <asm/bitops.h>
#include "list.h"
#include "md5.h"
#define td_read(td) ((td)->ddir == DDIR_READ)
#define should_fsync(td) (!td_read(td) && !(td)->odirect)
+#define BLOCKS_PER_MAP (8 * sizeof(long)) /* number of bits, i.e. blocks tracked, in one map word */
+#define RAND_MAP_IDX(sector) ((sector) / BLOCKS_PER_MAP) /* index of the map word covering the block; callers pass a block index despite the parameter name */
+#define RAND_MAP_BIT(sector) ((sector) & (BLOCKS_PER_MAP - 1)) /* bit position of the block inside that map word */
+
struct thread_data {
char file_name[256];
int thread_number;
unsigned int ratecycle;
unsigned long rate_usec_cycle;
long rate_pending_usleep;
- unsigned long rate_sectors;
+ unsigned long rate_bytes;
struct timeval lastrate;
unsigned long runtime; /* sec */
- unsigned long sectors;
+ unsigned long long io_size;
unsigned long io_blocks;
- unsigned long io_sectors;
- unsigned long this_io_sectors;
- unsigned long last_sectors;
+ unsigned long io_bytes;
+ unsigned long this_io_bytes;
+ unsigned long last_bytes;
sem_t mutex;
+
struct drand48_data random_state;
+ unsigned long *file_map;
+ unsigned int num_maps;
/*
* bandwidth and latency stats
struct io_stat slat_stat; /* submission latency */
struct io_stat bw_stat; /* bandwidth stats */
- unsigned long stat_io_sectors;
+ unsigned long stat_io_bytes;
struct timeval stat_sample_time;
struct io_log *lat_log;
static int init_random_state(struct thread_data *td)
{
unsigned long seed;
- int fd;
+ int fd, num_maps, blocks;
fd = open("/dev/random", O_RDONLY);
if (fd == -1) {
if (repeatable)
seed = DEF_RANDSEED;
+ blocks = (td->io_size + td->min_bs - 1) / td->min_bs;
+ num_maps = blocks / BLOCKS_PER_MAP;
+ td->file_map = malloc(num_maps * sizeof(long));
+ td->num_maps = num_maps;
+ memset(td->file_map, 0, num_maps * sizeof(long));
+
srand48_r(seed, &td->random_state);
return 0;
}
return s->tv_sec * 1000 + s->tv_usec / 1000;
}
-static unsigned long long get_next_offset(struct thread_data *td)
+/*
+ * Return non-zero if 'block' (an index in min_bs units into the random map)
+ * has not been marked as used yet, zero otherwise.
+ *
+ * A block whose map word lies outside the num_maps words of file_map is
+ * reported as not free, so no caller can be led into reading or marking
+ * memory past the end of the map.
+ */
+static int random_map_free(struct thread_data *td, unsigned long long block)
+{
+	unsigned long long idx = RAND_MAP_IDX(block);
+	unsigned int bit = RAND_MAP_BIT(block);
+
+	if (idx >= td->num_maps)
+		return 0;
+
+	return (td->file_map[idx] & (1UL << bit)) == 0;
+}
+
+/*
+ * Scan the random map word by word for the first block that is not marked
+ * as used. On success the block index is stored in *b and 0 is returned;
+ * 1 is returned when no free block inside the io region is left.
+ */
+static int get_next_free_block(struct thread_data *td, unsigned long long *b)
{
- unsigned long long kb;
+	unsigned int i;
+
+	*b = 0;
+	i = 0;
+	/* never walk past the allocated map words */
+	while (i < td->num_maps && (*b) * td->min_bs < td->io_size) {
+		if (td->file_map[i] != -1UL) {
+			*b += ffz(td->file_map[i]);
+
+			/*
+			 * the last word can have spare bits that map to
+			 * blocks past the end of the io region; a zero bit
+			 * found there is not a usable block
+			 */
+			if ((*b) * td->min_bs >= td->io_size)
+				break;
+
+			return 0;
+		}
+
+		*b += BLOCKS_PER_MAP;
+		i++;
+	}
+
+	return 1;
+}
+
+/*
+ * Mark the min_bs sized blocks covered by io_u as used in the random map.
+ * Marking stops at the first block that is already in use, and io_u->buflen
+ * is trimmed down to the blocks that were actually marked.
+ */
+static void mark_random_map(struct thread_data *td, struct io_u *io_u)
+{
+	/*
+	 * io_u->offset is the position handed out by get_next_offset(), which
+	 * already has td->file_offset added; the map itself is indexed from
+	 * block 0 of the io region, so take the file offset back out
+	 */
+	unsigned long long block = (io_u->offset - td->file_offset) / td->min_bs;
+	unsigned int blocks = 0;
+
+	while (blocks < (io_u->buflen / td->min_bs)) {
+		unsigned int idx = RAND_MAP_IDX(block);
+		unsigned int bit = RAND_MAP_BIT(block);
+
+		/* check the index before the map word is read, not after */
+		assert(idx < td->num_maps);
+
+		if (!random_map_free(td, block))
+			break;
+
+		td->file_map[idx] |= (1UL << bit);
+		block++;
+		blocks++;
+	}
+
+	if ((blocks * td->min_bs) < io_u->buflen)
+		io_u->buflen = blocks * td->min_bs;
+}
+
+/*
+ * Pick the file position of the next io and store it in *offset.
+ *
+ * For random io a block is drawn at random until one is found that is still
+ * free in the random map; after 50 unsuccessful draws the map is scanned
+ * linearly instead. For sequential io the position follows td->last_bytes.
+ * The stored offset includes td->file_offset.
+ *
+ * Returns 0 on success, 1 if random io has no free block left.
+ */
+static int get_next_offset(struct thread_data *td, unsigned long long *offset)
+{
+ unsigned long long b;
long r;
if (!td->sequential) {
- int min_bs_kb = td->min_bs >> 10;
- unsigned long max_kb = td->sectors << 1;
-
- lrand48_r(&td->random_state, &r);
- kb = (1+(double) (max_kb-1) * r / (RAND_MAX+1.0));
- kb = (kb + min_bs_kb - 1) & ~(min_bs_kb - 1);
+ unsigned long max_blocks = td->io_size / td->min_bs;
+ int loops = 50;
+
+ do {
+ lrand48_r(&td->random_state, &r);
+ b = ((max_blocks - 1) * r / (RAND_MAX+1.0));
+ loops--;
+ } while (!random_map_free(td, b) && loops);
+
+ /*
+ * fall back to the linear scan only if the last draw is really
+ * in use; testing the loop counter alone would throw away a free
+ * block found on the final attempt
+ */
+ if (!random_map_free(td, b)) {
+ if (get_next_free_block(td, &b))
+ return 1;
+ }
} else
- kb = td->last_sectors << 1;
+ b = td->last_bytes / td->min_bs;
- return (kb << 10) + td->file_offset;
+ *offset = (b * td->min_bs) + td->file_offset;
+ return 0;
}
static unsigned int get_next_buflen(struct thread_data *td)
buflen = (buflen + td->min_bs - 1) & ~(td->min_bs - 1);
}
- if (buflen > ((td->sectors - td->this_io_sectors) << 9))
- buflen = (td->sectors - td->this_io_sectors) << 9;
+ if (buflen > td->io_size - td->this_io_bytes)
+ buflen = td->io_size - td->this_io_bytes;
- td->last_sectors += buflen >> 9;
return buflen;
}
if (spent < td->bw_avg_time)
return;
- rate = ((td->this_io_sectors - td->stat_io_sectors) << 9) / spent;
+ rate = (td->this_io_bytes - td->stat_io_bytes) / spent;
add_stat_sample(td, &td->bw_stat, rate);
if (td->bw_log)
add_log_sample(td, td->bw_log, rate);
gettimeofday(&td->stat_sample_time, NULL);
- td->stat_io_sectors = td->this_io_sectors;
+ td->stat_io_bytes = td->this_io_bytes;
}
static void usec_sleep(int usec)
/*
* if rate blocks is set, sample is running
*/
- if (td->rate_sectors) {
+ if (td->rate_bytes) {
spent = mtime_since(&td->lastrate, now);
if (spent < td->ratecycle)
return 0;
- rate = ((td->this_io_sectors - td->rate_sectors) << 9) / spent;
+ rate = (td->this_io_bytes - td->rate_bytes) / spent;
if (rate < td->ratemin) {
printf("Client%d: min rate %d not met, got %ldKiB/sec\n", td->thread_number, td->ratemin, rate);
if (rate_quit)
}
}
- td->rate_sectors = td->this_io_sectors;
+ td->rate_bytes = td->this_io_bytes;
memcpy(&td->lastrate, now, sizeof(*now));
return 0;
}
if (!io_u)
return NULL;
- io_u->offset = get_next_offset(td);
+ if (get_next_offset(td, &io_u->offset))
+ return NULL;
+
io_u->buflen = get_next_buflen(td);
if (!io_u->buflen) {
put_io_u(td, io_u);
return NULL;
}
+ if (io_u->buflen + io_u->offset > td->io_size)
+ io_u->buflen = td->io_size - io_u->offset;
+
+ if (!td->sequential)
+ mark_random_map(td, io_u);
+
+ td->last_bytes += io_u->buflen;
+
if (td->verify)
populate_io_u(td, io_u);
}
}
-/*
- * if ipo's overlap, kill old ipo
- */
-static int ipo_overlap(struct io_piece *old, struct io_piece *new)
-{
- unsigned long long old_end = old->offset + old->len;
- unsigned long long new_end = new->offset + new->len;
-
- if ((new->offset > old->offset && new->offset < old_end) ||
- (new_end > old->offset && new_end < old_end)) {
- list_add(&new->list, &old->list);
- list_del(&old->list);
- free(old);
- return 1;
- }
-
- return 0;
-}
-
/*
* log a succesful write, so we can unwind the log for verify
*/
static void log_io_piece(struct thread_data *td, struct io_u *io_u)
{
- struct io_piece *ipo = malloc(sizeof(*ipo));
+ struct io_piece *ipo = malloc(sizeof(struct io_piece));
struct list_head *entry;
INIT_LIST_HEAD(&ipo->list);
ipo->offset = io_u->offset;
ipo->len = io_u->buflen;
- if (td->sequential) {
+ /*
+ * for random io where the writes extend the file, it will typically
+ * be laid out with the block scattered as written. it's faster to
+ * read them in in that order again, so don't sort
+ */
+ if (td->sequential || !td->overwrite) {
list_add_tail(&ipo->list, &td->io_hist_list);
return;
}
while ((entry = entry->prev) != &td->io_hist_list) {
struct io_piece *__ipo = list_entry(entry, struct io_piece, list);
- if (ipo_overlap(__ipo, ipo))
- return;
-
if (__ipo->offset < ipo->offset)
break;
}
io_u = __get_io_u(td);
if (!td->odirect) {
- unsigned long size = td->sectors << 9;
-
- if (fadvise(td->fd, 0, size, POSIX_FADV_DONTNEED) < 0) {
+ if (fadvise(td->fd, td->file_offset, td->io_size, POSIX_FADV_DONTNEED) < 0) {
td->error = errno;
goto out;
}
struct io_u *io_u = NULL;
struct timeval e;
- while (td->this_io_sectors < td->sectors) {
+ while (td->this_io_bytes < td->io_size) {
int ret;
if (td->terminate)
log_io_piece(td, io_u);
td->io_blocks++;
- td->io_sectors += io_u->buflen >> 9;
- td->this_io_sectors += io_u->buflen >> 9;
+ td->io_bytes += io_u->buflen;
+ td->this_io_bytes += io_u->buflen;
td->cur_off = io_u->offset + io_u->buflen;
gettimeofday(&e, NULL);
io_u = ev_to_iou(td->aio_events + i);
td->io_blocks++;
- td->io_sectors += io_u->buflen >> 9;
- td->this_io_sectors += io_u->buflen >> 9;
+ td->io_bytes += io_u->buflen;
+ td->this_io_bytes += io_u->buflen;
msec = mtime_since(&io_u->issue_time, &e);
struct timeval s, e;
unsigned long usec;
- while (td->this_io_sectors < td->sectors) {
+ while (td->this_io_bytes < td->io_size) {
struct timespec ts = { .tv_sec = 0, .tv_nsec = 0};
struct timespec *timeout;
int ret, min_evts = 0;
return 1;
}
- td->sectors = td->file_size >> 9;
+ td->io_size = td->file_size;
b = malloc(td->max_bs);
memset(b, 0, td->max_bs);
st.st_size = td->file_size;
}
- td->sectors = (st.st_size - td->file_offset) / 1024;
- if (!td->sectors) {
+ if (td->file_offset > st.st_size) {
+ fprintf(stderr, "Client%d: offset larger than length\n", td->thread_number);
+ return 1;
+ }
+
+ td->io_size = st.st_size - td->file_offset;
+ if (td->io_size == 0) {
fprintf(stderr, "Client%d: no io blocks\n", td->thread_number);
td->error = EINVAL;
return 1;
lseek(td->fd, SEEK_SET, 0);
td->cur_off = 0;
- td->last_sectors = 0;
- td->stat_io_sectors = 0;
- td->this_io_sectors = 0;
+ td->last_bytes = 0;
+ td->stat_io_bytes = 0;
+ td->this_io_bytes = 0;
+
+ if (td->file_map)
+ memset(td->file_map, 0, td->num_maps * sizeof(long));
}
static void *thread_main(int shm_id, int offset, char *argv[])
if (td->use_aio && init_aio(td))
goto err;
- if (init_random_state(td))
- goto err;
-
if (td->ioprio) {
if (ioprio_set(IOPRIO_WHO_PROCESS, 0, td->ioprio) == -1) {
td->error = errno;
if (!td->create_serialize && setup_file(td))
goto err;
+ if (init_random_state(td))
+ goto err;
+
gettimeofday(&td->start, NULL);
while (td->loops--) {
unsigned long min, max, bw = 0;
double mean, dev;
- if (!td->io_sectors && !td->error)
+ if (!td->io_bytes && !td->error)
return;
if (td->runtime)
- bw = (td->io_sectors << 9) / td->runtime;
+ bw = td->io_bytes / td->runtime;
prio = td->ioprio & 0xff;
prio_class = td->ioprio >> IOPRIO_CLASS_SHIFT;
- printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/s, runt=%6lumsec\n", td->thread_number, td->error, td->io_sectors << 9, bw, td->runtime);
+ printf("Client%d: err=%2d, io=%6luMiB, bw=%6luKiB/s, runt=%6lumsec\n", td->thread_number, td->error, td->io_bytes >> 20, bw, td->runtime);
if (calc_lat(&td->slat_stat, &min, &max, &mean, &dev))
printf(" slat (msec): min=%5lu, max=%5lu, avg=%5.02f, dev=%5.02f\n", min, max, mean, dev);
return -1;
}
- nr_reads_per_sec = td->rate * 1024 / td->min_bs;
+ nr_reads_per_sec = (td->rate * 1024) / td->min_bs;
td->rate_usec_cycle = 1000000 / nr_reads_per_sec;
td->rate_pending_usleep = 0;
return 0;
max_run[td->ddir] = td->runtime;
if (td->runtime)
- bw = (td->io_sectors << 9) / td->runtime;
+ bw = td->io_bytes / td->runtime;
if (bw < min_bw[td->ddir])
min_bw[td->ddir] = bw;
if (bw > max_bw[td->ddir])
max_bw[td->ddir] = bw;
- io_mb[td->ddir] += td->io_sectors >> 9;
+ io_mb[td->ddir] += td->io_bytes >> 20;
}
show_thread_status(td);
}
if (max_run[0])
- agg[0] = io_mb[0] * 1024 * 1000 / max_run[0];
+ agg[0] = (io_mb[0] * 1024 * 1000) / max_run[0];
if (max_run[1])
- agg[1] = io_mb[1] * 1024 * 1000 / max_run[1];
+ agg[1] = (io_mb[1] * 1024 * 1000) / max_run[1];
printf("\nRun status:\n");
if (max_run[DDIR_READ])