From d0ca268b05b1f0135e6b0d39da6ef8101fe686d6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Aug 2005 13:49:39 +0200 Subject: [PATCH] Initial commit --- Makefile | 12 ++ README | 19 +++ blkparse.c | 388 +++++++++++++++++++++++++++++++++++++++++++++++++++++ blktrace.c | 313 ++++++++++++++++++++++++++++++++++++++++++ blktrace.h | 79 +++++++++++ 5 files changed, 811 insertions(+) create mode 100644 Makefile create mode 100644 README create mode 100644 blkparse.c create mode 100644 blktrace.c create mode 100644 blktrace.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6b2f30a --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +CC = gcc +CFLAGS = -Wall -O2 -D_GNU_SOURCE +PROG = blkparse blktrace +TRACE_LIBS = -lpthread + +all: $(PROG) + +blkparse: blkparse.o +blktrace: blktrace.o $(TRACE_LIBS) + +clean: + -rm -f *.o $(PROG) diff --git a/README b/README new file mode 100644 index 0000000..3d3ed7e --- /dev/null +++ b/README @@ -0,0 +1,19 @@ +Block IO Tracing +---------------- + +Written by Jens Axboe (initial version and kernel support) and +Alan D. Brunelle (threading and splitup into two seperate programs). + + +To run: + % blktrace + --- run task to generate load to be traced --- + + --- Generates: + _dat.[0..ncpus] : Contains binary pdu data + _out.[0..ncpus] : Contains binary trace data + + % blkparse + --- Generates: + _log.[0..ncpus] : Contains formatted trace data + Merged formatted trace data to stdout diff --git a/blkparse.c b/blkparse.c new file mode 100644 index 0000000..818f44f --- /dev/null +++ b/blkparse.c @@ -0,0 +1,388 @@ +#include +#include +#include +#include +#include +#include + +#include "blktrace.h" + +#define NELEMS(pfi) ((pfi)->stat.st_size / sizeof(struct blk_io_trace)) + +#define MAX_CPUS (1 << 8) /* only 8 bits for now */ + +struct per_file_info { + int cpu; + int nelems; + struct stat stat; + + int fd; + char *fname; + FILE *ofp; + char *ofname; + int dfd; + char *dname; + + unsigned long long start_time; +}; + +struct per_file_info per_file_info[MAX_CPUS]; +struct per_file_info *current; + +static unsigned long qreads, qwrites, creads, cwrites, mreads, mwrites; +static unsigned long long qread_kb, qwrite_kb, cread_kb, cwrite_kb; +static unsigned long long events, missed_events; + +static inline void account_m(int rw, unsigned int bytes) +{ + if (rw) { + mwrites++; + qwrite_kb += bytes >> 10; + } else { + mreads++; + qread_kb += bytes >> 10; + } +} + +static inline void account_q(int rw, unsigned int bytes) +{ + if (rw) { + qwrites++; + qwrite_kb += bytes >> 10; + } else { + qreads++; + qread_kb += bytes >> 10; + } +} + +static inline void account_c(int rw, unsigned int bytes) +{ + if (rw) { + cwrites++; + cwrite_kb += bytes >> 10; + } else { + creads++; + cread_kb += bytes >> 10; + } +} + +void output(char *s) +{ + printf("%s", s); + fprintf(current->ofp,"%s",s); +} + +char hstring[256]; +char tstring[256]; + +inline char *setup_header(struct blk_io_trace *t, char act) +{ + int w = t->action & BLK_TC_ACT(BLK_TC_WRITE); + int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER); + int s = t->action & BLK_TC_ACT(BLK_TC_SYNC); + char rwbs[4]; + int i = 0; + + if (w) + rwbs[i++] = 'W'; + else + rwbs[i++] = 'R'; + if (b) + rwbs[i++] = 'B'; + if (s) + rwbs[i++] = 'S'; + + rwbs[i] = '\0'; + + sprintf(hstring, "%3d %15ld %12Lu %5u %c %3s", current->cpu, + (unsigned long)t->sequence, (unsigned long long)t->time, t->pid, + act, rwbs); + + return hstring; +} + +void log_complete(struct blk_io_trace *t, char act) +{ + sprintf(tstring,"%s %Lu + %u [%d]\n", setup_header(t, act), + (unsigned long long)t->sector, t->bytes >> 9, t->error); + output(tstring); +} + +void log_queue(struct blk_io_trace *t, char act) +{ + sprintf(tstring,"%s %Lu + %u\n", setup_header(t, act), + (unsigned long long)t->sector, t->bytes >> 9); + output(tstring); +} + +void log_issue(struct blk_io_trace *t, char act) +{ + sprintf(tstring,"%s %Lu + %u\n", setup_header(t, act), + (unsigned long long)t->sector, t->bytes >> 9); + output(tstring); +} + +void log_merge(struct blk_io_trace *t, char act) +{ + sprintf(tstring,"%s %Lu + %u\n", setup_header(t, act), + (unsigned long long)t->sector, t->bytes >> 9); + output(tstring); +} + +void log_generic(struct blk_io_trace *t, char act) +{ + sprintf(tstring,"%s %Lu + %u\n", setup_header(t, act), + (unsigned long long)t->sector, t->bytes >> 9); + output(tstring); +} + +void log_pc(struct blk_io_trace *t, char act) +{ + int i, ret; + unsigned char buf[64]; + + sprintf(tstring,"%s\n", setup_header(t, act)); + output(tstring); + + if (t->pdu_len > sizeof(buf)) { + fprintf(stderr, "Payload too large %d\n", t->pdu_len); + return; + } + + ret = read(current->dfd, buf, t->pdu_len); + if (ret != t->pdu_len) { + fprintf(stderr,"read(%d) failed on %s - %d\n", t->pdu_len, + current->dname, ret); + exit(1); + } + + for (i = 0; i < t->pdu_len; i++) { + sprintf(tstring,"%02x ", buf[i]); + output(tstring); + } + + if (act == 'C') { + sprintf(tstring,"[%d]", t->error); + output(tstring); + } + + printf("\n"); +} + +void dump_trace_pc(struct blk_io_trace *t) +{ + switch (t->action & 0xffff) { + case __BLK_TA_QUEUE: + log_generic(t, 'Q'); + break; + case __BLK_TA_GETRQ: + log_generic(t, 'G'); + break; + case __BLK_TA_SLEEPRQ: + log_generic(t, 'S'); + break; + case __BLK_TA_REQUEUE: + log_generic(t, 'R'); + break; + case __BLK_TA_ISSUE: + log_pc(t, 'D'); + break; + case __BLK_TA_COMPLETE: + log_pc(t, 'C'); + break; + default: + fprintf(stderr, "Bad pc action %x\n", t->action); + return; + } + + events++; +} + +void dump_trace_fs(struct blk_io_trace *t) +{ + int w = t->action & BLK_TC_ACT(BLK_TC_WRITE); + + switch (t->action & 0xffff) { + case __BLK_TA_QUEUE: + account_q(w, t->bytes); + log_queue(t, 'Q'); + break; + case __BLK_TA_BACKMERGE: + account_m(w, t->bytes); + log_merge(t, 'M'); + break; + case __BLK_TA_FRONTMERGE: + account_m(w, t->bytes); + log_merge(t, 'F'); + break; + case __BLK_TA_GETRQ: + log_generic(t, 'G'); + break; + case __BLK_TA_SLEEPRQ: + log_generic(t, 'S'); + break; + case __BLK_TA_REQUEUE: + log_queue(t, 'R'); + break; + case __BLK_TA_ISSUE: + log_issue(t, 'D'); + break; + case __BLK_TA_COMPLETE: + account_c(w, t->bytes); + log_complete(t, 'C'); + break; + default: + fprintf(stderr, "Bad fs action %x\n", t->action); + return; + } + + events++; +} + +void dump_trace(struct blk_io_trace *t) +{ + if (t->action & BLK_TC_ACT(BLK_TC_PC)) + dump_trace_pc(t); + else + dump_trace_fs(t); +} + +void show_stats(void) +{ + printf("\nReads:"); + printf("\tQueued: %'8lu, %'8LuKiB\n", qreads, qread_kb); + printf("\tCompleted: %'8lu, %'8LuKiB\n", creads, cread_kb); + printf("\tMerges: %'8lu\n", mreads); + + printf("Writes:"); + printf("\tQueued: %'8lu, %'8LuKiB\n", qwrites, qwrite_kb); + printf("\tCompleted: %'8lu, %'8LuKiB\n", cwrites, cwrite_kb); + printf("\tMerges: %'8lu\n", mwrites); + + printf("Events: %'Lu\n", events); + printf("Missed events: %'Lu\n", missed_events); +} + +int compar(const void *t1, const void *t2) +{ + long v1 = (long)(((struct blk_io_trace *)t1)->sequence); + long v2 = (long)(((struct blk_io_trace *)t2)->sequence); + + return v1 - v2; +} + +int main(int argc, char *argv[]) +{ + char *p, *dev; + int i, nfiles, nelems, nb, ret; + struct per_file_info *pfi; + struct blk_io_trace *traces, *tip; + + if (argc != 2) { + fprintf(stderr, "Usage %s \n", argv[0]); + return 1; + } + + dev = argv[1]; + + printf("First pass:\n"); + nfiles = nelems = 0; + for (i = 0, pfi = &per_file_info[0]; i < MAX_CPUS; i++, pfi++) { + pfi->cpu = i; + pfi->start_time = 0; + + pfi->fname = malloc(128); + sprintf(pfi->fname, "%s_out.%d", dev, i); + if (stat(pfi->fname, &pfi->stat) < 0) + break; + if (!S_ISREG(pfi->stat.st_mode)) { + fprintf(stderr, "Bad file type %s\n", pfi->fname); + return 1; + } + + nfiles++; + pfi->nelems = NELEMS(pfi); + nelems += pfi->nelems; + printf("\t%2d %10s %15d\n", i, pfi->fname, pfi->nelems); + + } + printf("\t %15d\n", nelems); + + if (!i) { + fprintf(stderr, "No files found\n"); + return 1; + } + + traces = malloc(nelems * sizeof(struct blk_io_trace)); + if (traces == NULL) { + fprintf(stderr, "Can not allocate %d\n", + nelems * (int) sizeof(struct blk_io_trace)); + return 1; + } + + printf("Second pass:\n"); + p = (char *)traces; + for (i = 0, pfi = per_file_info; i < nfiles; i++, pfi++) { + pfi->fd = open(pfi->fname, O_RDONLY); + if (pfi->fd < 0) { + perror(pfi->fname); + return 1; + } + + pfi->dname = malloc(128); + sprintf(pfi->dname, "%s_dat.%d", dev, i); + pfi->dfd = open(pfi->dname, O_RDONLY); + if (pfi->dfd < 0) { + perror(pfi->dname); + return 1; + } + + pfi->ofname = malloc(128); + sprintf(pfi->ofname, "%s_log.%d", dev, i); + pfi->ofp = fopen(pfi->ofname, "w"); + if (pfi->ofp == NULL) { + perror(pfi->ofname); + return 1; + } + + printf("\tProcessing %s...", pfi->fname); fflush(stdout); + nb = pfi->stat.st_size; + ret = read(pfi->fd, p, nb); + if (ret != nb) { + perror(pfi->fname); + fprintf(stderr,"\nFATAL: read(%d) -> %d\n", nb, ret); + return 1; + } + printf("\n"); fflush(stdout); + p += nb; + close(pfi->fd); + } + + printf("Sorting..."); fflush(stdout); + qsort(traces, nelems, sizeof(struct blk_io_trace), compar); + printf("\n\n"); + + for (i = 0, tip = traces; i < nelems; i++, tip++) { + int cpu = tip->magic & 0xff; + + current = &per_file_info[cpu]; + + /* + * offset time by first trace event. + * + * NOTE: This is *cpu* relative, thus you can not + * compare times ACROSS cpus. + */ + if (current->start_time == 0) + current->start_time = tip->time; + + tip->time -= current->start_time; + + dump_trace(tip); + } + + show_stats(); + return 0; +} + + diff --git a/blktrace.c b/blktrace.c new file mode 100644 index 0000000..3083f04 --- /dev/null +++ b/blktrace.c @@ -0,0 +1,313 @@ +/* + * block queue tracing application + * + * TODO (in no particular order): + * - Add ability to specify capture mask instead of logging all events + * - Add option for relayfs mount point + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blktrace.h" + +#define BUF_SIZE (128 *1024) +#define BUF_NR (4) + +struct thread_information { + int cpu; + pthread_t thread; + unsigned long events_processed; +}; + +static char relay_path[] = "/relay/"; + +#define is_done() (*(volatile int *)(&done)) +static volatile int done; + +static int devfd, ncpus; +static struct thread_information *thread_information; +static char *buts_name_p; + +int start_trace(char *dev) +{ + struct blk_user_trace_setup buts; + + devfd = open(dev, O_RDONLY); + if (devfd < 0) { + perror(dev); + return 1; + } + + memset(&buts, sizeof(buts), 0); + buts.buf_size = BUF_SIZE; + buts.buf_nr = BUF_NR; + + printf("Starting trace on %s\n", dev); + if (ioctl(devfd, BLKSTARTTRACE, &buts) < 0) { + perror("BLKSTARTTRACE"); + return 1; + } + + buts_name_p = strdup(buts.name); + return 0; +} + +void stop_trace(void) +{ + if (ioctl(devfd, BLKSTOPTRACE) < 0) + perror("BLKSTOPTRACE"); + + close(devfd); +} + +inline int verify_trace(struct blk_io_trace *t) +{ + if (!CHECK_MAGIC(t)) { + fprintf(stderr, "bad trace magic %x\n", t->magic); + return 1; + } + if ((t->magic & 0xff) != SUPPORTED_VERSION) { + fprintf(stderr, "unsupported trace version %x\n", + t->magic & 0xff); + return 1; + } + + return 0; +} + +void extract_data(int cpu, char *ifn, int ifd, char *ofn, int ofd, int nb) +{ + int ret, bytes_left; + unsigned char buf[nb], *p; + + p = buf; + bytes_left = nb; + while (bytes_left > 0) { + ret = read(ifd, p, bytes_left); + if (ret < 0) { + perror(ifn); + fprintf(stderr, "Thread %d extract_data %s failed\n", + cpu, ifn); + exit(1); + } else if (ret == 0) + usleep(1000); + else { + p += ret; + bytes_left -= ret; + } + } + + ret = write(ofd, buf, nb); + if (ret != nb) { + perror(ofn); + fprintf(stderr,"Thread %d extract_data %s failed\n", cpu, ofn); + exit(1); + } +} + +void *extract(void *arg) +{ + struct thread_information *tip = arg; + int tracefd, ret, ofd, dfd; + char ip[64], op[64], dp[64]; + struct blk_io_trace t; + pid_t pid = getpid(); + cpu_set_t cpu_mask; + + CPU_ZERO(&cpu_mask); + CPU_SET(tip->cpu, &cpu_mask); + + if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) { + perror("sched_setaffinity"); + exit(1); + } + + sprintf(op, "%s_out.%d", buts_name_p, tip->cpu); + ofd = open(op, O_CREAT|O_TRUNC|O_WRONLY, 0644); + if (ofd < 0) { + perror(op); + fprintf(stderr,"Thread %d failed creat of %s\n", tip->cpu, op); + exit(1); + } + + sprintf(dp, "%s_dat.%d", buts_name_p, tip->cpu); + dfd = open(dp, O_CREAT|O_TRUNC|O_WRONLY, 0644); + if (dfd < 0) { + perror(dp); + fprintf(stderr,"Thread %d failed creat of %s\n", tip->cpu, dp); + exit(1); + } + + sprintf(ip, "%s%s%d", relay_path, buts_name_p, tip->cpu); + tracefd = open(ip, O_RDONLY); + if (tracefd < 0) { + perror(ip); + fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu, ip); + exit(1); + } + + while (!is_done()) { + ret = read(tracefd, &t, sizeof(t)); + if (ret != sizeof(t)) { + if (ret < 0) { + perror(ip); + fprintf(stderr,"Thread %d failed read of %s\n", + tip->cpu, ip); + exit(1); + } else if (ret > 0) { + fprintf(stderr,"Thread %d misread %s %d,%ld\n", + tip->cpu, ip, ret, sizeof(t)); + exit(1); + } else { + usleep(10000); + continue; + } + } + + if (verify_trace(&t)) + exit(1); + + switch (t.action & 0xffff) { + case __BLK_TA_ISSUE: + case __BLK_TA_COMPLETE: + if (!t.pdu_len) + break; + else if (t.pdu_len > 64) { + fprintf(stderr, + "Thread %d Payload too large %d\n", + tip->cpu, t.pdu_len); + exit(1); + } + extract_data(tip->cpu, ip, tracefd, dp, dfd, t.pdu_len); + break; + } + + /* version is verified, stuff with CPU number now */ + t.magic = (t.magic & ~0xff) | tip->cpu; + ret = write(ofd, &t, sizeof(t)); + if (ret < 0) { + perror(op); + fprintf(stderr,"Thread %d failed write of %s\n", + tip->cpu, op); + exit(1); + } + + tip->events_processed++; + } + + return NULL; +} + +int start_threads(void) +{ + struct thread_information *tip; + int i; + + ncpus = sysconf(_SC_NPROCESSORS_ONLN); + if (ncpus < 0) { + fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n"); + return 1; + } + printf("Processors online: %d\n", ncpus); + + thread_information = malloc(ncpus * sizeof(struct thread_information)); + for (i = 0, tip = thread_information; i < ncpus; i++, tip++) { + tip->cpu = i; + tip->events_processed = 0; + + if (pthread_create(&tip->thread, NULL, extract, tip)) { + perror( "pthread_create"); + return 0; + } + } + + return ncpus; +} + +void show_stats(void) +{ + int i; + struct thread_information *tip; + unsigned long events_processed = 0; + + for (i = 0, tip = thread_information; i < ncpus; i++, tip++) { + printf("CPU%3d: %20ld events\n", + tip->cpu, tip->events_processed); + events_processed += tip->events_processed; + } + + printf("Total: %20ld events\n", events_processed); +} + +void handle_sigint(int sig) +{ + printf("exiting on signal %d\n", sig); + done = 1; +} + +int main(int argc, char *argv[]) +{ + struct thread_information *tip; + struct stat st; + int i; + + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + if (stat(relay_path, &st) < 0) { + fprintf(stderr,"%s does not appear to be mounted\n", + relay_path); + return 2; + } + + if (start_trace(argv[1])) { + fprintf(stderr, "Failed to start trace\n"); + stop_trace(); + return 3; + } + + setlocale(LC_NUMERIC, "en_US"); + + i = start_threads(); + if (!i) { + fprintf(stderr, "Failed to start worker threads\n"); + stop_trace(); + return 4; + } + + printf("Threads started : %d\n", i); + + signal(SIGINT, handle_sigint); + signal(SIGHUP, handle_sigint); + signal(SIGTERM, handle_sigint); + + while (!is_done()) + sleep(1); + + for (i = 0, tip = thread_information; i < ncpus; i++, tip++) { + int ret; + + if (pthread_join(tip->thread, (void *) &ret)) + perror("thread_join"); + } + + stop_trace(); + close(devfd); + show_stats(); + + return 0; +} + diff --git a/blktrace.h b/blktrace.h new file mode 100644 index 0000000..2f35a45 --- /dev/null +++ b/blktrace.h @@ -0,0 +1,79 @@ +#if !defined(__BLKTRACE_H__) +#define __BLKTRACE_H__ 1 + +#include + +enum { + BLK_TC_READ = 1 << 0, /* reads */ + BLK_TC_WRITE = 1 << 1, /* writes */ + BLK_TC_BARRIER = 1 << 2, /* barrier */ + BLK_TC_SYNC = 1 << 3, /* barrier */ + BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ + BLK_TC_REQUEUE = 1 << 5, /* requeueing */ + BLK_TC_ISSUE = 1 << 6, /* issue */ + BLK_TC_COMPLETE = 1 << 7, /* completions */ + BLK_TC_FS = 1 << 8, /* fs requests */ + BLK_TC_PC = 1 << 9, /* pc requests */ + + BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ +}; + +#define BLK_TC_SHIFT (16) +#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) + +/* + * Basic trace actions + */ +enum { + __BLK_TA_QUEUE = 1, /* queued */ + __BLK_TA_BACKMERGE, /* back merged to existing rq */ + __BLK_TA_FRONTMERGE, /* front merge to existing rq */ + __BLK_TA_GETRQ, /* allocated new request */ + __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ + __BLK_TA_REQUEUE, /* request requeued */ + __BLK_TA_ISSUE, /* sent to driver */ + __BLK_TA_COMPLETE, /* completed by driver */ +}; + +/* + * Trace actions in full. Additionally, read or write is masked + */ +#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) +#define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) + +#define BLK_IO_TRACE_MAGIC (0x65617400) +#define CHECK_MAGIC(t) (((t)->magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) +#define SUPPORTED_VERSION (0x02) + +struct blk_io_trace { + __u32 magic; + __u32 sequence; + __u64 time; + __u64 sector; + __u32 bytes; + __u32 action; + __u32 pid; + __u16 error; + __u16 pdu_len; +}; + +struct blk_user_trace_setup { + char name[32]; + __u16 act_mask; + __u32 buf_size; + __u32 buf_nr; +}; + + +#ifndef BLKSTARTTRACE +#define BLKSTARTTRACE _IOWR(0x12,115,struct blk_user_trace_setup) +#define BLKSTOPTRACE _IO(0x12,116) +#endif + +#endif -- 2.25.1