-rw-r--r--  blkparse.c                                                                           | 399
-rw-r--r--  blktrace.h                                                                           |   8
-rw-r--r--  blktrace_api.h                                                                       |   2
-rw-r--r--  kernel/blk-trace-2.6.16-rc1-git-U0 (renamed from kernel/blk-trace-2.6.16-rc1-git-T0) |  79
4 files changed, 256 insertions(+), 232 deletions(-)
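Editorial note: the change below drops the kernel's single spinlock-protected sequence counter in favor of one counter per CPU (bumping BLK_IO_TRACE_VERSION to 0x06), and moves blkparse's sequence/reordering state from per-device to per-CPU to match. A minimal, self-contained userspace sketch of that per-CPU sequence pattern follows; the names here (MAX_CPUS, struct event, emit_event) are illustrative only and do not appear in blktrace:

	/*
	 * Sketch only: one gap-free sequence per CPU means the emit path
	 * needs no shared lock; a reader must instead merge the per-CPU
	 * streams and detect gaps per (cpu, sequence) pair, which is what
	 * blkparse's per_cpu_info state below is for.
	 */
	#include <stdio.h>

	#define MAX_CPUS 4			/* illustrative, not from blktrace */

	struct event {
		int cpu;
		unsigned long seq;		/* contiguous within one CPU */
	};

	static unsigned long sequence[MAX_CPUS];	/* one counter per CPU */

	static struct event emit_event(int cpu)
	{
		/* no lock needed: only this CPU touches sequence[cpu] */
		struct event e = { .cpu = cpu, .seq = ++sequence[cpu] };
		return e;
	}

	int main(void)
	{
		int i;

		/* events interleave across CPUs; each CPU's seq stays gap-free */
		for (i = 0; i < 8; i++) {
			struct event e = emit_event(i % MAX_CPUS);
			printf("cpu%d seq %lu\n", e.cpu, e.seq);
		}
		return 0;
	}

The parser-side counterpart is visible in check_sequence() below, which now derives expected_sequence from pci->last_sequence rather than pdi->last_sequence.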
diff --git a/blkparse.c b/blkparse.c
--- a/blkparse.c
+++ b/blkparse.c
@@ -52,15 +52,11 @@ struct per_dev_info {
 	unsigned long long last_reported_time;
 	unsigned long long last_read_time;
 	struct io_stats io_stats;
-	unsigned long last_sequence;
 	unsigned long skips, nskips;
 	unsigned long long seq_skips, seq_nskips;
 	unsigned int max_depth[2];
 	unsigned int cur_depth[2];
 
-	struct rb_root rb_last;
-	unsigned long rb_last_entries;
-
 	struct rb_root rb_track;
 
 	int nfiles;
@@ -231,6 +227,7 @@ struct io_track {
 static int ndevices;
 static struct per_dev_info *devices;
 static char *get_dev_name(struct per_dev_info *, char *, int);
+static int trace_rb_insert_last(struct per_dev_info *, struct trace *);
 
 FILE *ofp = NULL;
 static char *output_name;
@@ -238,7 +235,6 @@ static char *input_dir;
 
 static unsigned long long genesis_time;
 static unsigned long long last_allowed_time;
-static unsigned int smallest_seq_read;
 static unsigned long long stopwatch_start;	/* start from zero by default */
 static unsigned long long stopwatch_end = -1ULL;	/* "infinity" */
 
@@ -267,6 +263,94 @@ static volatile int done;
 #define CPU_IDX(cpu)	((cpu) / CPUS_PER_LONG)
 #define CPU_BIT(cpu)	((cpu) & (CPUS_PER_LONG - 1))
 
+static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
+{
+	struct per_cpu_info *cpus = pdi->cpus;
+	int ncpus = pdi->ncpus;
+	int new_count = cpu + 1;
+	int new_space, size;
+	char *new_start;
+
+	size = new_count * sizeof(struct per_cpu_info);
+	cpus = realloc(cpus, size);
+	if (!cpus) {
+		char name[20];
+		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
+			get_dev_name(pdi, name, sizeof(name)), size);
+		exit(1);
+	}
+
+	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
+	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
+	memset(new_start, 0, new_space);
+
+	pdi->ncpus = new_count;
+	pdi->cpus = cpus;
+
+	for (new_count = 0; new_count < pdi->ncpus; new_count++) {
+		struct per_cpu_info *pci = &pdi->cpus[new_count];
+
+		if (!pci->fd) {
+			pci->fd = -1;
+			memset(&pci->rb_last, 0, sizeof(pci->rb_last));
+			pci->rb_last_entries = 0;
+			pci->last_sequence = -1;
+		}
+	}
+}
+
+static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
+{
+	struct per_cpu_info *pci;
+
+	if (cpu >= pdi->ncpus)
+		resize_cpu_info(pdi, cpu);
+
+	pci = &pdi->cpus[cpu];
+	pci->cpu = cpu;
+	return pci;
+}
+
+
+static int resize_devices(char *name)
+{
+	int size = (ndevices + 1) * sizeof(struct per_dev_info);
+
+	devices = realloc(devices, size);
+	if (!devices) {
+		fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
+		return 1;
+	}
+	memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
+	devices[ndevices].name = name;
+	ndevices++;
+	return 0;
+}
+
+static struct per_dev_info *get_dev_info(dev_t dev)
+{
+	struct per_dev_info *pdi;
+	int i;
+
+	for (i = 0; i < ndevices; i++) {
+		if (!devices[i].dev)
+			devices[i].dev = dev;
+		if (devices[i].dev == dev)
+			return &devices[i];
+	}
+
+	if (resize_devices(NULL))
+		return NULL;
+
+	pdi = &devices[ndevices - 1];
+	pdi->dev = dev;
+	pdi->first_reported_time = 0;
+	pdi->last_read_time = 0;
+	pdi->skips_head = pdi->skips_tail = NULL;
+
+	return pdi;
+}
+
 static void insert_skip(struct per_dev_info *pdi, unsigned long start,
 			unsigned long end)
 {
@@ -465,6 +549,80 @@ static struct per_process_info *find_process(__u32 pid, char *name)
 	return ppi;
 }
 
+/*
+ * struct trace and blktrace allocation cache, we do potentially
+ * millions of mallocs for these structures while only using at most
+ * a few thousand at the time
+ */
+static inline void t_free(struct trace *t)
+{
+	if (t_alloc_cache < 1024) {
+		t->next = t_alloc_list;
+		t_alloc_list = t;
+		t_alloc_cache++;
+	} else
+		free(t);
+}
+
+static inline struct trace *t_alloc(void)
+{
+	struct trace *t = t_alloc_list;
+
+	if (t) {
+		t_alloc_list = t->next;
+		t_alloc_cache--;
+		return t;
+	}
+
+	return malloc(sizeof(*t));
+}
+
+static inline void bit_free(struct blk_io_trace *bit)
+{
+	if (bit_alloc_cache < 1024 && !bit->pdu_len) {
+		/*
+		 * abuse a 64-bit field for a next pointer for the free item
+		 */
+		bit->time = (__u64) (unsigned long) bit_alloc_list;
+		bit_alloc_list = (struct blk_io_trace *) bit;
+		bit_alloc_cache++;
+	} else
+		free(bit);
+}
+
+static inline struct blk_io_trace *bit_alloc(void)
+{
+	struct blk_io_trace *bit = bit_alloc_list;
+
+	if (bit) {
+		bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
+				 bit->time;
+		bit_alloc_cache--;
+		return bit;
+	}
+
+	return malloc(sizeof(*bit));
+}
+
+static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
+{
+	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);
+
+	rb_erase(&t->rb_node, &pci->rb_last);
+	pci->rb_last_entries--;
+
+	bit_free(t->bit);
+	t_free(t);
+}
+
+static void put_trace(struct per_dev_info *pdi, struct trace *t)
+{
+	rb_erase(&t->rb_node, &rb_sort_root);
+	rb_sort_entries--;
+
+	trace_rb_insert_last(pdi, t);
+}
+
 static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
 {
 	struct rb_node **p = &root->rb_node;
@@ -505,14 +663,23 @@ static inline int trace_rb_insert_sort(struct trace *t)
 	return 1;
 }
 
-static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
+static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
 {
-	if (!trace_rb_insert(t, &pdi->rb_last)) {
-		pdi->rb_last_entries++;
-		return 0;
+	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);
+
+	if (trace_rb_insert(t, &pci->rb_last))
+		return 1;
+
+	pci->rb_last_entries++;
+
+	if (pci->rb_last_entries > rb_batch * pdi->nfiles) {
+		struct rb_node *n = rb_first(&pci->rb_last);
+
+		t = rb_entry(n, struct trace, rb_node);
+		__put_trace_last(pdi, t);
 	}
 
-	return 1;
+	return 0;
 }
 
 static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
@@ -562,15 +729,11 @@ static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
 	return NULL;
 }
 
-static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
-{
-	return trace_rb_find(dev, seq, &rb_sort_root, 1);
-}
-
 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
+					       struct per_cpu_info *pci,
 					       unsigned long seq)
 {
-	return trace_rb_find(pdi->dev, seq, &pdi->rb_last, 0);
+	return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0);
 }
 
 static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
@@ -798,91 +961,6 @@ static struct io_stats *find_process_io_stats(__u32 pid, char *name)
 	return &ppi->io_stats;
 }
 
-static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
-{
-	struct per_cpu_info *cpus = pdi->cpus;
-	int ncpus = pdi->ncpus;
-	int new_count = cpu + 1;
-	int new_space, size;
-	char *new_start;
-
-	size = new_count * sizeof(struct per_cpu_info);
-	cpus = realloc(cpus, size);
-	if (!cpus) {
-		char name[20];
-		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
-			get_dev_name(pdi, name, sizeof(name)), size);
-		exit(1);
-	}
-
-	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
-	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
-	memset(new_start, 0, new_space);
-
-	pdi->ncpus = new_count;
-	pdi->cpus = cpus;
-
-	for (new_count = 0; new_count < pdi->ncpus; new_count++)
-		if (!pdi->cpus[new_count].fd)
-			pdi->cpus[new_count].fd = -1;
-}
-
-static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
-{
-	struct per_cpu_info *pci;
-
-	if (cpu >= pdi->ncpus)
-		resize_cpu_info(pdi, cpu);
-
-	pci = &pdi->cpus[cpu];
-	pci->cpu = cpu;
-	return pci;
-}
-
-
-static int resize_devices(char *name)
-{
-	int size = (ndevices + 1) * sizeof(struct per_dev_info);
-
-	devices = realloc(devices, size);
-	if (!devices) {
-		fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
-		return 1;
-	}
-	memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
-	devices[ndevices].name = name;
-	ndevices++;
-	return 0;
-}
-
-static struct per_dev_info *get_dev_info(dev_t dev)
-{
-	struct per_dev_info *pdi;
-	int i;
-
-	for (i = 0; i < ndevices; i++) {
-		if (!devices[i].dev)
-			devices[i].dev = dev;
-		if (devices[i].dev == dev)
-			return &devices[i];
-	}
-
-	if (resize_devices(NULL))
-		return NULL;
-
-	pdi = &devices[ndevices - 1];
-	pdi->dev = dev;
-	pdi->first_reported_time = 0;
-	pdi->last_sequence = -1;
-	pdi->last_read_time = 0;
-	memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
-	pdi->rb_last_entries = 0;
-
-	pdi->skips_head = pdi->skips_tail = NULL;
-
-	return pdi;
-}
-
 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
 {
 	if (pdi->name)
@@ -1426,61 +1504,6 @@ static void show_device_and_cpu_stats(void)
 	}
 }
 
-/*
- * struct trace and blktrace allocation cache, we do potentially
- * millions of mallocs for these structures while only using at most
- * a few thousand at the time
- */
-static inline void t_free(struct trace *t)
-{
-	if (t_alloc_cache < 1024) {
-		t->next = t_alloc_list;
-		t_alloc_list = t;
-		t_alloc_cache++;
-	} else
-		free(t);
-}
-
-static inline struct trace *t_alloc(void)
-{
-	struct trace *t = t_alloc_list;
-
-	if (t) {
-		t_alloc_list = t->next;
-		t_alloc_cache--;
-		return t;
-	}
-
-	return malloc(sizeof(*t));
-}
-
-static inline void bit_free(struct blk_io_trace *bit)
-{
-	if (bit_alloc_cache < 1024 && !bit->pdu_len) {
-		/*
-		 * abuse a 64-bit field for a next pointer for the free item
-		 */
-		bit->time = (__u64) (unsigned long) bit_alloc_list;
-		bit_alloc_list = (struct blk_io_trace *) bit;
-		bit_alloc_cache++;
-	} else
-		free(bit);
-}
-
-static inline struct blk_io_trace *bit_alloc(void)
-{
-	struct blk_io_trace *bit = bit_alloc_list;
-
-	if (bit) {
-		bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
-				 bit->time;
-		bit_alloc_cache--;
-		return bit;
-	}
-
-	return malloc(sizeof(*bit));
-}
-
 static void find_genesis(void)
 {
 	struct trace *t = trace_list;
@@ -1508,6 +1531,8 @@ static inline int check_stopwatch(struct blk_io_trace *bit)
  */
 static int sort_entries(unsigned long long *youngest)
 {
+	struct per_dev_info *pdi = NULL;
+	struct per_cpu_info *pci = NULL;
 	struct trace *t;
 
 	if (!genesis_time)
@@ -1524,8 +1549,16 @@ static int sort_entries(unsigned long long *youngest)
 		if (bit->time < *youngest || !*youngest)
 			*youngest = bit->time;
 
-		if (bit->sequence < smallest_seq_read)
-			smallest_seq_read = bit->sequence;
+		if (!pdi || pdi->dev != bit->device) {
+			pdi = get_dev_info(bit->device);
+			pci = NULL;
+		}
+
+		if (!pci || pci->cpu != bit->cpu)
+			pci = get_cpu_info(pdi, bit->cpu);
+
+		if (bit->sequence < pci->smallest_seq_read)
+			pci->smallest_seq_read = bit->sequence;
 
 		if (check_stopwatch(bit)) {
 			bit_free(bit);
@@ -1540,30 +1573,6 @@ static int sort_entries(unsigned long long *youngest)
 	return 0;
 }
 
-static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
-{
-	rb_erase(&t->rb_node, &pdi->rb_last);
-	pdi->rb_last_entries--;
-
-	bit_free(t->bit);
-	t_free(t);
-}
-
-static void put_trace(struct per_dev_info *pdi, struct trace *t)
-{
-	rb_erase(&t->rb_node, &rb_sort_root);
-	rb_sort_entries--;
-
-	trace_rb_insert_last(pdi, t);
-
-	if (pdi->rb_last_entries > rb_batch * pdi->nfiles) {
-		struct rb_node *n = rb_first(&pdi->rb_last);
-
-		t = rb_entry(n, struct trace, rb_node);
-		__put_trace_last(pdi, t);
-	}
-}
-
 /*
  * to continue, we must have traces from all online cpus in the tree
  */
@@ -1606,17 +1615,21 @@ static int check_cpu_map(struct per_dev_info *pdi)
 
 static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
 {
-	unsigned long expected_sequence = pdi->last_sequence + 1;
 	struct blk_io_trace *bit = t->bit;
+	unsigned long expected_sequence;
+	struct per_cpu_info *pci;
 	struct trace *__t;
 
+	pci = get_cpu_info(pdi, bit->cpu);
+	expected_sequence = pci->last_sequence + 1;
+
 	if (!expected_sequence) {
 		/*
 		 * 1 should be the first entry, just allow it
 		 */
 		if (bit->sequence == 1)
 			return 0;
-		if (bit->sequence == smallest_seq_read)
+		if (bit->sequence == pci->smallest_seq_read)
 			return 0;
 
 		return check_cpu_map(pdi);
@@ -1629,8 +1642,8 @@ static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
 	 * we may not have seen that sequence yet. if we are not doing
 	 * the final run, break and wait for more entries.
 	 */
-	if (expected_sequence < smallest_seq_read) {
-		__t = trace_rb_find_last(pdi, expected_sequence);
+	if (expected_sequence < pci->smallest_seq_read) {
+		__t = trace_rb_find_last(pdi, pci, expected_sequence);
 		if (!__t)
 			goto skip;
 
@@ -1664,8 +1677,10 @@ static void show_entries_rb(int force)
 		t = rb_entry(n, struct trace, rb_node);
 		bit = t->bit;
 
-		if (!pdi || pdi->dev != bit->device)
+		if (!pdi || pdi->dev != bit->device) {
 			pdi = get_dev_info(bit->device);
+			pci = NULL;
+		}
 
 		if (!pdi) {
 			fprintf(stderr, "Unknown device ID? (%d,%d)\n",
@@ -1679,13 +1694,13 @@
 		if (!force && bit->time > last_allowed_time)
 			break;
 
-		pdi->last_sequence = bit->sequence;
-
 		check_time(pdi, bit);
 
 		if (!pci || pci->cpu != bit->cpu)
 			pci = get_cpu_info(pdi, bit->cpu);
 
+		pci->last_sequence = bit->sequence;
+
 		pci->nelems++;
 
 		if (bit->action & (act_mask << BLK_TC_SHIFT))
@@ -1822,7 +1837,6 @@ static int do_file(void)
 	for (i = 0; i < ndevices; i++) {
 		pdi = &devices[i];
 		pdi->nfiles = 0;
-		pdi->last_sequence = -1;
 
 		for (j = 0;; j++) {
 			struct stat st;
@@ -1872,7 +1886,6 @@ static int do_file(void)
 
 		events_added = 0;
 		last_allowed_time = -1ULL;
-		smallest_seq_read = -1U;
 
 		for (i = 0; i < ndevices; i++) {
 			pdi = &devices[i];
@@ -1884,6 +1897,8 @@ static int do_file(void)
 				if (pci->fd == -1)
 					continue;
 
+				pci->smallest_seq_read = -1;
+
 				events = read_events(pci->fd, 1, &pci->fdblock);
 				if (events <= 0) {
 					cpu_mark_offline(pdi, pci->cpu);
@@ -1930,7 +1945,9 @@ static int do_stdin(void)
 
 	fdblock = -1;
 	while ((events = read_events(fd, 0, &fdblock)) > 0) {
+#if 0
 		smallest_seq_read = -1U;
+#endif
 
 		if (sort_entries(&youngest))
 			break;
diff --git a/blktrace.h b/blktrace.h
--- a/blktrace.h
+++ b/blktrace.h
@@ -6,6 +6,7 @@
 #include <endian.h>
 
 #include "blktrace_api.h"
+#include "rbtree.h"
 
 #define MINORBITS	20
 #define MINORMASK	((1U << MINORBITS) - 1)
@@ -41,12 +42,17 @@ struct per_cpu_info {
 	char fname[128];
 
 	struct io_stats io_stats;
+
+	struct rb_root rb_last;
+	unsigned long rb_last_entries;
+	unsigned long last_sequence;
+	unsigned long smallest_seq_read;
 };
 
 extern FILE *ofp;
 
 #define CHECK_MAGIC(t)		(((t)->magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
-#define SUPPORTED_VERSION	(0x05)
+#define SUPPORTED_VERSION	(0x06)
 
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 #define be16_to_cpu(x)		__bswap_16(x)
diff --git a/blktrace_api.h b/blktrace_api.h
index 7a75f2d..7cc3cf5 100644
--- a/blktrace_api.h
+++ b/blktrace_api.h
@@ -65,7 +65,7 @@ enum {
 #define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
 
 #define BLK_IO_TRACE_MAGIC	0x65617400
-#define BLK_IO_TRACE_VERSION	0x05
+#define BLK_IO_TRACE_VERSION	0x06
 
 /*
  * The trace itself
diff --git a/kernel/blk-trace-2.6.16-rc1-git-T0 b/kernel/blk-trace-2.6.16-rc1-git-U0
index 827b843..677852b 100644
--- a/kernel/blk-trace-2.6.16-rc1-git-T0
+++ b/kernel/blk-trace-2.6.16-rc1-git-U0
@@ -30,10 +30,10 @@ index 7e4f93e..c05de0e 100644
 +obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 diff --git a/block/blktrace.c b/block/blktrace.c
 new file mode 100644
-index 0000000..d12d166
+index 0000000..21b381d
 --- /dev/null
 +++ b/block/blktrace.c
-@@ -0,0 +1,360 @@
+@@ -0,0 +1,362 @@
 +#include <linux/config.h>
 +#include <linux/kernel.h>
 +#include <linux/blkdev.h>
@@ -48,9 +48,9 @@ index 0000000..d12d166
 +void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 +		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 +{
-+	struct blk_io_trace t;
++	struct blk_io_trace *t;
 +	unsigned long flags;
-+	void *rbuf;
++	unsigned long *sequence;
 +	pid_t pid;
 +	int cpu;
 +
@@ -73,38 +73,35 @@ index 0000000..d12d166
 +	if (bt->pid && pid != bt->pid)
 +		return;
 +
-+	t.magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
++	local_irq_save(flags);
 +
-+	t.device = bt->dev;
-+	t.sector = sector;
-+	t.bytes = bytes;
-+	t.action = what;
-+	t.error = error;
-+	t.pdu_len = pdu_len;
++	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
++	if (unlikely(!t)) {
++		local_irq_restore(flags);
++		return;
++	}
 +
-+	t.pid = pid;
-+	memcpy(t.comm, current->comm, sizeof(t.comm));
++	cpu = smp_processor_id();
++	sequence = per_cpu_ptr(bt->sequence, cpu);
++	t->sequence = ++(*sequence);
++	t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
++	t->cpu = cpu;
 +
-+	/*
-+	 * need to serialize this part completely to prevent multiple CPUs
-+	 * from misordering events
-+	 */
-+	spin_lock_irqsave(&bt->lock, flags);
++	local_irq_restore(flags);
 +
-+	t.sequence = ++bt->sequence;
++	t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
++	t->device = bt->dev;
++	t->sector = sector;
++	t->bytes = bytes;
++	t->action = what;
++	t->error = error;
++	t->pdu_len = pdu_len;
 +
-+	cpu = smp_processor_id();
-+	t.cpu = cpu;
-+	t.time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
-+
-+	rbuf = relay_reserve(bt->rchan, sizeof(t) + pdu_len);
-+	if (rbuf) {
-+		memcpy(rbuf, &t, sizeof(t));
-+		if (pdu_len)
-+			memcpy(rbuf + sizeof(t), pdu_data, pdu_len);
-+	}
++	t->pid = pid;
++	memcpy(t->comm, current->comm, sizeof(t->comm));
 +
-+	spin_unlock_irqrestore(&bt->lock, flags);
++	if (pdu_len)
++		memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 +}
 +
 +EXPORT_SYMBOL_GPL(__blk_add_trace);
@@ -260,10 +257,14 @@ index 0000000..21b381d
 +		goto err;
 +
 +	ret = -ENOMEM;
-+	bt = kmalloc(sizeof(*bt), GFP_KERNEL);
++	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 +	if (!bt)
 +		goto err;
 +
++	bt->sequence = alloc_percpu(unsigned long);
++	if (!bt->sequence)
++		goto err;
++
 +	ret = -ENOENT;
 +	dir = blk_create_tree(buts.name);
 +	if (!dir)
@@ -271,8 +272,6 @@ index 0000000..21b381d
 +
 +	bt->dir = dir;
 +	bt->dev = bdev->bd_dev;
-+	bt->sequence = 0;
-+	spin_lock_init(&bt->lock);
 +	atomic_set(&bt->dropped, 0);
 +
 +	ret = -EIO;
@@ -305,8 +304,11 @@ index 0000000..21b381d
 +		relayfs_remove_file(bt->dropped_file);
 +	if (dir)
 +		blk_remove_tree(dir);
-+	if (bt)
++	if (bt) {
++		if (bt->sequence)
++			free_percpu(bt->sequence);
 +		kfree(bt);
++	}
 +	return ret;
 +}
 +
@@ -727,10 +729,10 @@ index 02a585f..195c3b9 100644
  */
 diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
 new file mode 100644
-index 0000000..d6b4317
+index 0000000..026b995
 --- /dev/null
 +++ b/include/linux/blktrace_api.h
-@@ -0,0 +1,216 @@
+@@ -0,0 +1,215 @@
 +#ifndef BLKTRACE_H
 +#define BLKTRACE_H
 +
@@ -800,7 +802,7 @@ index 0000000..d6b4317
 +#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
 +
 +#define BLK_IO_TRACE_MAGIC	0x65617400
-+#define BLK_IO_TRACE_VERSION	0x05
++#define BLK_IO_TRACE_VERSION	0x06
 +
 +/*
 + * The trace itself
@@ -834,8 +836,7 @@ index 0000000..d6b4317
 +	struct rchan *rchan;
 +	struct dentry *dropped_file;
 +	atomic_t dropped;
-+	spinlock_t lock;
-+	unsigned long sequence;
++	unsigned long *sequence;
 +	u32 dev;
 +	u16 act_mask;
 +	u64 start_lba;