[PATCH] blktrace: need to free ts->buf for networked transfer
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <sys/poll.h>
33 #include <sys/mman.h>
34 #include <sys/socket.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <sched.h>
38 #include <ctype.h>
39 #include <getopt.h>
40 #include <errno.h>
41 #include <netinet/in.h>
42 #include <arpa/inet.h>
43 #include <netdb.h>
44 #include <sys/sendfile.h>
45
46 #include "blktrace.h"
47 #include "barrier.h"
48
49 static char blktrace_version[] = "0.99";
50
51 /*
52  * You may want to increase this even more, if you are logging at a high
53  * rate and see skipped/missed events
54  */
55 #define BUF_SIZE        (512 * 1024)
56 #define BUF_NR          (4)
57
58 #define OFILE_BUF       (128 * 1024)
59
60 #define RELAYFS_TYPE    0xF0B4A981
61
62 #define S_OPTS  "d:a:A:r:o:kw:Vb:n:D:lh:p:"
/*
 * Long option table. Every .val is one of the short option letters
 * listed in S_OPTS; the entry with a NULL name terminates the table.
 */
static struct option l_opts[] = {
        { .name = "dev",             .has_arg = required_argument, .flag = NULL, .val = 'd' },
        { .name = "act-mask",        .has_arg = required_argument, .flag = NULL, .val = 'a' },
        { .name = "set-mask",        .has_arg = required_argument, .flag = NULL, .val = 'A' },
        { .name = "relay",           .has_arg = required_argument, .flag = NULL, .val = 'r' },
        { .name = "output",          .has_arg = required_argument, .flag = NULL, .val = 'o' },
        { .name = "kill",            .has_arg = no_argument,       .flag = NULL, .val = 'k' },
        { .name = "stopwatch",       .has_arg = required_argument, .flag = NULL, .val = 'w' },
        { .name = "version",         .has_arg = no_argument,       .flag = NULL, .val = 'V' },
        { .name = "buffer-size",     .has_arg = required_argument, .flag = NULL, .val = 'b' },
        { .name = "num-sub-buffers", .has_arg = required_argument, .flag = NULL, .val = 'n' },
        { .name = "output-dir",      .has_arg = required_argument, .flag = NULL, .val = 'D' },
        { .name = "listen",          .has_arg = no_argument,       .flag = NULL, .val = 'l' },
        { .name = "host",            .has_arg = required_argument, .flag = NULL, .val = 'h' },
        { .name = "port",            .has_arg = required_argument, .flag = NULL, .val = 'p' },
        { .name = NULL, }
};
152
/*
 * One chunk of trace data handed from a reader thread to the writer.
 */
struct tip_subbuf {
        void *buf;                      /* heap storage holding the data */
        unsigned int len, max_len;      /* bytes in use / bytes allocated */
};
158
/*
 * FIFO_SIZE has to stay a power of two: the queue/dequeue helpers wrap
 * their indices with "& (FIFO_SIZE - 1)".
 */
#define FIFO_SIZE       (1024)  /* number of slots in the ring */
#define CL_SIZE         (128)   /* alignment used to keep head and tail apart */

/*
 * Ring of sub-buffer pointers. Entries are stored at 'tail' and taken
 * from 'head'; each index is aligned to CL_SIZE.
 */
struct tip_subbuf_fifo {
        int tail __attribute__((aligned(CL_SIZE)));
        int head __attribute__((aligned(CL_SIZE)));
        struct tip_subbuf *q[FIFO_SIZE];
};
167
168 struct thread_information {
169         int cpu;
170         pthread_t thread;
171
172         int fd;
173         void *fd_buf;
174         char fn[MAXPATHLEN + 64];
175
176         FILE *ofile;
177         char *ofile_buffer;
178         int ofile_stdout;
179         int ofile_mmap;
180
181         unsigned long events_processed;
182         unsigned long long data_read;
183         struct device_information *device;
184
185         int exited;
186
187         /*
188          * piped fifo buffers
189          */
190         struct tip_subbuf_fifo fifo;
191         struct tip_subbuf *leftover_ts;
192
193         /*
194          * mmap controlled output files
195          */
196         unsigned long long fs_size;
197         unsigned long long fs_max_size;
198         unsigned long fs_off;
199         void *fs_buf;
200         unsigned long fs_buf_len;
201 };
202
/*
 * State kept for every traced block device.
 */
struct device_information {
        int fd;                         /* descriptor of the opened device node */
        char *path;                     /* device path */
        char buts_name[32];             /* trace name copied from the setup ioctl result */
        volatile int trace_started;     /* non-zero while tracing is active */
        unsigned long drop_count;       /* dropped-event count recorded when tracing stops */
        struct thread_information *threads;     /* ncpus worker entries for this device */
};
211
212 static int ncpus;
213 static struct thread_information *thread_information;
214 static int ndevs;
215 static struct device_information *device_information;
216
217 /* command line option globals */
218 static char *relay_path;
219 static char *output_name;
220 static char *output_dir;
221 static int act_mask = ~0U;
222 static int kill_running_trace;
223 static unsigned long buf_size = BUF_SIZE;
224 static unsigned long buf_nr = BUF_NR;
225 static unsigned int page_size;
226
227 #define is_done()       (*(volatile int *)(&done))
228 static volatile int done;
229
230 #define is_trace_stopped()      (*(volatile int *)(&trace_stopped))
231 static volatile int trace_stopped;
232
233 #define is_stat_shown() (*(volatile int *)(&stat_shown))
234 static volatile int stat_shown;
235
236 int data_is_native = -1;
237
238 static void exit_trace(int status);
239
240 #define dip_tracing(dip)        (*(volatile int *)(&(dip)->trace_started))
241 #define dip_set_tracing(dip, v) ((dip)->trace_started = (v))
242
243 #define __for_each_dip(__d, __i, __e)   \
244         for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
245
246 #define for_each_dip(__d, __i)  __for_each_dip(__d, __i, ndevs)
247 #define for_each_tip(__d, __t, __j)     \
248         for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
249
250 /*
251  * networking stuff follows. we include a magic number so we know whether
252  * to endianness convert or not
253  */
254 struct blktrace_net_hdr {
255         u32 magic;              /* same as trace magic */
256         char buts_name[32];     /* trace name */
257         u32 cpu;                /* for which cpu */
258         u32 max_cpus;
259         u32 len;                /* length of following trace data */
260 };
261
262 #define TRACE_NET_PORT          (8462)
263
264 enum {
265         Net_none = 0,
266         Net_server,
267         Net_client,
268 };
269
270 /*
271  * network cmd line params
272  */
273 static char hostname[MAXHOSTNAMELEN];
274 static int net_port = TRACE_NET_PORT;
275 static int net_mode = 0;
276
277 static int net_in_fd = -1;
278 static int net_out_fd = -1;
279
280 static void handle_sigint(__attribute__((__unused__)) int sig)
281 {
282         done = 1;
283 }
284
285 static int get_dropped_count(const char *buts_name)
286 {
287         int fd;
288         char tmp[MAXPATHLEN + 64];
289
290         snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
291                  relay_path, buts_name);
292
293         fd = open(tmp, O_RDONLY);
294         if (fd < 0) {
295                 /*
296                  * this may be ok, if the kernel doesn't support dropped counts
297                  */
298                 if (errno == ENOENT)
299                         return 0;
300
301                 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
302                 return -1;
303         }
304
305         if (read(fd, tmp, sizeof(tmp)) < 0) {
306                 perror(tmp);
307                 close(fd);
308                 return -1;
309         }
310
311         close(fd);
312
313         return atoi(tmp);
314 }
315
316 static int start_trace(struct device_information *dip)
317 {
318         struct blk_user_trace_setup buts;
319
320         memset(&buts, 0, sizeof(buts));
321         buts.buf_size = buf_size;
322         buts.buf_nr = buf_nr;
323         buts.act_mask = act_mask;
324
325         if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
326                 perror("BLKTRACESETUP");
327                 return 1;
328         }
329
330         if (ioctl(dip->fd, BLKTRACESTART) < 0) {
331                 perror("BLKTRACESTART");
332                 return 1;
333         }
334
335         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
336         dip_set_tracing(dip, 1);
337         return 0;
338 }
339
340 static void stop_trace(struct device_information *dip)
341 {
342         if (dip_tracing(dip) || kill_running_trace) {
343                 dip_set_tracing(dip, 0);
344
345                 if (ioctl(dip->fd, BLKTRACESTOP) < 0)
346                         perror("BLKTRACESTOP");
347                 if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
348                         perror("BLKTRACETEARDOWN");
349
350                 close(dip->fd);
351                 dip->fd = -1;
352         }
353 }
354
355 static void stop_all_traces(void)
356 {
357         struct device_information *dip;
358         int i;
359
360         for_each_dip(dip, i) {
361                 dip->drop_count = get_dropped_count(dip->buts_name);
362                 stop_trace(dip);
363         }
364 }
365
366 static void wait_for_data(struct thread_information *tip)
367 {
368         struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
369
370         do {
371                 poll(&pfd, 1, 100);
372                 if (pfd.revents & POLLIN)
373                         break;
374                 if (tip->ofile_stdout)
375                         break;
376         } while (!is_done());
377 }
378
379 static int read_data_file(struct thread_information *tip, void *buf, int len)
380 {
381         int ret = 0;
382
383         do {
384                 wait_for_data(tip);
385
386                 ret = read(tip->fd, buf, len);
387                 if (!ret)
388                         continue;
389                 else if (ret > 0)
390                         return ret;
391                 else {
392                         if (errno != EAGAIN) {
393                                 perror(tip->fn);
394                                 fprintf(stderr,"Thread %d failed read of %s\n",
395                                         tip->cpu, tip->fn);
396                                 break;
397                         }
398                         continue;
399                 }
400         } while (!is_done());
401
402         return ret;
403
404 }
405
406 static int read_data_net(struct thread_information *tip, void *buf, int len)
407 {
408         unsigned int bytes_left = len;
409         int ret = 0;
410
411         do {
412                 ret = recv(net_in_fd, buf, bytes_left, MSG_WAITALL);
413
414                 if (!ret)
415                         continue;
416                 else if (ret < 0) {
417                         if (errno != EAGAIN) {
418                                 perror(tip->fn);
419                                 fprintf(stderr, "server: failed read\n");
420                                 return 0;
421                         }
422                         continue;
423                 } else {
424                         buf += ret;
425                         bytes_left -= ret;
426                 }
427         } while (!is_done() && bytes_left);
428
429         return len;
430 }
431
432 static int read_data(struct thread_information *tip, void *buf, int len)
433 {
434         if (net_mode == Net_server)
435                 return read_data_net(tip, buf, len);
436
437         return read_data_file(tip, buf, len);
438 }
439
440 static inline struct tip_subbuf *
441 subbuf_fifo_dequeue(struct thread_information *tip)
442 {
443         const int head = tip->fifo.head;
444         const int next = (head + 1) & (FIFO_SIZE - 1);
445
446         if (head != tip->fifo.tail) {
447                 struct tip_subbuf *ts = tip->fifo.q[head];
448
449                 store_barrier();
450                 tip->fifo.head = next;
451                 return ts;
452         }
453
454         return NULL;
455 }
456
457 static inline int subbuf_fifo_queue(struct thread_information *tip,
458                                     struct tip_subbuf *ts)
459 {
460         const int tail = tip->fifo.tail;
461         const int next = (tail + 1) & (FIFO_SIZE - 1);
462
463         if (next != tip->fifo.head) {
464                 tip->fifo.q[tail] = ts;
465                 store_barrier();
466                 tip->fifo.tail = next;
467                 return 0;
468         }
469
470         fprintf(stderr, "fifo too small!\n");
471         return 1;
472 }
473
474 /*
475  * For file output, truncate and mmap the file appropriately
476  */
477 static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
478 {
479         int ofd = fileno(tip->ofile);
480         int ret;
481
482         /*
483          * extend file, if we have to. use chunks of 16 subbuffers.
484          */
485         if (tip->fs_off + buf_size > tip->fs_buf_len) {
486                 if (tip->fs_buf) {
487                         munlock(tip->fs_buf, tip->fs_buf_len);
488                         munmap(tip->fs_buf, tip->fs_buf_len);
489                         tip->fs_buf = NULL;
490                 }
491
492                 tip->fs_off = tip->fs_size & (page_size - 1);
493                 tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
494                 tip->fs_max_size += tip->fs_buf_len;
495
496                 if (ftruncate(ofd, tip->fs_max_size) < 0) {
497                         perror("ftruncate");
498                         return -1;
499                 }
500
501                 tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
502                                    MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
503                 if (tip->fs_buf == MAP_FAILED) {
504                         perror("mmap");
505                         return -1;
506                 }
507                 mlock(tip->fs_buf, tip->fs_buf_len);
508         }
509
510         ret = read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
511         if (ret >= 0) {
512                 tip->data_read += ret;
513                 tip->fs_size += ret;
514                 tip->fs_off += ret;
515                 return 0;
516         }
517
518         return -1;
519 }
520
521 /*
522  * Use the copy approach for pipes and network
523  */
524 static int get_subbuf(struct thread_information *tip)
525 {
526         struct tip_subbuf *ts = malloc(sizeof(*ts));
527         int ret;
528
529         ts->buf = malloc(buf_size);
530         ts->max_len = buf_size;
531
532         ret = read_data(tip, ts->buf, ts->max_len);
533         if (ret > 0) {
534                 ts->len = ret;
535                 return subbuf_fifo_queue(tip, ts);
536         }
537
538         return ret;
539 }
540
541 static void close_thread(struct thread_information *tip)
542 {
543         if (tip->fd != -1)
544                 close(tip->fd);
545         if (tip->ofile)
546                 fclose(tip->ofile);
547         if (tip->ofile_buffer)
548                 free(tip->ofile_buffer);
549         if (tip->fd_buf)
550                 free(tip->fd_buf);
551
552         tip->fd = -1;
553         tip->ofile = NULL;
554         tip->ofile_buffer = NULL;
555         tip->fd_buf = NULL;
556 }
557
558 static void tip_ftrunc_final(struct thread_information *tip)
559 {
560         /*
561          * truncate to right size and cleanup mmap
562          */
563         if (tip->ofile_mmap) {
564                 int ofd = fileno(tip->ofile);
565
566                 if (tip->fs_buf)
567                         munmap(tip->fs_buf, tip->fs_buf_len);
568
569                 ftruncate(ofd, tip->fs_size);
570         }
571 }
572
573 static void *thread_main(void *arg)
574 {
575         struct thread_information *tip = arg;
576         pid_t pid = getpid();
577         cpu_set_t cpu_mask;
578
579         CPU_ZERO(&cpu_mask);
580         CPU_SET((tip->cpu), &cpu_mask);
581
582         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
583                 perror("sched_setaffinity");
584                 exit_trace(1);
585         }
586
587         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
588                         relay_path, tip->device->buts_name, tip->cpu);
589         tip->fd = open(tip->fn, O_RDONLY);
590         if (tip->fd < 0) {
591                 perror(tip->fn);
592                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
593                         tip->fn);
594                 exit_trace(1);
595         }
596
597         while (!is_done()) {
598                 if (tip->ofile_mmap && net_mode != Net_client) {
599                         if (mmap_subbuf(tip, buf_size))
600                                 break;
601                 } else {
602                         if (get_subbuf(tip))
603                                 break;
604                 }
605         }
606
607         tip_ftrunc_final(tip);
608         tip->exited = 1;
609         return NULL;
610 }
611
/*
 * Push all 'buf_len' bytes of 'buf' out through socket 'fd', calling
 * send() repeatedly until nothing is left.
 *
 * Returns 0 once everything was sent, 1 if send() fails.
 */
static int write_data_net(int fd, void *buf, unsigned int buf_len)
{
        const char *cursor = buf;
        unsigned int remaining;

        for (remaining = buf_len; remaining; ) {
                int sent = send(fd, cursor, remaining, 0);

                if (sent < 0) {
                        perror("send");
                        return 1;
                }

                cursor += sent;
                remaining -= sent;
        }

        return 0;
}
630
631 static int flush_subbuf_net(struct thread_information *tip,
632                             struct tip_subbuf *ts)
633 {
634         struct blktrace_net_hdr hdr;
635
636         hdr.magic = BLK_IO_TRACE_MAGIC;
637         strcpy(hdr.buts_name, tip->device->buts_name);
638         hdr.cpu = tip->cpu;
639         hdr.max_cpus = ncpus;
640         hdr.len = ts->len;
641
642         if (write_data_net(net_out_fd, &hdr, sizeof(hdr)))
643                 return 1;
644
645         if (write_data_net(net_out_fd, ts->buf, ts->len))
646                 return 1;
647
648         free(ts->buf);
649         free(ts);
650         return 0;
651 }
652
653 static int write_data(struct thread_information *tip, void *buf,
654                       unsigned int buf_len)
655 {
656         int ret;
657
658         if (!buf_len)
659                 return 0;
660
661         while (1) {
662                 ret = fwrite(buf, buf_len, 1, tip->ofile);
663                 if (ret == 1)
664                         break;
665
666                 if (ret < 0) {
667                         perror("write");
668                         return 1;
669                 }
670         }
671
672         if (tip->ofile_stdout)
673                 fflush(tip->ofile);
674
675         return 0;
676 }
677
678 static int flush_subbuf_file(struct thread_information *tip,
679                              struct tip_subbuf *ts)
680 {
681         unsigned int offset = 0;
682         struct blk_io_trace *t;
683         int pdu_len, events = 0;
684
685         /*
686          * surplus from last run
687          */
688         if (tip->leftover_ts) {
689                 struct tip_subbuf *prev_ts = tip->leftover_ts;
690
691                 if (prev_ts->len + ts->len > prev_ts->max_len) {
692                         prev_ts->max_len += ts->len;
693                         prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
694                 }
695
696                 memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
697                 prev_ts->len += ts->len;
698
699                 free(ts->buf);
700                 free(ts);
701
702                 ts = prev_ts;
703                 tip->leftover_ts = NULL;
704         }
705
706         while (offset + sizeof(*t) <= ts->len) {
707                 t = ts->buf + offset;
708
709                 if (verify_trace(t)) {
710                         write_data(tip, ts->buf, offset);
711                         return -1;
712                 }
713
714                 pdu_len = t->pdu_len;
715
716                 if (offset + sizeof(*t) + pdu_len > ts->len)
717                         break;
718
719                 offset += sizeof(*t) + pdu_len;
720                 tip->events_processed++;
721                 tip->data_read += sizeof(*t) + pdu_len;
722                 events++;
723         }
724
725         if (write_data(tip, ts->buf, offset))
726                 return -1;
727
728         /*
729          * leftover bytes, save them for next time
730          */
731         if (offset != ts->len) {
732                 tip->leftover_ts = ts;
733                 ts->len -= offset;
734                 memmove(ts->buf, ts->buf + offset, ts->len);
735         } else {
736                 free(ts->buf);
737                 free(ts);
738         }
739
740         return events;
741 }
742
743 static int write_tip_events(struct thread_information *tip)
744 {
745         struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
746
747         if (ts) {
748                 if (net_mode == Net_client)
749                         return flush_subbuf_net(tip, ts);
750
751                 return flush_subbuf_file(tip, ts);
752         }
753
754         return 0;
755 }
756
757 /*
758  * scans the tips we know and writes out the subbuffers we accumulate
759  */
760 static void get_and_write_events(void)
761 {
762         struct device_information *dip;
763         struct thread_information *tip;
764         int i, j, events, ret, tips_running;
765
766         while (!is_done()) {
767                 events = 0;
768
769                 for_each_dip(dip, i) {
770                         for_each_tip(dip, tip, j) {
771                                 ret = write_tip_events(tip);
772                                 if (ret > 0)
773                                         events += ret;
774                         }
775                 }
776
777                 if (!events)
778                         usleep(10);
779         }
780
781         /*
782          * reap stored events
783          */
784         do {
785                 events = 0;
786                 tips_running = 0;
787                 for_each_dip(dip, i) {
788                         for_each_tip(dip, tip, j) {
789                                 ret = write_tip_events(tip);
790                                 if (ret > 0)
791                                         events += ret;
792                                 tips_running += !tip->exited;
793                         }
794                 }
795                 usleep(10);
796         } while (events || tips_running);
797 }
798
799 static void wait_for_threads(void)
800 {
801         /*
802          * for piped or network output, poll and fetch data for writeout.
803          * for files, we just wait around for trace threads to exit
804          */
805         if ((output_name && !strcmp(output_name, "-")) ||
806             net_mode == Net_client)
807                 get_and_write_events();
808         else {
809                 struct device_information *dip;
810                 struct thread_information *tip;
811                 int i, j, tips_running;
812
813                 do {
814                         tips_running = 0;
815                         usleep(1000);
816
817                         for_each_dip(dip, i)
818                                 for_each_tip(dip, tip, j)
819                                         tips_running += !tip->exited;
820                 } while (tips_running);
821         }
822 }
823
824 static void fill_ofname(char *dst, char *buts_name, int cpu)
825 {
826         int len = 0;
827
828         if (output_dir)
829                 len = sprintf(dst, "%s/", output_dir);
830
831         if (output_name)
832                 sprintf(dst + len, "%s.blktrace.%d", output_name, cpu);
833         else
834                 sprintf(dst + len, "%s.blktrace.%d", buts_name, cpu);
835 }
836
837 static int start_threads(struct device_information *dip)
838 {
839         struct thread_information *tip;
840         int j, pipeline = output_name && !strcmp(output_name, "-");
841         int mode, vbuf_size;
842         char op[64];
843
844         for_each_tip(dip, tip, j) {
845                 tip->cpu = j;
846                 tip->device = dip;
847                 tip->events_processed = 0;
848                 memset(&tip->fifo, 0, sizeof(tip->fifo));
849                 tip->leftover_ts = NULL;
850
851                 if (pipeline) {
852                         tip->ofile = fdopen(STDOUT_FILENO, "w");
853                         tip->ofile_stdout = 1;
854                         tip->ofile_mmap = 0;
855                         mode = _IOLBF;
856                         vbuf_size = 512;
857                 } else {
858                         fill_ofname(op, dip->buts_name, tip->cpu);
859                         tip->ofile = fopen(op, "w+");
860                         tip->ofile_stdout = 0;
861                         tip->ofile_mmap = 1;
862                         mode = _IOFBF;
863                         vbuf_size = OFILE_BUF;
864                 }
865
866                 if (tip->ofile == NULL) {
867                         perror(op);
868                         return 1;
869                 }
870
871                 tip->ofile_buffer = malloc(vbuf_size);
872                 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
873                         perror("setvbuf");
874                         close_thread(tip);
875                         return 1;
876                 }
877
878                 if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
879                         perror("pthread_create");
880                         close_thread(tip);
881                         return 1;
882                 }
883         }
884
885         return 0;
886 }
887
888 static void stop_threads(struct device_information *dip)
889 {
890         struct thread_information *tip;
891         unsigned long ret;
892         int i;
893
894         for_each_tip(dip, tip, i) {
895                 (void) pthread_join(tip->thread, (void *) &ret);
896                 close_thread(tip);
897         }
898 }
899
900 static void stop_all_threads(void)
901 {
902         struct device_information *dip;
903         int i;
904
905         for_each_dip(dip, i)
906                 stop_threads(dip);
907 }
908
909 static void stop_all_tracing(void)
910 {
911         struct device_information *dip;
912         int i;
913
914         for_each_dip(dip, i)
915                 stop_trace(dip);
916 }
917
918 static void exit_trace(int status)
919 {
920         if (!is_trace_stopped()) {
921                 trace_stopped = 1;
922                 stop_all_threads();
923                 stop_all_tracing();
924         }
925
926         exit(status);
927 }
928
929 static int resize_devices(char *path)
930 {
931         int size = (ndevs + 1) * sizeof(struct device_information);
932
933         device_information = realloc(device_information, size);
934         if (!device_information) {
935                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
936                 return 1;
937         }
938         device_information[ndevs].path = path;
939         ndevs++;
940         return 0;
941 }
942
943 static int open_devices(void)
944 {
945         struct device_information *dip;
946         int i;
947
948         for_each_dip(dip, i) {
949                 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
950                 if (dip->fd < 0) {
951                         perror(dip->path);
952                         return 1;
953                 }
954         }
955
956         return 0;
957 }
958
959 static int start_devices(void)
960 {
961         struct device_information *dip;
962         int i, j, size;
963
964         size = ncpus * sizeof(struct thread_information);
965         thread_information = malloc(size * ndevs);
966         if (!thread_information) {
967                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
968                 return 1;
969         }
970
971         for_each_dip(dip, i) {
972                 if (start_trace(dip)) {
973                         close(dip->fd);
974                         fprintf(stderr, "Failed to start trace on %s\n",
975                                 dip->path);
976                         break;
977                 }
978         }
979
980         if (i != ndevs) {
981                 __for_each_dip(dip, j, i)
982                         stop_trace(dip);
983
984                 return 1;
985         }
986
987         for_each_dip(dip, i) {
988                 dip->threads = thread_information + (i * ncpus);
989                 if (start_threads(dip)) {
990                         fprintf(stderr, "Failed to start worker threads\n");
991                         break;
992                 }
993         }
994
995         if (i != ndevs) {
996                 __for_each_dip(dip, j, i)
997                         stop_threads(dip);
998                 for_each_dip(dip, i)
999                         stop_trace(dip);
1000
1001                 return 1;
1002         }
1003
1004         return 0;
1005 }
1006
1007 static void show_stats(void)
1008 {
1009         struct device_information *dip;
1010         struct thread_information *tip;
1011         unsigned long long events_processed, data_read;
1012         unsigned long total_drops;
1013         int i, j, no_stdout = 0;
1014
1015         if (is_stat_shown())
1016                 return;
1017
1018         if (output_name && !strcmp(output_name, "-"))
1019                 no_stdout = 1;
1020
1021         stat_shown = 1;
1022
1023         total_drops = 0;
1024         for_each_dip(dip, i) {
1025                 if (!no_stdout)
1026                         printf("Device: %s\n", dip->path);
1027                 events_processed = 0;
1028                 data_read = 0;
1029                 for_each_tip(dip, tip, j) {
1030                         if (!no_stdout)
1031                                 printf("  CPU%3d: %20lu events, %8llu KiB data\n",
1032                                         tip->cpu, tip->events_processed,
1033                                         tip->data_read >> 10);
1034                         events_processed += tip->events_processed;
1035                         data_read += tip->data_read;
1036                 }
1037                 total_drops += dip->drop_count;
1038                 if (!no_stdout)
1039                         printf("  Total:  %20llu events (dropped %lu), %8llu KiB data\n",
1040                                         events_processed, dip->drop_count,
1041                                         data_read >> 10);
1042         }
1043
1044         if (total_drops)
1045                 fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
1046 }
1047
1048 static struct device_information *net_get_dip(char *buts_name)
1049 {
1050         struct device_information *dip;
1051         int i;
1052
1053         for (i = 0; i < ndevs; i++) {
1054                 dip = &device_information[i];
1055
1056                 if (!strcmp(dip->buts_name, buts_name))
1057                         return dip;
1058         }
1059
1060         device_information = realloc(device_information, (ndevs + 1) * sizeof(*dip));
1061         dip = &device_information[ndevs];
1062         strcpy(dip->buts_name, buts_name);
1063         ndevs++;
1064         dip->threads = malloc(ncpus * sizeof(struct thread_information));
1065         memset(dip->threads, 0, ncpus * sizeof(struct thread_information));
1066
1067         /*
1068          * open all files
1069          */
1070         for (i = 0; i < ncpus; i++) {
1071                 struct thread_information *tip = &dip->threads[i];
1072                 char op[64];
1073
1074                 tip->cpu = i;
1075                 tip->ofile_stdout = 0;
1076                 tip->ofile_mmap = 1;
1077                 tip->device = dip;
1078
1079                 fill_ofname(op, dip->buts_name, tip->cpu);
1080
1081                 tip->ofile = fopen(op, "w+");
1082                 if (!tip->ofile) {
1083                         perror("fopen");
1084                         return NULL;
1085                 }
1086         }
1087
1088         return dip;
1089 }
1090
1091 static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh)
1092 {
1093         struct device_information *dip;
1094
1095         ncpus = bnh->max_cpus;
1096         dip = net_get_dip(bnh->buts_name);
1097         return &dip->threads[bnh->cpu];
1098 }
1099
1100 static int net_get_header(struct blktrace_net_hdr *bnh)
1101 {
1102         int fl = fcntl(net_in_fd, F_GETFL);
1103         int bytes_left, ret;
1104         void *p = bnh;
1105
1106         fcntl(net_in_fd, F_SETFL, fl | O_NONBLOCK);
1107         bytes_left = sizeof(*bnh);
1108         while (bytes_left && !is_done()) {
1109                 ret = recv(net_in_fd, p, bytes_left, MSG_WAITALL);
1110                 if (ret < 0) {
1111                         if (errno != EAGAIN) {
1112                                 perror("recv header");
1113                                 return 1;
1114                         }
1115                         usleep(100);
1116                         continue;
1117                 } else if (!ret) {
1118                         usleep(100);
1119                         continue;
1120                 } else {
1121                         p += ret;
1122                         bytes_left -= ret;
1123                 }
1124         }
1125         fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
1126         return 0;
1127 }
1128
1129 static int net_server_loop(void)
1130 {
1131         struct thread_information *tip;
1132         struct blktrace_net_hdr bnh;
1133
1134         if (net_get_header(&bnh))
1135                 return 1;
1136
1137         if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
1138                 fprintf(stderr, "server: received data is bad\n");
1139                 return 1;
1140         }
1141
1142         if (!data_is_native) {
1143                 bnh.cpu = be32_to_cpu(bnh.cpu);
1144                 bnh.len = be32_to_cpu(bnh.len);
1145         }
1146
1147         tip = net_get_tip(&bnh);
1148         if (!tip)
1149                 return 1;
1150
1151         if (mmap_subbuf(tip, bnh.len))
1152                 return 1;
1153
1154         return 0;
1155 }
1156
1157 /*
1158  * Start here when we are in server mode - just fetch data from the network
1159  * and dump to files
1160  */
1161 static int net_server(void)
1162 {
1163         struct sockaddr_in addr;
1164         socklen_t socklen;
1165         int fd, opt, i, j;
1166
1167         fd = socket(AF_INET, SOCK_STREAM, 0);
1168         if (fd < 0) {
1169                 perror("server: socket");
1170                 return 1;
1171         }
1172
1173         opt = 1;
1174         if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
1175                 perror("setsockopt");
1176                 return 1;
1177         }
1178
1179         memset(&addr, 0, sizeof(addr));
1180         addr.sin_family = AF_INET;
1181         addr.sin_addr.s_addr = htonl(INADDR_ANY);
1182         addr.sin_port = htons(net_port);
1183
1184         if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1185                 perror("bind");
1186                 return 1;
1187         }
1188
1189         if (listen(fd, 1) < 0) {
1190                 perror("listen");
1191                 return 1;
1192         }
1193
1194         printf("blktrace: waiting for incoming connection...\n");
1195
1196         socklen = sizeof(addr);
1197         net_in_fd = accept(fd, (struct sockaddr *) &addr, &socklen);
1198         if (net_in_fd < 0) {
1199                 perror("accept");
1200                 return 1;
1201         }
1202
1203         signal(SIGINT, handle_sigint);
1204         signal(SIGHUP, handle_sigint);
1205         signal(SIGTERM, handle_sigint);
1206         signal(SIGALRM, handle_sigint);
1207
1208         printf("blktrace: connected!\n");
1209
1210         while (!is_done()) {
1211                 if (net_server_loop())
1212                         break;
1213         }
1214
1215         for (i = 0; i < ndevs; i++) {
1216                 struct device_information *dip = &device_information[i];
1217
1218                 for (j = 0; j < ncpus; j++)
1219                         tip_ftrunc_final(&dip->threads[j]);
1220         }
1221
1222         return 0;
1223 }
1224
1225 /*
1226  * Setup outgoing network connection where we will transmit data
1227  */
1228 static int net_setup_client(void)
1229 {
1230         struct sockaddr_in addr;
1231         int fd;
1232
1233         fd = socket(AF_INET, SOCK_STREAM, 0);
1234         if (fd < 0) {
1235                 perror("client: socket");
1236                 return 1;
1237         }
1238
1239         memset(&addr, 0, sizeof(addr));
1240         addr.sin_family = AF_INET;
1241         addr.sin_port = htons(net_port);
1242
1243         if (inet_aton(hostname, &addr.sin_addr) != 1) {
1244                 struct hostent *hent = gethostbyname(hostname);
1245                 if (!hent) {
1246                         perror("gethostbyname");
1247                         return 1;
1248                 }
1249
1250                 memcpy(&addr.sin_addr, hent->h_addr, 4);
1251                 strcpy(hostname, hent->h_name);
1252         }
1253
1254         printf("blktrace: connecting to %s\n", hostname);
1255
1256         if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1257                 perror("client: connect");
1258                 return 1;
1259         }
1260
1261         printf("blktrace: connected!\n");
1262         net_out_fd = fd;
1263         return 0;
1264 }
1265
/*
 * Option summary printed by show_usage(). Keep in sync with the option
 * string and the switch in main().
 */
static char usage_str[] =
	"-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n"
	"[ -a action ] [ -A action mask ] [ -l ] [ -h host ] [ -p port ] [ -V ]\n\n"
	"\t-d Use specified device. May also be given last after options\n"
	"\t-r Path to mounted relayfs, defaults to /relay\n"
	"\t-o File(s) to send output to\n"
	"\t-D Directory to prepend to output file names\n"
	"\t-k Kill a running trace\n"
	"\t-w Stop after defined time, in seconds\n"
	"\t-a Only trace specified actions. See documentation\n"
	"\t-A Give trace mask as a single value. See documentation\n"
	"\t-b Sub buffer size in KiB\n"
	"\t-n Number of sub buffers\n"
	"\t-l Run in network server mode, receiving trace data\n"
	"\t-h Run in network client mode, sending trace data to this host\n"
	"\t-p Network port to use\n"
	"\t-V Print program version info\n\n";
1280
1281 static void show_usage(char *program)
1282 {
1283         fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
1284 }
1285
1286 int main(int argc, char *argv[])
1287 {
1288         static char default_relay_path[] = "/relay";
1289         struct statfs st;
1290         int i, c;
1291         int stop_watch = 0;
1292         int act_mask_tmp = 0;
1293
1294         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
1295                 switch (c) {
1296                 case 'a':
1297                         i = find_mask_map(optarg);
1298                         if (i < 0) {
1299                                 fprintf(stderr,"Invalid action mask %s\n",
1300                                         optarg);
1301                                 return 1;
1302                         }
1303                         act_mask_tmp |= i;
1304                         break;
1305
1306                 case 'A':
1307                         if ((sscanf(optarg, "%x", &i) != 1) || 
1308                                                         !valid_act_opt(i)) {
1309                                 fprintf(stderr,
1310                                         "Invalid set action mask %s/0x%x\n",
1311                                         optarg, i);
1312                                 return 1;
1313                         }
1314                         act_mask_tmp = i;
1315                         break;
1316
1317                 case 'd':
1318                         if (resize_devices(optarg) != 0)
1319                                 return 1;
1320                         break;
1321
1322                 case 'r':
1323                         relay_path = optarg;
1324                         break;
1325
1326                 case 'o':
1327                         output_name = optarg;
1328                         break;
1329                 case 'k':
1330                         kill_running_trace = 1;
1331                         break;
1332                 case 'w':
1333                         stop_watch = atoi(optarg);
1334                         if (stop_watch <= 0) {
1335                                 fprintf(stderr,
1336                                         "Invalid stopwatch value (%d secs)\n",
1337                                         stop_watch);
1338                                 return 1;
1339                         }
1340                         break;
1341                 case 'V':
1342                         printf("%s version %s\n", argv[0], blktrace_version);
1343                         return 0;
1344                 case 'b':
1345                         buf_size = strtoul(optarg, NULL, 10);
1346                         if (buf_size <= 0 || buf_size > 16*1024) {
1347                                 fprintf(stderr,
1348                                         "Invalid buffer size (%lu)\n",buf_size);
1349                                 return 1;
1350                         }
1351                         buf_size <<= 10;
1352                         break;
1353                 case 'n':
1354                         buf_nr = strtoul(optarg, NULL, 10);
1355                         if (buf_nr <= 0) {
1356                                 fprintf(stderr,
1357                                         "Invalid buffer nr (%lu)\n", buf_nr);
1358                                 return 1;
1359                         }
1360                         break;
1361                 case 'D':
1362                         output_dir = optarg;
1363                         break;
1364                 case 'h':
1365                         net_mode = Net_client;
1366                         strcpy(hostname, optarg);
1367                         break;
1368                 case 'l':
1369                         net_mode = Net_server;
1370                         break;
1371                 case 'p':
1372                         net_port = atoi(optarg);
1373                         break;
1374                 default:
1375                         show_usage(argv[0]);
1376                         return 1;
1377                 }
1378         }
1379
1380         setlocale(LC_NUMERIC, "en_US");
1381
1382         page_size = getpagesize();
1383
1384         if (net_mode == Net_server)
1385                 return net_server();
1386
1387         while (optind < argc) {
1388                 if (resize_devices(argv[optind++]) != 0)
1389                         return 1;
1390         }
1391
1392         if (ndevs == 0) {
1393                 show_usage(argv[0]);
1394                 return 1;
1395         }
1396
1397         if (!relay_path)
1398                 relay_path = default_relay_path;
1399
1400         if (act_mask_tmp != 0)
1401                 act_mask = act_mask_tmp;
1402
1403         if (statfs(relay_path, &st) < 0) {
1404                 perror("statfs");
1405                 fprintf(stderr,"%s does not appear to be a valid path\n",
1406                         relay_path);
1407                 return 1;
1408         } else if (st.f_type != (long) RELAYFS_TYPE) {
1409                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
1410                         relay_path);
1411                 return 1;
1412         }
1413
1414         if (open_devices() != 0)
1415                 return 1;
1416
1417         if (kill_running_trace) {
1418                 stop_all_traces();
1419                 return 0;
1420         }
1421
1422         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
1423         if (ncpus < 0) {
1424                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
1425                 return 1;
1426         }
1427
1428         signal(SIGINT, handle_sigint);
1429         signal(SIGHUP, handle_sigint);
1430         signal(SIGTERM, handle_sigint);
1431         signal(SIGALRM, handle_sigint);
1432
1433         if (net_mode == Net_client && net_setup_client())
1434                 return 1;
1435
1436         if (start_devices() != 0)
1437                 return 1;
1438
1439         atexit(stop_all_tracing);
1440
1441         if (stop_watch)
1442                 alarm(stop_watch);
1443
1444         wait_for_threads();
1445
1446         if (!is_trace_stopped()) {
1447                 trace_stopped = 1;
1448                 stop_all_threads();
1449                 stop_all_traces();
1450         }
1451
1452         show_stats();
1453
1454         return 0;
1455 }
1456