[PATCH] blktrace: first cut at adding network support
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <sys/poll.h>
33 #include <sys/mman.h>
34 #include <sys/socket.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <sched.h>
38 #include <ctype.h>
39 #include <getopt.h>
40 #include <errno.h>
41 #include <netinet/in.h>
42 #include <arpa/inet.h>
43 #include <netdb.h>
44 #include <sys/sendfile.h>
45
46 #include "blktrace.h"
47 #include "barrier.h"
48
/* version string of this tool */
static char blktrace_version[] = "0.99";

/*
 * Default sub-buffer size and count requested for a trace. Raise these if
 * you are logging at a high rate and see skipped/missed events.
 */
#define BUF_SIZE	(512 * 1024)
#define BUF_NR		(4)

/* size of the stdio buffer attached to regular output files */
#define OFILE_BUF	(128 * 1024)

#define RELAYFS_TYPE	0xF0B4A981

/* short option string; the long names are listed in l_opts[] */
#define S_OPTS	"d:a:A:r:o:kw:Vb:n:D:lh:p:"
/*
 * Long option table. Every entry reports through its return value (flag is
 * NULL) using the short option character that S_OPTS lists for the same
 * switch. The all-zero entry terminates the table.
 */
static struct option l_opts[] = {
	/* name			has_arg			flag	val */
	{ "dev",		required_argument,	NULL,	'd' },
	{ "act-mask",		required_argument,	NULL,	'a' },
	{ "set-mask",		required_argument,	NULL,	'A' },
	{ "relay",		required_argument,	NULL,	'r' },
	{ "output",		required_argument,	NULL,	'o' },
	{ "kill",		no_argument,		NULL,	'k' },
	{ "stopwatch",		required_argument,	NULL,	'w' },
	{ "version",		no_argument,		NULL,	'V' },
	{ "buffer-size",	required_argument,	NULL,	'b' },
	{ "num-sub-buffers",	required_argument,	NULL,	'n' },
	{ "output-dir",		required_argument,	NULL,	'D' },
	{ "listen",		no_argument,		NULL,	'l' },
	{ "host",		required_argument,	NULL,	'h' },
	{ "port",		required_argument,	NULL,	'p' },
	{ NULL,			0,			NULL,	0   }
};
152
/*
 * One chunk of trace data handed from a reader thread to the writer
 * through the per-thread fifo.
 */
struct tip_subbuf {
	void *buf;		/* data; NULL when only offset/len describe it */
	unsigned int len;	/* number of valid bytes */
	unsigned int max_len;	/* capacity of buf */
	off_t offset;		/* position of the data in the trace file */
};
159
#define FIFO_SIZE	(1024)	/* should be plenty big! */
#define CL_SIZE		(128)	/* cache line, any bigger? */

/*
 * Ring of sub-buffer pointers. Entries are stored at 'tail' and taken
 * from 'head'; the two indices are aligned to CL_SIZE so they land in
 * separate CL_SIZE-sized slots. Index arithmetic masks with
 * FIFO_SIZE - 1, so FIFO_SIZE has to stay a power of two.
 */
struct tip_subbuf_fifo {
	int tail __attribute__((aligned(CL_SIZE)));
	int head __attribute__((aligned(CL_SIZE)));
	struct tip_subbuf *q[FIFO_SIZE];
};
168
169 struct thread_information {
170         int cpu;
171         pthread_t thread;
172
173         int fd;
174         void *fd_buf;
175         char fn[MAXPATHLEN + 64];
176
177         char ofname[64];
178
179         FILE *ofile;
180         off_t ofile_offset;
181         char *ofile_buffer;
182         int ofile_stdout;
183         int ofile_mmap;
184
185         unsigned long events_processed;
186         unsigned long long data_read;
187         struct device_information *device;
188
189         int exited;
190
191         /*
192          * piped fifo buffers
193          */
194         struct tip_subbuf_fifo fifo;
195         struct tip_subbuf *leftover_ts;
196
197         /*
198          * mmap controlled output files
199          */
200         unsigned long long fs_size;
201         unsigned long long fs_max_size;
202         unsigned long fs_off;
203         void *fs_buf;
204         unsigned long fs_buf_len;
205 };
206
/*
 * State kept for each traced block device.
 */
struct device_information {
	int fd;					/* open descriptor of the device node */
	char *path;				/* device path as given by the user */
	char buts_name[32];			/* trace name reported by BLKTRACESETUP */
	volatile int trace_started;		/* use dip_tracing()/dip_set_tracing() */
	unsigned long drop_count;		/* dropped events read when stopping */
	struct thread_information *threads;	/* first of this device's ncpus threads */
};
215
216 static int ncpus;
217 static struct thread_information *thread_information;
218 static int ndevs;
219 static struct device_information *device_information;
220
221 /* command line option globals */
222 static char *relay_path;
223 static char *output_name;
224 static char *output_dir;
225 static int act_mask = ~0U;
226 static int kill_running_trace;
227 static unsigned long buf_size = BUF_SIZE;
228 static unsigned long buf_nr = BUF_NR;
229 static unsigned int page_size;
230
231 #define is_done()       (*(volatile int *)(&done))
232 static volatile int done;
233
234 #define is_trace_stopped()      (*(volatile int *)(&trace_stopped))
235 static volatile int trace_stopped;
236
237 #define is_stat_shown() (*(volatile int *)(&stat_shown))
238 static volatile int stat_shown;
239
240 int data_is_native = -1;
241
242 static void exit_trace(int status);
243
244 #define dip_tracing(dip)        (*(volatile int *)(&(dip)->trace_started))
245 #define dip_set_tracing(dip, v) ((dip)->trace_started = (v))
246
247 #define __for_each_dip(__d, __i, __e)   \
248         for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
249
250 #define for_each_dip(__d, __i)  __for_each_dip(__d, __i, ndevs)
251 #define for_each_tip(__d, __t, __j)     \
252         for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
253
254 /*
255  * networking stuff follows. we include a magic number so we know whether
256  * to endianness convert or not
257  */
258 struct blktrace_net_hdr {
259         u32 magic;              /* same as trace magic */
260         char ofname[64];        /* trace name */
261         u32 cpu;                /* for which cpu */
262         u32 len;                /* length of following trace data */
263 };
264
/* default port, used unless overridden */
#define TRACE_NET_PORT		(8462)

/* values taken by net_mode */
enum {
	Net_none = 0,
	Net_server,
	Net_client,
};

/*
 * network cmd line params
 */
static char hostname[MAXHOSTNAMELEN];
static int net_port = TRACE_NET_PORT;
static int net_mode = Net_none;

/* sockets used for receiving and sending; -1 while unset */
static int net_in_fd = -1;
static int net_out_fd = -1;
282
283 static void handle_sigint(__attribute__((__unused__)) int sig)
284 {
285         done = 1;
286 }
287
288 static int get_dropped_count(const char *buts_name)
289 {
290         int fd;
291         char tmp[MAXPATHLEN + 64];
292
293         snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
294                  relay_path, buts_name);
295
296         fd = open(tmp, O_RDONLY);
297         if (fd < 0) {
298                 /*
299                  * this may be ok, if the kernel doesn't support dropped counts
300                  */
301                 if (errno == ENOENT)
302                         return 0;
303
304                 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
305                 return -1;
306         }
307
308         if (read(fd, tmp, sizeof(tmp)) < 0) {
309                 perror(tmp);
310                 close(fd);
311                 return -1;
312         }
313
314         close(fd);
315
316         return atoi(tmp);
317 }
318
319 static int start_trace(struct device_information *dip)
320 {
321         struct blk_user_trace_setup buts;
322
323         memset(&buts, 0, sizeof(buts));
324         buts.buf_size = buf_size;
325         buts.buf_nr = buf_nr;
326         buts.act_mask = act_mask;
327
328         if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
329                 perror("BLKTRACESETUP");
330                 return 1;
331         }
332
333         if (ioctl(dip->fd, BLKTRACESTART) < 0) {
334                 perror("BLKTRACESTART");
335                 return 1;
336         }
337
338         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
339         dip_set_tracing(dip, 1);
340         return 0;
341 }
342
343 static void stop_trace(struct device_information *dip)
344 {
345         if (dip_tracing(dip) || kill_running_trace) {
346                 dip_set_tracing(dip, 0);
347
348                 if (ioctl(dip->fd, BLKTRACESTOP) < 0)
349                         perror("BLKTRACESTOP");
350                 if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
351                         perror("BLKTRACETEARDOWN");
352
353                 close(dip->fd);
354                 dip->fd = -1;
355         }
356 }
357
358 static void stop_all_traces(void)
359 {
360         struct device_information *dip;
361         int i;
362
363         for_each_dip(dip, i) {
364                 dip->drop_count = get_dropped_count(dip->buts_name);
365                 stop_trace(dip);
366         }
367 }
368
369 static void wait_for_data(struct thread_information *tip)
370 {
371         struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
372
373         do {
374                 poll(&pfd, 1, 100);
375                 if (pfd.revents & POLLIN)
376                         break;
377                 if (tip->ofile_stdout)
378                         break;
379         } while (!is_done());
380 }
381
382 static int read_data_file(struct thread_information *tip, void *buf, int len)
383 {
384         int ret = 0;
385
386         do {
387                 wait_for_data(tip);
388
389                 ret = read(tip->fd, buf, len);
390                 if (!ret)
391                         continue;
392                 else if (ret > 0)
393                         return ret;
394                 else {
395                         if (errno != EAGAIN) {
396                                 perror(tip->fn);
397                                 fprintf(stderr,"Thread %d failed read of %s\n",
398                                         tip->cpu, tip->fn);
399                                 break;
400                         }
401                         continue;
402                 }
403         } while (!is_done());
404
405         return ret;
406
407 }
408
409 static int read_data_net(struct thread_information *tip, void *buf, int len)
410 {
411         unsigned int bytes_left = len;
412         int ret = 0;
413
414         do {
415                 ret = recv(net_in_fd, buf, bytes_left, MSG_WAITALL);
416
417                 if (!ret)
418                         continue;
419                 else if (ret < 0) {
420                         if (errno != EAGAIN) {
421                                 perror(tip->fn);
422                                 fprintf(stderr, "server: failed read\n");
423                                 return 0;
424                         }
425                         continue;
426                 } else {
427                         buf += ret;
428                         bytes_left -= ret;
429                 }
430         } while (!is_done() && bytes_left);
431
432         return len;
433 }
434
435 static int read_data(struct thread_information *tip, void *buf, int len)
436 {
437         if (net_mode == Net_server)
438                 return read_data_net(tip, buf, len);
439
440         return read_data_file(tip, buf, len);
441 }
442
443 static inline struct tip_subbuf *
444 subbuf_fifo_dequeue(struct thread_information *tip)
445 {
446         const int head = tip->fifo.head;
447         const int next = (head + 1) & (FIFO_SIZE - 1);
448
449         if (head != tip->fifo.tail) {
450                 struct tip_subbuf *ts = tip->fifo.q[head];
451
452                 store_barrier();
453                 tip->fifo.head = next;
454                 return ts;
455         }
456
457         return NULL;
458 }
459
460 static inline int subbuf_fifo_queue(struct thread_information *tip,
461                                     struct tip_subbuf *ts)
462 {
463         const int tail = tip->fifo.tail;
464         const int next = (tail + 1) & (FIFO_SIZE - 1);
465
466         if (next != tip->fifo.head) {
467                 tip->fifo.q[tail] = ts;
468                 store_barrier();
469                 tip->fifo.tail = next;
470                 return 0;
471         }
472
473         fprintf(stderr, "fifo too small!\n");
474         return 1;
475 }
476
477 /*
478  * For file output, truncate and mmap the file appropriately
479  */
480 static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
481 {
482         int ofd = fileno(tip->ofile);
483         int ret;
484
485         /*
486          * extend file, if we have to. use chunks of 16 subbuffers.
487          */
488         if (tip->fs_off + buf_size > tip->fs_buf_len) {
489                 if (tip->fs_buf) {
490                         munlock(tip->fs_buf, tip->fs_buf_len);
491                         munmap(tip->fs_buf, tip->fs_buf_len);
492                         tip->fs_buf = NULL;
493                 }
494
495                 tip->fs_off = tip->fs_size & (page_size - 1);
496                 tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
497                 tip->fs_max_size += tip->fs_buf_len;
498
499                 if (ftruncate(ofd, tip->fs_max_size) < 0) {
500                         perror("ftruncate");
501                         return -1;
502                 }
503
504                 tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
505                                    MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
506                 if (tip->fs_buf == MAP_FAILED) {
507                         perror("mmap");
508                         return -1;
509                 }
510                 mlock(tip->fs_buf, tip->fs_buf_len);
511         }
512
513         ret = read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
514         if (ret >= 0) {
515                 tip->data_read += ret;
516                 tip->fs_size += ret;
517                 tip->fs_off += ret;
518                 return 0;
519         }
520
521         return -1;
522 }
523
524 /*
525  * Use the copy approach for pipes
526  */
527 static int get_subbuf(struct thread_information *tip)
528 {
529         struct tip_subbuf *ts = malloc(sizeof(*ts));
530         int ret = 1;
531
532         /*
533          * for the net client, don't actually read the data in as we will
534          * use sendfile to transmit it. just mark how much data we have
535          */
536         if (net_mode == Net_client) {
537                 struct stat sb;
538
539                 ts->buf = NULL;
540
541                 if (fstat(tip->fd, &sb) < 0) {
542                         perror("trace stat");
543                         goto out;
544                 }
545
546                 ts->len = sb.st_size - tip->ofile_offset;
547                 ts->max_len = ts->len;
548                 ts->offset = tip->ofile_offset;
549                 tip->ofile_offset += ts->len;
550                 ret = subbuf_fifo_queue(tip, ts);
551         } else {
552                 ts->buf = malloc(buf_size);
553                 ts->max_len = buf_size;
554
555                 ret = read_data(tip, ts->buf, ts->max_len);
556                 if (ret > 0) {
557                         ts->len = ret;
558                         ret = subbuf_fifo_queue(tip, ts);
559                 }
560         }
561
562 out:
563         if (ret) {
564                 if (ts->buf)
565                         free(ts->buf);
566                 free(ts);
567         }
568
569         return ret;
570 }
571
572 static void close_thread(struct thread_information *tip)
573 {
574         if (tip->fd != -1)
575                 close(tip->fd);
576         if (tip->ofile)
577                 fclose(tip->ofile);
578         if (tip->ofile_buffer)
579                 free(tip->ofile_buffer);
580         if (tip->fd_buf)
581                 free(tip->fd_buf);
582
583         tip->fd = -1;
584         tip->ofile = NULL;
585         tip->ofile_buffer = NULL;
586         tip->fd_buf = NULL;
587 }
588
589 static void tip_ftrunc_final(struct thread_information *tip)
590 {
591         /*
592          * truncate to right size and cleanup mmap
593          */
594         if (tip->ofile_mmap) {
595                 int ofd = fileno(tip->ofile);
596
597                 if (tip->fs_buf)
598                         munmap(tip->fs_buf, tip->fs_buf_len);
599
600                 ftruncate(ofd, tip->fs_size);
601         }
602 }
603
604 static void *thread_main(void *arg)
605 {
606         struct thread_information *tip = arg;
607         pid_t pid = getpid();
608         cpu_set_t cpu_mask;
609
610         CPU_ZERO(&cpu_mask);
611         CPU_SET((tip->cpu), &cpu_mask);
612
613         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
614                 perror("sched_setaffinity");
615                 exit_trace(1);
616         }
617
618         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
619                         relay_path, tip->device->buts_name, tip->cpu);
620         tip->fd = open(tip->fn, O_RDONLY);
621         if (tip->fd < 0) {
622                 perror(tip->fn);
623                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
624                         tip->fn);
625                 exit_trace(1);
626         }
627
628         while (!is_done()) {
629                 if (tip->ofile_mmap && net_mode != Net_client) {
630                         if (mmap_subbuf(tip, buf_size))
631                                 break;
632                 } else {
633                         if (get_subbuf(tip))
634                                 break;
635                 }
636         }
637
638         tip_ftrunc_final(tip);
639         tip->exited = 1;
640         return NULL;
641 }
642
/*
 * Send exactly buf_len bytes from buf over socket fd, looping over short
 * writes and retrying when send() is interrupted by a signal.
 *
 * Returns 0 when everything was sent, 1 on error.
 */
static int write_data_net(int fd, void *buf, unsigned int buf_len)
{
	const char *src = buf;
	unsigned int bytes_left = buf_len;
	int ret;

	while (bytes_left) {
		ret = send(fd, src, bytes_left, 0);
		if (ret < 0) {
			if (errno == EINTR)
				continue;
			perror("send");
			return 1;
		}

		src += ret;
		bytes_left -= ret;
	}

	return 0;
}
661
662 static int flush_subbuf_net(struct thread_information *tip,
663                             struct tip_subbuf *ts)
664 {
665         struct blktrace_net_hdr hdr;
666         int ret = 0;
667
668         hdr.magic = BLK_IO_TRACE_MAGIC;
669         strcpy(hdr.ofname, tip->ofname);
670         hdr.cpu = tip->cpu;
671         hdr.len = ts->len;
672
673         if (write_data_net(net_out_fd, &hdr, sizeof(hdr)))
674                 return 1;
675
676         if (sendfile(net_out_fd, tip->fd, &ts->offset, ts->len) < 0) {
677                 perror("sendfile");
678                 ret = 1;
679         }
680
681         free(ts);
682         return 0;
683 }
684
685 static int write_data(struct thread_information *tip, void *buf,
686                       unsigned int buf_len)
687 {
688         int ret;
689
690         if (!buf_len)
691                 return 0;
692
693         while (1) {
694                 ret = fwrite(buf, buf_len, 1, tip->ofile);
695                 if (ret == 1)
696                         break;
697
698                 if (ret < 0) {
699                         perror("write");
700                         return 1;
701                 }
702         }
703
704         if (tip->ofile_stdout)
705                 fflush(tip->ofile);
706
707         return 0;
708 }
709
710 static int flush_subbuf_file(struct thread_information *tip,
711                              struct tip_subbuf *ts)
712 {
713         unsigned int offset = 0;
714         struct blk_io_trace *t;
715         int pdu_len, events = 0;
716
717         /*
718          * surplus from last run
719          */
720         if (tip->leftover_ts) {
721                 struct tip_subbuf *prev_ts = tip->leftover_ts;
722
723                 if (prev_ts->len + ts->len > prev_ts->max_len) {
724                         prev_ts->max_len += ts->len;
725                         prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
726                 }
727
728                 memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
729                 prev_ts->len += ts->len;
730
731                 free(ts->buf);
732                 free(ts);
733
734                 ts = prev_ts;
735                 tip->leftover_ts = NULL;
736         }
737
738         while (offset + sizeof(*t) <= ts->len) {
739                 t = ts->buf + offset;
740
741                 if (verify_trace(t)) {
742                         write_data(tip, ts->buf, offset);
743                         return -1;
744                 }
745
746                 pdu_len = t->pdu_len;
747
748                 if (offset + sizeof(*t) + pdu_len > ts->len)
749                         break;
750
751                 offset += sizeof(*t) + pdu_len;
752                 tip->events_processed++;
753                 tip->data_read += sizeof(*t) + pdu_len;
754                 events++;
755         }
756
757         if (write_data(tip, ts->buf, offset))
758                 return -1;
759
760         /*
761          * leftover bytes, save them for next time
762          */
763         if (offset != ts->len) {
764                 tip->leftover_ts = ts;
765                 ts->len -= offset;
766                 memmove(ts->buf, ts->buf + offset, ts->len);
767         } else {
768                 free(ts->buf);
769                 free(ts);
770         }
771
772         return events;
773 }
774
775 static int write_tip_events(struct thread_information *tip)
776 {
777         struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
778
779         if (ts) {
780                 if (net_mode == Net_client)
781                         return flush_subbuf_net(tip, ts);
782
783                 return flush_subbuf_file(tip, ts);
784         }
785
786         return 0;
787 }
788
789 /*
790  * scans the tips we know and writes out the subbuffers we accumulate
791  */
792 static void get_and_write_events(void)
793 {
794         struct device_information *dip;
795         struct thread_information *tip;
796         int i, j, events, ret, tips_running;
797
798         while (!is_done()) {
799                 events = 0;
800
801                 for_each_dip(dip, i) {
802                         for_each_tip(dip, tip, j) {
803                                 ret = write_tip_events(tip);
804                                 if (ret > 0)
805                                         events += ret;
806                         }
807                 }
808
809                 if (!events)
810                         usleep(10);
811         }
812
813         /*
814          * reap stored events
815          */
816         do {
817                 events = 0;
818                 tips_running = 0;
819                 for_each_dip(dip, i) {
820                         for_each_tip(dip, tip, j) {
821                                 ret = write_tip_events(tip);
822                                 if (ret > 0)
823                                         events += ret;
824                                 tips_running += !tip->exited;
825                         }
826                 }
827                 usleep(10);
828         } while (events || tips_running);
829 }
830
831 static void wait_for_threads(void)
832 {
833         /*
834          * for piped or network output, poll and fetch data for writeout.
835          * for files, we just wait around for trace threads to exit
836          */
837         if ((output_name && !strcmp(output_name, "-")) ||
838             net_mode == Net_client)
839                 get_and_write_events();
840         else {
841                 struct device_information *dip;
842                 struct thread_information *tip;
843                 int i, j, tips_running;
844
845                 do {
846                         tips_running = 0;
847                         usleep(1000);
848
849                         for_each_dip(dip, i)
850                                 for_each_tip(dip, tip, j)
851                                         tips_running += !tip->exited;
852                 } while (tips_running);
853         }
854 }
855
856 static void fill_ofname(char *dst, char *buts_name, int cpu)
857 {
858         int len = 0;
859
860         if (output_dir)
861                 len = sprintf(dst, "%s/", output_dir);
862
863         if (output_name)
864                 sprintf(dst + len, "%s.blktrace.%d", output_name, cpu);
865         else
866                 sprintf(dst + len, "%s.blktrace.%d", buts_name, cpu);
867 }
868
869 static int start_threads(struct device_information *dip)
870 {
871         struct thread_information *tip;
872         int j, pipeline = output_name && !strcmp(output_name, "-");
873         int mode, vbuf_size;
874
875         for_each_tip(dip, tip, j) {
876                 tip->cpu = j;
877                 tip->device = dip;
878                 tip->events_processed = 0;
879                 memset(&tip->fifo, 0, sizeof(tip->fifo));
880                 tip->leftover_ts = NULL;
881
882                 if (pipeline) {
883                         tip->ofile = fdopen(STDOUT_FILENO, "w");
884                         tip->ofile_stdout = 1;
885                         tip->ofile_mmap = 0;
886                         mode = _IOLBF;
887                         vbuf_size = 512;
888                 } else {
889                         fill_ofname(tip->ofname, dip->buts_name, tip->cpu);
890                         tip->ofile = fopen(tip->ofname, "w+");
891                         tip->ofile_stdout = 0;
892                         tip->ofile_mmap = 1;
893                         mode = _IOFBF;
894                         vbuf_size = OFILE_BUF;
895                 }
896
897                 if (tip->ofile == NULL) {
898                         perror(tip->ofname);
899                         return 1;
900                 }
901
902                 tip->ofile_buffer = malloc(vbuf_size);
903                 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
904                         perror("setvbuf");
905                         close_thread(tip);
906                         return 1;
907                 }
908
909                 if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
910                         perror("pthread_create");
911                         close_thread(tip);
912                         return 1;
913                 }
914         }
915
916         return 0;
917 }
918
919 static void stop_threads(struct device_information *dip)
920 {
921         struct thread_information *tip;
922         unsigned long ret;
923         int i;
924
925         for_each_tip(dip, tip, i) {
926                 (void) pthread_join(tip->thread, (void *) &ret);
927                 close_thread(tip);
928         }
929 }
930
931 static void stop_all_threads(void)
932 {
933         struct device_information *dip;
934         int i;
935
936         for_each_dip(dip, i)
937                 stop_threads(dip);
938 }
939
940 static void stop_all_tracing(void)
941 {
942         struct device_information *dip;
943         int i;
944
945         for_each_dip(dip, i)
946                 stop_trace(dip);
947 }
948
949 static void exit_trace(int status)
950 {
951         if (!is_trace_stopped()) {
952                 trace_stopped = 1;
953                 stop_all_threads();
954                 stop_all_tracing();
955         }
956
957         exit(status);
958 }
959
960 static int resize_devices(char *path)
961 {
962         int size = (ndevs + 1) * sizeof(struct device_information);
963
964         device_information = realloc(device_information, size);
965         if (!device_information) {
966                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
967                 return 1;
968         }
969         device_information[ndevs].path = path;
970         ndevs++;
971         return 0;
972 }
973
974 static int open_devices(void)
975 {
976         struct device_information *dip;
977         int i;
978
979         for_each_dip(dip, i) {
980                 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
981                 if (dip->fd < 0) {
982                         perror(dip->path);
983                         return 1;
984                 }
985         }
986
987         return 0;
988 }
989
990 static int start_devices(void)
991 {
992         struct device_information *dip;
993         int i, j, size;
994
995         size = ncpus * sizeof(struct thread_information);
996         thread_information = malloc(size * ndevs);
997         if (!thread_information) {
998                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
999                 return 1;
1000         }
1001
1002         for_each_dip(dip, i) {
1003                 if (start_trace(dip)) {
1004                         close(dip->fd);
1005                         fprintf(stderr, "Failed to start trace on %s\n",
1006                                 dip->path);
1007                         break;
1008                 }
1009         }
1010
1011         if (i != ndevs) {
1012                 __for_each_dip(dip, j, i)
1013                         stop_trace(dip);
1014
1015                 return 1;
1016         }
1017
1018         for_each_dip(dip, i) {
1019                 dip->threads = thread_information + (i * ncpus);
1020                 if (start_threads(dip)) {
1021                         fprintf(stderr, "Failed to start worker threads\n");
1022                         break;
1023                 }
1024         }
1025
1026         if (i != ndevs) {
1027                 __for_each_dip(dip, j, i)
1028                         stop_threads(dip);
1029                 for_each_dip(dip, i)
1030                         stop_trace(dip);
1031
1032                 return 1;
1033         }
1034
1035         return 0;
1036 }
1037
1038 static void show_stats(void)
1039 {
1040         struct device_information *dip;
1041         struct thread_information *tip;
1042         unsigned long long events_processed, data_read;
1043         unsigned long total_drops;
1044         int i, j, no_stdout = 0;
1045
1046         if (is_stat_shown())
1047                 return;
1048
1049         if (output_name && !strcmp(output_name, "-"))
1050                 no_stdout = 1;
1051
1052         stat_shown = 1;
1053
1054         total_drops = 0;
1055         for_each_dip(dip, i) {
1056                 if (!no_stdout)
1057                         printf("Device: %s\n", dip->path);
1058                 events_processed = 0;
1059                 data_read = 0;
1060                 for_each_tip(dip, tip, j) {
1061                         if (!no_stdout)
1062                                 printf("  CPU%3d: %20lu events, %8llu KiB data\n",
1063                                         tip->cpu, tip->events_processed,
1064                                         tip->data_read >> 10);
1065                         events_processed += tip->events_processed;
1066                         data_read += tip->data_read;
1067                 }
1068                 total_drops += dip->drop_count;
1069                 if (!no_stdout)
1070                         printf("  Total:  %20llu events (dropped %lu), %8llu KiB data\n",
1071                                         events_processed, dip->drop_count,
1072                                         data_read >> 10);
1073         }
1074
1075         if (total_drops)
1076                 fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
1077 }
1078
1079 static struct thread_information *net_tip_list;
1080 static int net_tip_len;
1081
1082 static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh)
1083 {
1084         struct thread_information *tip;
1085         int i;
1086
1087         for (i = 0; i < net_tip_len; i++) {
1088                 tip = &net_tip_list[i];
1089
1090                 if (!strcmp(bnh->ofname, tip->ofname))
1091                         return tip;
1092         }
1093
1094         net_tip_list = realloc(net_tip_list, (net_tip_len + 1) * sizeof(*tip));
1095         tip = &net_tip_list[net_tip_len];
1096         net_tip_len++;
1097
1098         memset(tip, 0, sizeof(*tip));
1099
1100         tip->cpu = bnh->cpu;
1101         strcpy(tip->ofname, bnh->ofname);
1102
1103         tip->ofile = fopen(tip->ofname, "w+");
1104         if (!tip->ofile) {
1105                 perror("fopen");
1106                 return NULL;
1107         }
1108
1109         tip->ofile_stdout = 0;
1110         tip->ofile_mmap = 1;
1111         return tip;
1112 }
1113
1114 static int net_get_header(struct blktrace_net_hdr *bnh)
1115 {
1116         int fl = fcntl(net_in_fd, F_GETFL);
1117         int bytes_left, ret;
1118         void *p = bnh;
1119
1120         fcntl(net_in_fd, F_SETFL, fl | O_NONBLOCK);
1121         bytes_left = sizeof(*bnh);
1122         while (bytes_left && !is_done()) {
1123                 ret = recv(net_in_fd, p, bytes_left, MSG_WAITALL);
1124                 if (ret < 0) {
1125                         if (errno != EAGAIN) {
1126                                 perror("recv header");
1127                                 return 1;
1128                         }
1129                         usleep(100);
1130                         continue;
1131                 } else if (!ret) {
1132                         usleep(100);
1133                         continue;
1134                 } else {
1135                         p += ret;
1136                         bytes_left -= ret;
1137                 }
1138         }
1139         fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
1140         return 0;
1141 }
1142
1143 static int net_server_loop(void)
1144 {
1145         struct thread_information *tip;
1146         struct blktrace_net_hdr bnh;
1147
1148         if (net_get_header(&bnh))
1149                 return 1;
1150
1151         if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
1152                 fprintf(stderr, "server: received data is bad\n");
1153                 return 1;
1154         }
1155
1156         if (!data_is_native) {
1157                 bnh.cpu = be32_to_cpu(bnh.cpu);
1158                 bnh.len = be32_to_cpu(bnh.len);
1159         }
1160
1161         tip = net_get_tip(&bnh);
1162         if (!tip)
1163                 return 1;
1164
1165         if (mmap_subbuf(tip, bnh.len))
1166                 return 1;
1167
1168         return 0;
1169 }
1170
1171 /*
1172  * Start here when we are in server mode - just fetch data from the network
1173  * and dump to files
1174  */
1175 static int net_server(void)
1176 {
1177         struct sockaddr_in addr;
1178         socklen_t socklen;
1179         int fd, opt, i;
1180
1181         fd = socket(AF_INET, SOCK_STREAM, 0);
1182         if (fd < 0) {
1183                 perror("server: socket");
1184                 return 1;
1185         }
1186
1187         opt = 1;
1188         if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
1189                 perror("setsockopt");
1190                 return 1;
1191         }
1192
1193         memset(&addr, 0, sizeof(addr));
1194         addr.sin_family = AF_INET;
1195         addr.sin_addr.s_addr = htonl(INADDR_ANY);
1196         addr.sin_port = htons(net_port);
1197
1198         if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1199                 perror("bind");
1200                 return 1;
1201         }
1202
1203         if (listen(fd, 1) < 0) {
1204                 perror("listen");
1205                 return 1;
1206         }
1207
1208         printf("blktrace: waiting for incoming connection...\n");
1209
1210         socklen = sizeof(addr);
1211         net_in_fd = accept(fd, (struct sockaddr *) &addr, &socklen);
1212         if (net_in_fd < 0) {
1213                 perror("accept");
1214                 return 1;
1215         }
1216
1217         signal(SIGINT, handle_sigint);
1218         signal(SIGHUP, handle_sigint);
1219         signal(SIGTERM, handle_sigint);
1220         signal(SIGALRM, handle_sigint);
1221
1222         printf("blktrace: connected!\n");
1223
1224         while (!is_done()) {
1225                 if (net_server_loop())
1226                         break;
1227         }
1228
1229         for (i = 0; i < net_tip_len; i++)
1230                 tip_ftrunc_final(&net_tip_list[i]);
1231
1232         return 0;
1233 }
1234
1235 /*
1236  * Setup outgoing network connection where we will transmit data
1237  */
1238 static int net_setup_client(void)
1239 {
1240         struct sockaddr_in addr;
1241         int fd;
1242
1243         fd = socket(AF_INET, SOCK_STREAM, 0);
1244         if (fd < 0) {
1245                 perror("client: socket");
1246                 return 1;
1247         }
1248
1249         memset(&addr, 0, sizeof(addr));
1250         addr.sin_family = AF_INET;
1251         addr.sin_port = htons(net_port);
1252
1253         if (inet_aton(hostname, &addr.sin_addr) != 1) {
1254                 struct hostent *hent = gethostbyname(hostname);
1255                 if (!hent) {
1256                         perror("gethostbyname");
1257                         return 1;
1258                 }
1259
1260                 memcpy(&addr.sin_addr, hent->h_addr, 4);
1261                 strcpy(hostname, hent->h_name);
1262         }
1263
1264         printf("blktrace: connecting to %s\n", hostname);
1265
1266         if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1267                 perror("client: connect");
1268                 return 1;
1269         }
1270
1271         printf("blktrace: connected!\n");
1272         net_out_fd = fd;
1273         return 0;
1274 }
1275
/*
 * Option summary printed by show_usage(). Keep this in sync with the
 * option string and the switch in main().
 */
static char usage_str[] =
        "-d <dev> [ -r relay path ] [ -o <output> ] [ -D <dir> ] [ -k ] [ -w time ]\n"
        "[ -a action ] [ -A action mask ] [ -b size ] [ -n number ]\n"
        "[ -l ] [ -h <host> ] [ -p <port> ] [ -V ]\n\n"
        "\t-d Use specified device. May also be given last after options\n"
        "\t-r Path to mounted relayfs, defaults to /relay\n"
        "\t-o File(s) to send output to\n"
        "\t-D Directory to prepend to output file names\n"
        "\t-k Kill a running trace\n"
        "\t-w Stop after defined time, in seconds\n"
        "\t-a Only trace specified actions. See documentation\n"
        "\t-A Give trace mask as a single value. See documentation\n"
        "\t-b Sub buffer size in KiB\n"
        "\t-n Number of sub buffers\n"
        "\t-l Run in network listen mode (blktrace server)\n"
        "\t-h Run in network client mode, sending data to the given host\n"
        "\t-p Network port to use\n"
        "\t-V Print program version info\n\n";
1290
1291 static void show_usage(char *program)
1292 {
1293         fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
1294 }
1295
1296 int main(int argc, char *argv[])
1297 {
1298         static char default_relay_path[] = "/relay";
1299         struct statfs st;
1300         int i, c;
1301         int stop_watch = 0;
1302         int act_mask_tmp = 0;
1303
1304         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
1305                 switch (c) {
1306                 case 'a':
1307                         i = find_mask_map(optarg);
1308                         if (i < 0) {
1309                                 fprintf(stderr,"Invalid action mask %s\n",
1310                                         optarg);
1311                                 return 1;
1312                         }
1313                         act_mask_tmp |= i;
1314                         break;
1315
1316                 case 'A':
1317                         if ((sscanf(optarg, "%x", &i) != 1) || 
1318                                                         !valid_act_opt(i)) {
1319                                 fprintf(stderr,
1320                                         "Invalid set action mask %s/0x%x\n",
1321                                         optarg, i);
1322                                 return 1;
1323                         }
1324                         act_mask_tmp = i;
1325                         break;
1326
1327                 case 'd':
1328                         if (resize_devices(optarg) != 0)
1329                                 return 1;
1330                         break;
1331
1332                 case 'r':
1333                         relay_path = optarg;
1334                         break;
1335
1336                 case 'o':
1337                         output_name = optarg;
1338                         break;
1339                 case 'k':
1340                         kill_running_trace = 1;
1341                         break;
1342                 case 'w':
1343                         stop_watch = atoi(optarg);
1344                         if (stop_watch <= 0) {
1345                                 fprintf(stderr,
1346                                         "Invalid stopwatch value (%d secs)\n",
1347                                         stop_watch);
1348                                 return 1;
1349                         }
1350                         break;
1351                 case 'V':
1352                         printf("%s version %s\n", argv[0], blktrace_version);
1353                         return 0;
1354                 case 'b':
1355                         buf_size = strtoul(optarg, NULL, 10);
1356                         if (buf_size <= 0 || buf_size > 16*1024) {
1357                                 fprintf(stderr,
1358                                         "Invalid buffer size (%lu)\n",buf_size);
1359                                 return 1;
1360                         }
1361                         buf_size <<= 10;
1362                         break;
1363                 case 'n':
1364                         buf_nr = strtoul(optarg, NULL, 10);
1365                         if (buf_nr <= 0) {
1366                                 fprintf(stderr,
1367                                         "Invalid buffer nr (%lu)\n", buf_nr);
1368                                 return 1;
1369                         }
1370                         break;
1371                 case 'D':
1372                         output_dir = optarg;
1373                         break;
1374                 case 'h':
1375                         net_mode = Net_client;
1376                         strcpy(hostname, optarg);
1377                         break;
1378                 case 'l':
1379                         net_mode = Net_server;
1380                         break;
1381                 case 'p':
1382                         net_port = atoi(optarg);
1383                         break;
1384                 default:
1385                         show_usage(argv[0]);
1386                         return 1;
1387                 }
1388         }
1389
1390         while (optind < argc) {
1391                 if (resize_devices(argv[optind++]) != 0)
1392                         return 1;
1393         }
1394
1395         setlocale(LC_NUMERIC, "en_US");
1396
1397         page_size = getpagesize();
1398
1399         if (net_mode == Net_server)
1400                 return net_server();
1401
1402         if (ndevs == 0) {
1403                 show_usage(argv[0]);
1404                 return 1;
1405         }
1406
1407         if (!relay_path)
1408                 relay_path = default_relay_path;
1409
1410         if (act_mask_tmp != 0)
1411                 act_mask = act_mask_tmp;
1412
1413         if (statfs(relay_path, &st) < 0) {
1414                 perror("statfs");
1415                 fprintf(stderr,"%s does not appear to be a valid path\n",
1416                         relay_path);
1417                 return 1;
1418         } else if (st.f_type != (long) RELAYFS_TYPE) {
1419                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
1420                         relay_path);
1421                 return 1;
1422         }
1423
1424         if (open_devices() != 0)
1425                 return 1;
1426
1427         if (kill_running_trace) {
1428                 stop_all_traces();
1429                 return 0;
1430         }
1431
1432         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
1433         if (ncpus < 0) {
1434                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
1435                 return 1;
1436         }
1437
1438         signal(SIGINT, handle_sigint);
1439         signal(SIGHUP, handle_sigint);
1440         signal(SIGTERM, handle_sigint);
1441         signal(SIGALRM, handle_sigint);
1442
1443         if (net_mode == Net_client && net_setup_client())
1444                 return 1;
1445
1446         if (start_devices() != 0)
1447                 return 1;
1448
1449         atexit(stop_all_tracing);
1450
1451         if (stop_watch)
1452                 alarm(stop_watch);
1453
1454         wait_for_threads();
1455
1456         if (!is_trace_stopped()) {
1457                 trace_stopped = 1;
1458                 stop_all_threads();
1459                 stop_all_traces();
1460         }
1461
1462         show_stats();
1463
1464         return 0;
1465 }
1466