[PATCH] blktrace: don't truncate/open local files when in net client mode
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <sys/poll.h>
33 #include <sys/mman.h>
34 #include <sys/socket.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <sched.h>
38 #include <ctype.h>
39 #include <getopt.h>
40 #include <errno.h>
41 #include <netinet/in.h>
42 #include <arpa/inet.h>
43 #include <netdb.h>
44 #include <sys/sendfile.h>
45
46 #include "blktrace.h"
47 #include "barrier.h"
48
/* version string of this tool */
static char blktrace_version[] = "0.99";

/*
 * Default relay sub-buffer size (BUF_SIZE, bytes) and sub-buffer count
 * (BUF_NR); these seed buf_size/buf_nr, which start_trace() passes to the
 * kernel in BLKTRACESETUP.
 *
 * You may want to increase this even more, if you are logging at a high
 * rate and see skipped/missed events
 */
#define BUF_SIZE        (512 * 1024)
#define BUF_NR          (4)

/* size of the stdio buffer installed with setvbuf() for regular file output */
#define OFILE_BUF       (128 * 1024)

/* NOTE(review): presumably the relayfs statfs magic; not referenced in this part of the file */
#define RELAYFS_TYPE    0xF0B4A981
61
/*
 * Short option letters; a trailing ':' marks an option that takes an
 * argument. Every letter has a long-name counterpart in l_opts below.
 */
#define S_OPTS  "d:a:A:r:o:kw:Vb:n:D:lh:p:s"

/*
 * Long option table. Each row is { name, has_arg, flag, val } in the
 * member order of struct option; flag is always NULL, so val (the
 * matching short option letter) is what identifies the option. The
 * table ends with an all-zero entry.
 */
static struct option l_opts[] = {
        { "dev",             required_argument, NULL, 'd' },
        { "act-mask",        required_argument, NULL, 'a' },
        { "set-mask",        required_argument, NULL, 'A' },
        { "relay",           required_argument, NULL, 'r' },
        { "output",          required_argument, NULL, 'o' },
        { "kill",            no_argument,       NULL, 'k' },
        { "stopwatch",       required_argument, NULL, 'w' },
        { "version",         no_argument,       NULL, 'V' },
        { "buffer-size",     required_argument, NULL, 'b' },
        { "num-sub-buffers", required_argument, NULL, 'n' },
        { "output-dir",      required_argument, NULL, 'D' },
        { "listen",          no_argument,       NULL, 'l' },
        { "host",            required_argument, NULL, 'h' },
        { "port",            required_argument, NULL, 'p' },
        { "sendfile",        no_argument,       NULL, 's' },
        { NULL,              0,                 NULL, 0   }
};
158
/*
 * One chunk of trace data passed from a reader thread to the flush side
 * through the per-thread fifo.
 */
struct tip_subbuf {
        /* data buffer; NULL in the sendfile path, where only a file range is queued */
        void *buf;
        /* number of valid bytes in this chunk */
        unsigned int len;
        /* capacity of buf; grown when leftover data is merged with a new chunk */
        unsigned int max_len;
        /* start offset in the relay trace file, used by the sendfile path */
        off_t offset;
};
165
/*
 * FIFO_SIZE must stay a power of two: the queue/dequeue helpers wrap
 * indices with "& (FIFO_SIZE - 1)".
 */
#define FIFO_SIZE       (1024)  /* should be plenty big! */
#define CL_SIZE         (128)   /* cache line, any bigger? */

/*
 * Ring of sub-buffer pointers. subbuf_fifo_queue() advances tail and
 * subbuf_fifo_dequeue() advances head; the ring is empty when
 * head == tail and full when tail + 1 == head, so one slot stays unused.
 * NOTE(review): head and tail are aligned to CL_SIZE, presumably to keep
 * them on separate cache lines -- confirm.
 */
struct tip_subbuf_fifo {
        int tail __attribute__((aligned(CL_SIZE)));
        int head __attribute__((aligned(CL_SIZE)));
        struct tip_subbuf *q[FIFO_SIZE];
};
174
/*
 * Per-device, per-CPU reader state. One entry exists for every
 * (device, cpu) pair; start_threads() fills it in and runs thread_main()
 * on it.
 */
struct thread_information {
        /* CPU index this entry serves; thread_main() pins itself to it */
        int cpu;
        pthread_t thread;

        /* relay trace file for this cpu, opened in thread_main() */
        int fd;
        /* released in close_thread(); NOTE(review): its users are outside this part of the file */
        void *fd_buf;
        /* path of the relay trace file behind fd */
        char fn[MAXPATHLEN + 64];

        /* output stream; NULL in network client mode */
        FILE *ofile;
        /* buffer handed to setvbuf() for ofile */
        char *ofile_buffer;
        /* how much of the relay file has already been queued for sendfile */
        off_t ofile_offset;
        /* set when ofile is stdout */
        int ofile_stdout;
        /* set when the output file is filled through mmap_subbuf() */
        int ofile_mmap;

        /* per-mode operations, selected in fill_ops() */
        int (*get_subbuf)(struct thread_information *, unsigned int);
        int (*flush_subbuf)(struct thread_information *, struct tip_subbuf *);
        int (*read_data)(struct thread_information *, void *, unsigned int);

        unsigned long events_processed;
        unsigned long long data_read;
        /* device this thread belongs to */
        struct device_information *device;

        /* set to 1 by thread_main() just before it returns */
        int exited;

        /*
         * piped fifo buffers
         */
        struct tip_subbuf_fifo fifo;
        /* partial trace left over from the previous flush_subbuf_file() call */
        struct tip_subbuf *leftover_ts;

        /*
         * mmap controlled output files
         */
        /* bytes of trace data stored so far; final size of the file */
        unsigned long long fs_size;
        /* size the file has currently been extended to with ftruncate() */
        unsigned long long fs_max_size;
        /* write position inside the current mapping */
        unsigned long fs_off;
        /* current mapping of the output file and its length */
        void *fs_buf;
        unsigned long fs_buf_len;
};
214
/*
 * State for one traced block device.
 */
struct device_information {
        /* descriptor of the device node, opened in open_devices() */
        int fd;
        /* device path as given to resize_devices() */
        char *path;
        /* trace name returned by the kernel from BLKTRACESETUP */
        char buts_name[32];
        /* non-zero while a trace is running; access through dip_tracing()/dip_set_tracing() */
        volatile int trace_started;
        /* result of get_dropped_count(), stored by stop_all_traces() */
        unsigned long drop_count;
        /* first of this device's ncpus entries in the thread_information array */
        struct thread_information *threads;
};
223
/* number of per-device reader threads (one per cpu index 0..ncpus-1) */
static int ncpus;
/* ncpus * ndevs entries, allocated in start_devices() */
static struct thread_information *thread_information;
/* number of entries in device_information, grown by resize_devices() */
static int ndevs;
static struct device_information *device_information;

/* command line option globals */
/* root under which "<relay_path>/block/<name>/..." relay files are looked up */
static char *relay_path;
/* output base name; "-" selects piping to stdout */
static char *output_name;
/* optional directory prepended to output file names */
static char *output_dir;
/* action mask passed to the kernel in BLKTRACESETUP */
static int act_mask = ~0U;
/* when set, stop_trace() tears a trace down even if this process did not start it */
static int kill_running_trace;
static unsigned long buf_size = BUF_SIZE;
static unsigned long buf_nr = BUF_NR;
/* used to page-align mmap offsets in mmap_subbuf(); set outside this part of the file */
static unsigned int page_size;

/* set by handle_sigint(); polled by the reader and flush loops */
#define is_done()       (*(volatile int *)(&done))
static volatile int done;

/* guards exit_trace() so threads and traces are stopped only once */
#define is_trace_stopped()      (*(volatile int *)(&trace_stopped))
static volatile int trace_stopped;

/* NOTE(review): not referenced in this part of the file */
#define is_stat_shown() (*(volatile int *)(&stat_shown))
static volatile int stat_shown;

/* NOTE(review): non-static and not referenced here; presumably consumed by shared trace helpers -- confirm */
int data_is_native = -1;
249
/* forward declaration: thread_main() calls this before the definition appears */
static void exit_trace(int status);

/* volatile read / plain write of a device's trace_started flag */
#define dip_tracing(dip)        (*(volatile int *)(&(dip)->trace_started))
#define dip_set_tracing(dip, v) ((dip)->trace_started = (v))

/* iterate __d over the first __e entries of device_information, __i is the index */
#define __for_each_dip(__d, __i, __e)   \
        for (__i = 0, __d = device_information; __i < __e; __i++, __d++)

/* iterate over all ndevs devices, and over the ncpus threads of device __d */
#define for_each_dip(__d, __i)  __for_each_dip(__d, __i, ndevs)
#define for_each_tip(__d, __t, __j)     \
        for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
261
/*
 * Networking stuff follows. Each block of trace data sent over the
 * network is preceded by this header. We include a magic number so the
 * receiver knows whether to endianness convert or not. A header with
 * len == 0 signals end-of-run (see net_client_send_close()).
 */
struct blktrace_net_hdr {
        u32 magic;              /* same as trace magic */
        char buts_name[32];     /* trace name */
        u32 cpu;                /* for which cpu */
        u32 max_cpus;
        u32 len;                /* length of following trace data */
};
273
/* default value of net_port */
#define TRACE_NET_PORT          (8462)

/* values for net_mode */
enum {
        Net_none = 0,
        Net_server,
        Net_client,
};

/*
 * network cmd line params
 */
static char hostname[MAXHOSTNAMELEN];
static int net_port = TRACE_NET_PORT;
/* one of the Net_* values above */
static int net_mode = 0;
/* in client mode, selects the sendfile ops in fill_ops() */
static int net_sendfile;

/* socket read_data_net() receives from */
static int net_in_fd = -1;
/* socket that headers and trace data are sent to */
static int net_out_fd = -1;
292
293 static void handle_sigint(__attribute__((__unused__)) int sig)
294 {
295         done = 1;
296 }
297
298 static int get_dropped_count(const char *buts_name)
299 {
300         int fd;
301         char tmp[MAXPATHLEN + 64];
302
303         snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
304                  relay_path, buts_name);
305
306         fd = open(tmp, O_RDONLY);
307         if (fd < 0) {
308                 /*
309                  * this may be ok, if the kernel doesn't support dropped counts
310                  */
311                 if (errno == ENOENT)
312                         return 0;
313
314                 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
315                 return -1;
316         }
317
318         if (read(fd, tmp, sizeof(tmp)) < 0) {
319                 perror(tmp);
320                 close(fd);
321                 return -1;
322         }
323
324         close(fd);
325
326         return atoi(tmp);
327 }
328
329 static int start_trace(struct device_information *dip)
330 {
331         struct blk_user_trace_setup buts;
332
333         memset(&buts, 0, sizeof(buts));
334         buts.buf_size = buf_size;
335         buts.buf_nr = buf_nr;
336         buts.act_mask = act_mask;
337
338         if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
339                 perror("BLKTRACESETUP");
340                 return 1;
341         }
342
343         if (ioctl(dip->fd, BLKTRACESTART) < 0) {
344                 perror("BLKTRACESTART");
345                 return 1;
346         }
347
348         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
349         dip_set_tracing(dip, 1);
350         return 0;
351 }
352
353 static void stop_trace(struct device_information *dip)
354 {
355         if (dip_tracing(dip) || kill_running_trace) {
356                 dip_set_tracing(dip, 0);
357
358                 if (ioctl(dip->fd, BLKTRACESTOP) < 0)
359                         perror("BLKTRACESTOP");
360                 if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
361                         perror("BLKTRACETEARDOWN");
362
363                 close(dip->fd);
364                 dip->fd = -1;
365         }
366 }
367
368 static void stop_all_traces(void)
369 {
370         struct device_information *dip;
371         int i;
372
373         for_each_dip(dip, i) {
374                 dip->drop_count = get_dropped_count(dip->buts_name);
375                 stop_trace(dip);
376         }
377 }
378
379 static void wait_for_data(struct thread_information *tip)
380 {
381         struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
382
383         do {
384                 poll(&pfd, 1, 100);
385                 if (pfd.revents & POLLIN)
386                         break;
387                 if (tip->ofile_stdout)
388                         break;
389         } while (!is_done());
390 }
391
392 static int read_data_file(struct thread_information *tip, void *buf,
393                           unsigned int len)
394 {
395         int ret = 0;
396
397         do {
398                 wait_for_data(tip);
399
400                 ret = read(tip->fd, buf, len);
401                 if (!ret)
402                         continue;
403                 else if (ret > 0)
404                         return ret;
405                 else {
406                         if (errno != EAGAIN) {
407                                 perror(tip->fn);
408                                 fprintf(stderr,"Thread %d failed read of %s\n",
409                                         tip->cpu, tip->fn);
410                                 break;
411                         }
412                         continue;
413                 }
414         } while (!is_done());
415
416         return ret;
417
418 }
419
420 static int read_data_net(struct thread_information *tip, void *buf,
421                          unsigned int len)
422 {
423         unsigned int bytes_left = len;
424         int ret = 0;
425
426         do {
427                 ret = recv(net_in_fd, buf, bytes_left, MSG_WAITALL);
428
429                 if (!ret)
430                         continue;
431                 else if (ret < 0) {
432                         if (errno != EAGAIN) {
433                                 perror(tip->fn);
434                                 fprintf(stderr, "server: failed read\n");
435                                 return 0;
436                         }
437                         continue;
438                 } else {
439                         buf += ret;
440                         bytes_left -= ret;
441                 }
442         } while (!is_done() && bytes_left);
443
444         return len - bytes_left;
445 }
446
447 static int read_data(struct thread_information *tip, void *buf,
448                      unsigned int len)
449 {
450         int ret = tip->read_data(tip, buf, len);
451
452         if (ret > 0)
453                 tip->data_read += ret;
454
455         return ret;
456 }
457
458 static inline struct tip_subbuf *
459 subbuf_fifo_dequeue(struct thread_information *tip)
460 {
461         const int head = tip->fifo.head;
462         const int next = (head + 1) & (FIFO_SIZE - 1);
463
464         if (head != tip->fifo.tail) {
465                 struct tip_subbuf *ts = tip->fifo.q[head];
466
467                 store_barrier();
468                 tip->fifo.head = next;
469                 return ts;
470         }
471
472         return NULL;
473 }
474
475 static inline int subbuf_fifo_queue(struct thread_information *tip,
476                                     struct tip_subbuf *ts)
477 {
478         const int tail = tip->fifo.tail;
479         const int next = (tail + 1) & (FIFO_SIZE - 1);
480
481         if (next != tip->fifo.head) {
482                 tip->fifo.q[tail] = ts;
483                 store_barrier();
484                 tip->fifo.tail = next;
485                 return 0;
486         }
487
488         fprintf(stderr, "fifo too small!\n");
489         return 1;
490 }
491
492 /*
493  * For file output, truncate and mmap the file appropriately
494  */
495 static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
496 {
497         int ofd = fileno(tip->ofile);
498         int ret;
499
500         /*
501          * extend file, if we have to. use chunks of 16 subbuffers.
502          */
503         if (tip->fs_off + buf_size > tip->fs_buf_len) {
504                 if (tip->fs_buf) {
505                         munlock(tip->fs_buf, tip->fs_buf_len);
506                         munmap(tip->fs_buf, tip->fs_buf_len);
507                         tip->fs_buf = NULL;
508                 }
509
510                 tip->fs_off = tip->fs_size & (page_size - 1);
511                 tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
512                 tip->fs_max_size += tip->fs_buf_len;
513
514                 if (ftruncate(ofd, tip->fs_max_size) < 0) {
515                         perror("ftruncate");
516                         return -1;
517                 }
518
519                 tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
520                                    MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
521                 if (tip->fs_buf == MAP_FAILED) {
522                         perror("mmap");
523                         return -1;
524                 }
525                 mlock(tip->fs_buf, tip->fs_buf_len);
526         }
527
528         ret = read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
529         if (ret >= 0) {
530                 tip->fs_size += ret;
531                 tip->fs_off += ret;
532                 return 0;
533         }
534
535         return -1;
536 }
537
538 static int get_subbuf_sendfile(struct thread_information *tip,
539                                unsigned int maxlen)
540 {
541         struct tip_subbuf *ts = malloc(sizeof(*ts));
542         struct stat sb;
543
544         ts->max_len = maxlen;
545         ts->buf = NULL;
546
547         if (fstat(tip->fd, &sb) < 0) {
548                 perror("trace stat");
549                 return 1;
550         }
551
552         ts->len = sb.st_size - tip->ofile_offset;
553         ts->max_len = ts->len;
554         ts->offset = tip->ofile_offset;
555         tip->ofile_offset += ts->len;
556         return subbuf_fifo_queue(tip, ts);
557 }
558
559 /*
560  * Use the copy approach for pipes and network
561  */
562 static int get_subbuf(struct thread_information *tip, unsigned int maxlen)
563 {
564         struct tip_subbuf *ts = malloc(sizeof(*ts));
565         int ret;
566
567         ts->buf = malloc(buf_size);
568         ts->max_len = maxlen;
569
570         ret = read_data(tip, ts->buf, ts->max_len);
571         if (ret > 0) {
572                 ts->len = ret;
573                 return subbuf_fifo_queue(tip, ts);
574         }
575
576         return ret;
577 }
578
579 static void close_thread(struct thread_information *tip)
580 {
581         if (tip->fd != -1)
582                 close(tip->fd);
583         if (tip->ofile)
584                 fclose(tip->ofile);
585         if (tip->ofile_buffer)
586                 free(tip->ofile_buffer);
587         if (tip->fd_buf)
588                 free(tip->fd_buf);
589
590         tip->fd = -1;
591         tip->ofile = NULL;
592         tip->ofile_buffer = NULL;
593         tip->fd_buf = NULL;
594 }
595
596 static void tip_ftrunc_final(struct thread_information *tip)
597 {
598         /*
599          * truncate to right size and cleanup mmap
600          */
601         if (tip->ofile_mmap) {
602                 int ofd = fileno(tip->ofile);
603
604                 if (tip->fs_buf)
605                         munmap(tip->fs_buf, tip->fs_buf_len);
606
607                 ftruncate(ofd, tip->fs_size);
608         }
609 }
610
611 static void *thread_main(void *arg)
612 {
613         struct thread_information *tip = arg;
614         pid_t pid = getpid();
615         cpu_set_t cpu_mask;
616
617         CPU_ZERO(&cpu_mask);
618         CPU_SET((tip->cpu), &cpu_mask);
619
620         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
621                 perror("sched_setaffinity");
622                 exit_trace(1);
623         }
624
625         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
626                         relay_path, tip->device->buts_name, tip->cpu);
627         tip->fd = open(tip->fn, O_RDONLY);
628         if (tip->fd < 0) {
629                 perror(tip->fn);
630                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
631                         tip->fn);
632                 exit_trace(1);
633         }
634
635         while (!is_done()) {
636                 if (tip->get_subbuf(tip, buf_size))
637                         break;
638         }
639
640         tip_ftrunc_final(tip);
641         tip->exited = 1;
642         return NULL;
643 }
644
/*
 * Send exactly buf_len bytes from buf on socket fd, looping over short
 * sends. Returns 0 once everything is sent, 1 if send() fails.
 */
static int write_data_net(int fd, void *buf, unsigned int buf_len)
{
        char *cursor = buf;
        unsigned int remaining;
        int sent;

        for (remaining = buf_len; remaining; remaining -= sent) {
                sent = send(fd, cursor, remaining, 0);
                if (sent < 0) {
                        perror("send");
                        return 1;
                }

                cursor += sent;
        }

        return 0;
}
663
664 static int net_send_header(struct thread_information *tip, unsigned int len)
665 {
666         struct blktrace_net_hdr hdr;
667
668         hdr.magic = BLK_IO_TRACE_MAGIC;
669         strcpy(hdr.buts_name, tip->device->buts_name);
670         hdr.cpu = tip->cpu;
671         hdr.max_cpus = ncpus;
672         hdr.len = len;
673
674         return write_data_net(net_out_fd, &hdr, sizeof(hdr));
675 }
676
677 /*
678  * send header with 0 length to signal end-of-run
679  */
680 static void net_client_send_close(void)
681 {
682         struct blktrace_net_hdr hdr;
683
684         hdr.magic = BLK_IO_TRACE_MAGIC;
685         hdr.cpu = 0;
686         hdr.max_cpus = ncpus;
687         hdr.len = 0;
688
689         write_data_net(net_out_fd, &hdr, sizeof(hdr));
690 }
691
692 static int flush_subbuf_net(struct thread_information *tip,
693                             struct tip_subbuf *ts)
694 {
695         if (net_send_header(tip, ts->len))
696                 return 1;
697         if (write_data_net(net_out_fd, ts->buf, ts->len))
698                 return 1;
699
700         free(ts->buf);
701         free(ts);
702         return 0;
703 }
704
705 static int flush_subbuf_sendfile(struct thread_information *tip,
706                                  struct tip_subbuf *ts)
707 {
708         if (net_send_header(tip, ts->len))
709                 return 1;
710         if (sendfile(net_out_fd, tip->fd, &ts->offset, ts->len) < 0) {
711                 perror("sendfile");
712                 return 1;
713         }
714
715         free(ts);
716         return 0;
717 }
718
719 static int write_data(struct thread_information *tip, void *buf,
720                       unsigned int buf_len)
721 {
722         int ret;
723
724         if (!buf_len)
725                 return 0;
726
727         while (1) {
728                 ret = fwrite(buf, buf_len, 1, tip->ofile);
729                 if (ret == 1)
730                         break;
731
732                 if (ret < 0) {
733                         perror("write");
734                         return 1;
735                 }
736         }
737
738         if (tip->ofile_stdout)
739                 fflush(tip->ofile);
740
741         return 0;
742 }
743
744 static int flush_subbuf_file(struct thread_information *tip,
745                              struct tip_subbuf *ts)
746 {
747         unsigned int offset = 0;
748         struct blk_io_trace *t;
749         int pdu_len, events = 0;
750
751         /*
752          * surplus from last run
753          */
754         if (tip->leftover_ts) {
755                 struct tip_subbuf *prev_ts = tip->leftover_ts;
756
757                 if (prev_ts->len + ts->len > prev_ts->max_len) {
758                         prev_ts->max_len += ts->len;
759                         prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
760                 }
761
762                 memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
763                 prev_ts->len += ts->len;
764
765                 free(ts->buf);
766                 free(ts);
767
768                 ts = prev_ts;
769                 tip->leftover_ts = NULL;
770         }
771
772         while (offset + sizeof(*t) <= ts->len) {
773                 t = ts->buf + offset;
774
775                 if (verify_trace(t)) {
776                         write_data(tip, ts->buf, offset);
777                         return -1;
778                 }
779
780                 pdu_len = t->pdu_len;
781
782                 if (offset + sizeof(*t) + pdu_len > ts->len)
783                         break;
784
785                 offset += sizeof(*t) + pdu_len;
786                 tip->events_processed++;
787                 tip->data_read += sizeof(*t) + pdu_len;
788                 events++;
789         }
790
791         if (write_data(tip, ts->buf, offset))
792                 return -1;
793
794         /*
795          * leftover bytes, save them for next time
796          */
797         if (offset != ts->len) {
798                 tip->leftover_ts = ts;
799                 ts->len -= offset;
800                 memmove(ts->buf, ts->buf + offset, ts->len);
801         } else {
802                 free(ts->buf);
803                 free(ts);
804         }
805
806         return events;
807 }
808
809 static int write_tip_events(struct thread_information *tip)
810 {
811         struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
812
813         if (ts)
814                 return tip->flush_subbuf(tip, ts);
815
816         return 0;
817 }
818
819 /*
820  * scans the tips we know and writes out the subbuffers we accumulate
821  */
822 static void get_and_write_events(void)
823 {
824         struct device_information *dip;
825         struct thread_information *tip;
826         int i, j, events, ret, tips_running;
827
828         while (!is_done()) {
829                 events = 0;
830
831                 for_each_dip(dip, i) {
832                         for_each_tip(dip, tip, j) {
833                                 ret = write_tip_events(tip);
834                                 if (ret > 0)
835                                         events += ret;
836                         }
837                 }
838
839                 if (!events)
840                         usleep(10);
841         }
842
843         /*
844          * reap stored events
845          */
846         do {
847                 events = 0;
848                 tips_running = 0;
849                 for_each_dip(dip, i) {
850                         for_each_tip(dip, tip, j) {
851                                 ret = write_tip_events(tip);
852                                 if (ret > 0)
853                                         events += ret;
854                                 tips_running += !tip->exited;
855                         }
856                 }
857                 usleep(10);
858         } while (events || tips_running);
859 }
860
861 static void wait_for_threads(void)
862 {
863         /*
864          * for piped or network output, poll and fetch data for writeout.
865          * for files, we just wait around for trace threads to exit
866          */
867         if ((output_name && !strcmp(output_name, "-")) ||
868             net_mode == Net_client)
869                 get_and_write_events();
870         else {
871                 struct device_information *dip;
872                 struct thread_information *tip;
873                 int i, j, tips_running;
874
875                 do {
876                         tips_running = 0;
877                         usleep(1000);
878
879                         for_each_dip(dip, i)
880                                 for_each_tip(dip, tip, j)
881                                         tips_running += !tip->exited;
882                 } while (tips_running);
883         }
884
885         if (net_mode == Net_client)
886                 net_client_send_close();
887 }
888
889 static void fill_ofname(char *dst, char *buts_name, int cpu)
890 {
891         int len = 0;
892
893         if (output_dir)
894                 len = sprintf(dst, "%s/", output_dir);
895
896         if (output_name)
897                 sprintf(dst + len, "%s.blktrace.%d", output_name, cpu);
898         else
899                 sprintf(dst + len, "%s.blktrace.%d", buts_name, cpu);
900 }
901
902 static void fill_ops(struct thread_information *tip)
903 {
904         /*
905          * setup ops
906          */
907         if (net_mode == Net_client) {
908                 if (net_sendfile) {
909                         tip->get_subbuf = get_subbuf_sendfile;
910                         tip->flush_subbuf = flush_subbuf_sendfile;
911                 } else {
912                         tip->get_subbuf = get_subbuf;
913                         tip->flush_subbuf = flush_subbuf_net;
914                 }
915         } else {
916                 if (tip->ofile_mmap)
917                         tip->get_subbuf = mmap_subbuf;
918                 else
919                         tip->get_subbuf = get_subbuf;
920
921                 tip->flush_subbuf = flush_subbuf_file;
922         }
923                         
924         if (net_mode == Net_server)
925                 tip->read_data = read_data_net;
926         else
927                 tip->read_data = read_data_file;
928 }
929
930 static int tip_open_output(struct device_information *dip,
931                            struct thread_information *tip)
932 {
933         int pipeline = output_name && !strcmp(output_name, "-");
934         int mode, vbuf_size;
935         char op[64];
936
937         if (net_mode == Net_client) {
938                 tip->ofile = NULL;
939                 tip->ofile_stdout = 0;
940                 tip->ofile_mmap = 0;
941                 vbuf_size = 0;
942         } else if (pipeline) {
943                 tip->ofile = fdopen(STDOUT_FILENO, "w");
944                 tip->ofile_stdout = 1;
945                 tip->ofile_mmap = 0;
946                 mode = _IOLBF;
947                 vbuf_size = 512;
948         } else {
949                 fill_ofname(op, dip->buts_name, tip->cpu);
950                 tip->ofile = fopen(op, "w+");
951                 tip->ofile_stdout = 0;
952                 tip->ofile_mmap = 1;
953                 mode = _IOFBF;
954                 vbuf_size = OFILE_BUF;
955         }
956
957         if (net_mode != Net_client && tip->ofile == NULL) {
958                 perror(op);
959                 return 1;
960         }
961
962         if (vbuf_size) {
963                 tip->ofile_buffer = malloc(vbuf_size);
964                 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
965                         perror("setvbuf");
966                         close_thread(tip);
967                         return 1;
968                 }
969         }
970
971         fill_ops(tip);
972         return 0;
973 }
974
975 static int start_threads(struct device_information *dip)
976 {
977         struct thread_information *tip;
978         int j;
979
980         for_each_tip(dip, tip, j) {
981                 tip->cpu = j;
982                 tip->device = dip;
983                 tip->events_processed = 0;
984                 memset(&tip->fifo, 0, sizeof(tip->fifo));
985                 tip->leftover_ts = NULL;
986
987                 if (tip_open_output(dip, tip))
988                         return 1;
989
990                 if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
991                         perror("pthread_create");
992                         close_thread(tip);
993                         return 1;
994                 }
995         }
996
997         return 0;
998 }
999
1000 static void stop_threads(struct device_information *dip)
1001 {
1002         struct thread_information *tip;
1003         unsigned long ret;
1004         int i;
1005
1006         for_each_tip(dip, tip, i) {
1007                 (void) pthread_join(tip->thread, (void *) &ret);
1008                 close_thread(tip);
1009         }
1010 }
1011
1012 static void stop_all_threads(void)
1013 {
1014         struct device_information *dip;
1015         int i;
1016
1017         for_each_dip(dip, i)
1018                 stop_threads(dip);
1019 }
1020
1021 static void stop_all_tracing(void)
1022 {
1023         struct device_information *dip;
1024         int i;
1025
1026         for_each_dip(dip, i)
1027                 stop_trace(dip);
1028 }
1029
1030 static void exit_trace(int status)
1031 {
1032         if (!is_trace_stopped()) {
1033                 trace_stopped = 1;
1034                 stop_all_threads();
1035                 stop_all_tracing();
1036         }
1037
1038         exit(status);
1039 }
1040
1041 static int resize_devices(char *path)
1042 {
1043         int size = (ndevs + 1) * sizeof(struct device_information);
1044
1045         device_information = realloc(device_information, size);
1046         if (!device_information) {
1047                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
1048                 return 1;
1049         }
1050         device_information[ndevs].path = path;
1051         ndevs++;
1052         return 0;
1053 }
1054
1055 static int open_devices(void)
1056 {
1057         struct device_information *dip;
1058         int i;
1059
1060         for_each_dip(dip, i) {
1061                 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
1062                 if (dip->fd < 0) {
1063                         perror(dip->path);
1064                         return 1;
1065                 }
1066         }
1067
1068         return 0;
1069 }
1070
1071 static int start_devices(void)
1072 {
1073         struct device_information *dip;
1074         int i, j, size;
1075
1076         size = ncpus * sizeof(struct thread_information);
1077         thread_information = malloc(size * ndevs);
1078         if (!thread_information) {
1079                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
1080                 return 1;
1081         }
1082
1083         for_each_dip(dip, i) {
1084                 if (start_trace(dip)) {
1085                         close(dip->fd);
1086                         fprintf(stderr, "Failed to start trace on %s\n",
1087                                 dip->path);
1088                         break;
1089                 }
1090         }
1091
1092         if (i != ndevs) {
1093                 __for_each_dip(dip, j, i)
1094                         stop_trace(dip);
1095
1096                 return 1;
1097         }
1098
1099         for_each_dip(dip, i) {
1100                 dip->threads = thread_information + (i * ncpus);
1101                 if (start_threads(dip)) {
1102                         fprintf(stderr, "Failed to start worker threads\n");
1103                         break;
1104                 }
1105         }
1106
1107         if (i != ndevs) {
1108                 __for_each_dip(dip, j, i)
1109                         stop_threads(dip);
1110                 for_each_dip(dip, i)
1111                         stop_trace(dip);
1112
1113                 return 1;
1114         }
1115
1116         return 0;
1117 }
1118
1119 static void show_stats(void)
1120 {
1121         struct device_information *dip;
1122         struct thread_information *tip;
1123         unsigned long long events_processed, data_read;
1124         unsigned long total_drops;
1125         int i, j, no_stdout = 0;
1126
1127         if (is_stat_shown())
1128                 return;
1129
1130         if (output_name && !strcmp(output_name, "-"))
1131                 no_stdout = 1;
1132
1133         stat_shown = 1;
1134
1135         total_drops = 0;
1136         for_each_dip(dip, i) {
1137                 if (!no_stdout)
1138                         printf("Device: %s\n", dip->path);
1139                 events_processed = 0;
1140                 data_read = 0;
1141                 for_each_tip(dip, tip, j) {
1142                         if (!no_stdout)
1143                                 printf("  CPU%3d: %20lu events, %8llu KiB data\n",
1144                                         tip->cpu, tip->events_processed,
1145                                         (tip->data_read + 1023) >> 10);
1146                         events_processed += tip->events_processed;
1147                         data_read += tip->data_read;
1148                 }
1149                 total_drops += dip->drop_count;
1150                 if (!no_stdout)
1151                         printf("  Total:  %20llu events (dropped %lu), %8llu KiB data\n",
1152                                         events_processed, dip->drop_count,
1153                                         (data_read + 1023) >> 10);
1154         }
1155
1156         if (total_drops)
1157                 fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
1158 }
1159
1160 static struct device_information *net_get_dip(char *buts_name)
1161 {
1162         struct device_information *dip;
1163         int i;
1164
1165         for (i = 0; i < ndevs; i++) {
1166                 dip = &device_information[i];
1167
1168                 if (!strcmp(dip->buts_name, buts_name))
1169                         return dip;
1170         }
1171
1172         device_information = realloc(device_information, (ndevs + 1) * sizeof(*dip));
1173         dip = &device_information[ndevs];
1174         strcpy(dip->buts_name, buts_name);
1175         strcpy(dip->path, buts_name);
1176         ndevs++;
1177         dip->threads = malloc(ncpus * sizeof(struct thread_information));
1178         memset(dip->threads, 0, ncpus * sizeof(struct thread_information));
1179
1180         /*
1181          * open all files
1182          */
1183         for (i = 0; i < ncpus; i++) {
1184                 struct thread_information *tip = &dip->threads[i];
1185
1186                 tip->cpu = i;
1187                 tip->device = dip;
1188
1189                 if (tip_open_output(dip, tip))
1190                         return NULL;
1191         }
1192
1193         return dip;
1194 }
1195
1196 static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh)
1197 {
1198         struct device_information *dip;
1199
1200         ncpus = bnh->max_cpus;
1201         dip = net_get_dip(bnh->buts_name);
1202         return &dip->threads[bnh->cpu];
1203 }
1204
1205 static int net_get_header(struct blktrace_net_hdr *bnh)
1206 {
1207         int fl = fcntl(net_in_fd, F_GETFL);
1208         int bytes_left, ret;
1209         void *p = bnh;
1210
1211         fcntl(net_in_fd, F_SETFL, fl | O_NONBLOCK);
1212         bytes_left = sizeof(*bnh);
1213         while (bytes_left && !is_done()) {
1214                 ret = recv(net_in_fd, p, bytes_left, MSG_WAITALL);
1215                 if (ret < 0) {
1216                         if (errno != EAGAIN) {
1217                                 perror("recv header");
1218                                 return 1;
1219                         }
1220                         usleep(100);
1221                         continue;
1222                 } else if (!ret) {
1223                         usleep(100);
1224                         continue;
1225                 } else {
1226                         p += ret;
1227                         bytes_left -= ret;
1228                 }
1229         }
1230         fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
1231         return 0;
1232 }
1233
1234 static int net_server_loop(void)
1235 {
1236         struct thread_information *tip;
1237         struct blktrace_net_hdr bnh;
1238
1239         if (net_get_header(&bnh))
1240                 return 1;
1241
1242         if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
1243                 fprintf(stderr, "server: received data is bad\n");
1244                 return 1;
1245         }
1246
1247         if (!data_is_native) {
1248                 bnh.cpu = be32_to_cpu(bnh.cpu);
1249                 bnh.len = be32_to_cpu(bnh.len);
1250         }
1251
1252         /*
1253          * len == 0 means that the other end signalled end-of-run
1254          */
1255         if (!bnh.len) {
1256                 fprintf(stderr, "server: end of run\n");
1257                 return 1;
1258         }
1259
1260         tip = net_get_tip(&bnh);
1261         if (!tip)
1262                 return 1;
1263
1264         if (mmap_subbuf(tip, bnh.len))
1265                 return 1;
1266
1267         return 0;
1268 }
1269
1270 /*
1271  * Start here when we are in server mode - just fetch data from the network
1272  * and dump to files
1273  */
1274 static int net_server(void)
1275 {
1276         struct device_information *dip;
1277         struct thread_information *tip;
1278         struct sockaddr_in addr;
1279         socklen_t socklen;
1280         int fd, opt, i, j;
1281
1282         fd = socket(AF_INET, SOCK_STREAM, 0);
1283         if (fd < 0) {
1284                 perror("server: socket");
1285                 return 1;
1286         }
1287
1288         opt = 1;
1289         if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
1290                 perror("setsockopt");
1291                 return 1;
1292         }
1293
1294         memset(&addr, 0, sizeof(addr));
1295         addr.sin_family = AF_INET;
1296         addr.sin_addr.s_addr = htonl(INADDR_ANY);
1297         addr.sin_port = htons(net_port);
1298
1299         if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1300                 perror("bind");
1301                 return 1;
1302         }
1303
1304         if (listen(fd, 1) < 0) {
1305                 perror("listen");
1306                 return 1;
1307         }
1308
1309 repeat:
1310         signal(SIGINT, NULL);
1311         signal(SIGHUP, NULL);
1312         signal(SIGTERM, NULL);
1313         signal(SIGALRM, NULL);
1314
1315         printf("blktrace: waiting for incoming connection...\n");
1316
1317         socklen = sizeof(addr);
1318         net_in_fd = accept(fd, (struct sockaddr *) &addr, &socklen);
1319         if (net_in_fd < 0) {
1320                 perror("accept");
1321                 return 1;
1322         }
1323
1324         signal(SIGINT, handle_sigint);
1325         signal(SIGHUP, handle_sigint);
1326         signal(SIGTERM, handle_sigint);
1327         signal(SIGALRM, handle_sigint);
1328
1329         printf("blktrace: connection from %s\n", inet_ntoa(addr.sin_addr));
1330
1331         while (!is_done()) {
1332                 if (net_server_loop())
1333                         break;
1334         }
1335
1336         for_each_dip(dip, i)
1337                 for_each_tip(dip, tip, j)
1338                         tip_ftrunc_final(tip);
1339
1340         show_stats();
1341
1342         if (is_done())
1343                 return 0;
1344
1345         /*
1346          * cleanup for next run
1347          */
1348         for_each_dip(dip, i) {
1349                 for_each_tip(dip, tip, j)
1350                         fclose(tip->ofile);
1351
1352                 free(dip->threads);
1353         }
1354
1355         free(device_information);
1356         device_information = NULL;
1357         ncpus = ndevs = 0;
1358
1359         close(net_in_fd);
1360         net_in_fd = -1;
1361         goto repeat;
1362 }
1363
1364 /*
1365  * Setup outgoing network connection where we will transmit data
1366  */
1367 static int net_setup_client(void)
1368 {
1369         struct sockaddr_in addr;
1370         int fd;
1371
1372         fd = socket(AF_INET, SOCK_STREAM, 0);
1373         if (fd < 0) {
1374                 perror("client: socket");
1375                 return 1;
1376         }
1377
1378         memset(&addr, 0, sizeof(addr));
1379         addr.sin_family = AF_INET;
1380         addr.sin_port = htons(net_port);
1381
1382         if (inet_aton(hostname, &addr.sin_addr) != 1) {
1383                 struct hostent *hent = gethostbyname(hostname);
1384                 if (!hent) {
1385                         perror("gethostbyname");
1386                         return 1;
1387                 }
1388
1389                 memcpy(&addr.sin_addr, hent->h_addr, 4);
1390                 strcpy(hostname, hent->h_name);
1391         }
1392
1393         printf("blktrace: connecting to %s\n", hostname);
1394
1395         if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1396                 perror("client: connect");
1397                 return 1;
1398         }
1399
1400         printf("blktrace: connected!\n");
1401         net_out_fd = fd;
1402         return 0;
1403 }
1404
/*
 * Help text printed by show_usage(). The synopsis names -V (upper case):
 * that is the letter the option string accepts for version output; there
 * is no lower-case -v option.
 */
static char usage_str[] =
        "-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n"
        "[ -a action ] [ -A action mask ] [ -V ]\n\n"
        "\t-d Use specified device. May also be given last after options\n"
        "\t-r Path to mounted relayfs, defaults to /relay\n"
        "\t-o File(s) to send output to\n"
        "\t-D Directory to prepend to output file names\n"
        "\t-k Kill a running trace\n"
        "\t-w Stop after defined time, in seconds\n"
        "\t-a Only trace specified actions. See documentation\n"
        "\t-A Give trace mask as a single value. See documentation\n"
        "\t-b Sub buffer size in KiB\n"
        "\t-n Number of sub buffers\n"
        "\t-l Run in network listen mode (blktrace server)\n"
        "\t-h Run in network client mode, connecting to the given host\n"
        "\t-p Network port to use (default 8462)\n"
        "\t-s Make the network client use sendfile() to transfer data\n"
        "\t-V Print program version info\n\n";
1423
1424 static void show_usage(char *program)
1425 {
1426         fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
1427 }
1428
1429 int main(int argc, char *argv[])
1430 {
1431         static char default_relay_path[] = "/relay";
1432         struct statfs st;
1433         int i, c;
1434         int stop_watch = 0;
1435         int act_mask_tmp = 0;
1436
1437         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
1438                 switch (c) {
1439                 case 'a':
1440                         i = find_mask_map(optarg);
1441                         if (i < 0) {
1442                                 fprintf(stderr,"Invalid action mask %s\n",
1443                                         optarg);
1444                                 return 1;
1445                         }
1446                         act_mask_tmp |= i;
1447                         break;
1448
1449                 case 'A':
1450                         if ((sscanf(optarg, "%x", &i) != 1) || 
1451                                                         !valid_act_opt(i)) {
1452                                 fprintf(stderr,
1453                                         "Invalid set action mask %s/0x%x\n",
1454                                         optarg, i);
1455                                 return 1;
1456                         }
1457                         act_mask_tmp = i;
1458                         break;
1459
1460                 case 'd':
1461                         if (resize_devices(optarg) != 0)
1462                                 return 1;
1463                         break;
1464
1465                 case 'r':
1466                         relay_path = optarg;
1467                         break;
1468
1469                 case 'o':
1470                         output_name = optarg;
1471                         break;
1472                 case 'k':
1473                         kill_running_trace = 1;
1474                         break;
1475                 case 'w':
1476                         stop_watch = atoi(optarg);
1477                         if (stop_watch <= 0) {
1478                                 fprintf(stderr,
1479                                         "Invalid stopwatch value (%d secs)\n",
1480                                         stop_watch);
1481                                 return 1;
1482                         }
1483                         break;
1484                 case 'V':
1485                         printf("%s version %s\n", argv[0], blktrace_version);
1486                         return 0;
1487                 case 'b':
1488                         buf_size = strtoul(optarg, NULL, 10);
1489                         if (buf_size <= 0 || buf_size > 16*1024) {
1490                                 fprintf(stderr,
1491                                         "Invalid buffer size (%lu)\n",buf_size);
1492                                 return 1;
1493                         }
1494                         buf_size <<= 10;
1495                         break;
1496                 case 'n':
1497                         buf_nr = strtoul(optarg, NULL, 10);
1498                         if (buf_nr <= 0) {
1499                                 fprintf(stderr,
1500                                         "Invalid buffer nr (%lu)\n", buf_nr);
1501                                 return 1;
1502                         }
1503                         break;
1504                 case 'D':
1505                         output_dir = optarg;
1506                         break;
1507                 case 'h':
1508                         net_mode = Net_client;
1509                         strcpy(hostname, optarg);
1510                         break;
1511                 case 'l':
1512                         net_mode = Net_server;
1513                         break;
1514                 case 'p':
1515                         net_port = atoi(optarg);
1516                         break;
1517                 case 's':
1518                         net_sendfile = 1;
1519                         break;
1520                 default:
1521                         show_usage(argv[0]);
1522                         return 1;
1523                 }
1524         }
1525
1526         setlocale(LC_NUMERIC, "en_US");
1527
1528         page_size = getpagesize();
1529
1530         if (net_mode == Net_server)
1531                 return net_server();
1532
1533         while (optind < argc) {
1534                 if (resize_devices(argv[optind++]) != 0)
1535                         return 1;
1536         }
1537
1538         if (ndevs == 0) {
1539                 show_usage(argv[0]);
1540                 return 1;
1541         }
1542
1543         if (!relay_path)
1544                 relay_path = default_relay_path;
1545
1546         if (act_mask_tmp != 0)
1547                 act_mask = act_mask_tmp;
1548
1549         if (statfs(relay_path, &st) < 0) {
1550                 perror("statfs");
1551                 fprintf(stderr,"%s does not appear to be a valid path\n",
1552                         relay_path);
1553                 return 1;
1554         } else if (st.f_type != (long) RELAYFS_TYPE) {
1555                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
1556                         relay_path);
1557                 return 1;
1558         }
1559
1560         if (open_devices() != 0)
1561                 return 1;
1562
1563         if (kill_running_trace) {
1564                 stop_all_traces();
1565                 return 0;
1566         }
1567
1568         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
1569         if (ncpus < 0) {
1570                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
1571                 return 1;
1572         }
1573
1574         signal(SIGINT, handle_sigint);
1575         signal(SIGHUP, handle_sigint);
1576         signal(SIGTERM, handle_sigint);
1577         signal(SIGALRM, handle_sigint);
1578
1579         if (net_mode == Net_client && net_setup_client())
1580                 return 1;
1581
1582         if (start_devices() != 0)
1583                 return 1;
1584
1585         atexit(stop_all_tracing);
1586
1587         if (stop_watch)
1588                 alarm(stop_watch);
1589
1590         wait_for_threads();
1591
1592         if (!is_trace_stopped()) {
1593                 trace_stopped = 1;
1594                 stop_all_threads();
1595                 stop_all_traces();
1596         }
1597
1598         show_stats();
1599
1600         return 0;
1601 }
1602