/*
 * block queue tracing application
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 * Rewrite to have a single thread per CPU (managing all devices on that CPU)
 *      Alan D. Brunelle <alan.brunelle@hp.com> - January 2009
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <getopt.h>
#include <sched.h>
#include <unistd.h>
#include <poll.h>
#include <signal.h>
#include <pthread.h>
#include <locale.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <sys/sendfile.h>

#include "btt/list.h"
#include "blktrace.h"

/*
 * You may want to increase this even more if you are logging at a high
 * rate and see skipped/missed events.
 */
#define BUF_SIZE                (512 * 1024)
#define BUF_NR                  (4)
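
/*
 * Both values can be overridden at run time: -b gives the sub-buffer
 * size in KiB and -n the number of sub buffers, e.g. (illustrative):
 *
 *      blktrace -d /dev/sda -b 1024 -n 8
 */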

#define FILE_VBUF_SIZE          (128 * 1024)

#define DEBUGFS_TYPE            (0x64626720)
#define TRACE_NET_PORT          (8462)

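/*
 * Overall operating mode: standalone tracing (Net_none), network server
 * (-l), or network client (-h <host>) shipping traces to a server.
 */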
enum {
        Net_none = 0,
        Net_server,
        Net_client,
};

enum thread_status {
        Th_running,
        Th_leaving,
        Th_error
};

/*
 * Generic stats collected: nevents can be _roughly_ estimated by data_read
 * (discounting pdu...)
 *
 * These fields are updated w/ pdc_dr_update & pdc_nev_update below.
 */
struct pdc_stats {
        unsigned long long data_read;
        unsigned long long nevents;
};

struct devpath {
        struct list_head head;
        char *path;                     /* path to device special file */
        char *buts_name;                /* name returned from bt kernel code */
        struct pdc_stats *stats;
        int fd, ncpus;
        unsigned long long drops;

        /*
         * For piped output only:
         *
         * Each tracer has a tracer_devpath_head onto which it adds new
         * data. That list is protected by the head's mutex
         * (tracer_devpath_head.mutex), and the tracer signals the
         * processing thread using the dp_cond, dp_mutex & dp_entries
         * variables defined below.
         */
        struct tracer_devpath_head *heads;

        /*
         * For network server mode only:
         */
        struct cl_host *ch;
        u32 cl_id;
        time_t cl_connect_time;
        struct io_info *ios;
};

/*
 * For piped output to stdout we will have each tracer thread (one per dev)
 * tack buffers read from the relay queues onto a per-device list.
 *
 * The main thread will then collect trace buffers from each of the lists
 * in turn.
 *
 * We will use a mutex to guard each of the trace_buf lists. The tracers
 * can then signal the main thread using <dp_cond,dp_mutex> and
 * dp_entries. (When dp_entries is 0 and a tracer adds an entry, it will
 * signal; while dp_entries is 0, the main thread will wait for that
 * condition to be signalled.)
 *
 * adb: It may be better just to have a large buffer per tracer per dev,
 * and then use it as a ring-buffer. This would certainly cut down a lot
 * of malloc/free thrashing, at the cost of more memory movements (potentially).
 */
struct trace_buf {
        struct list_head head;
        struct devpath *dpp;
        void *buf;
        int cpu, len;
};

struct tracer_devpath_head {
        pthread_mutex_t mutex;
        struct list_head head;
        struct trace_buf *prev;
};

/*
 * Used to handle the mmap() interfaces for the output file (containing traces)
 */
struct mmap_info {
        void *fs_buf;
        unsigned long long fs_size, fs_max_size, fs_off, fs_buf_len;
        unsigned long buf_size, buf_nr;
        int pagesize;
};

/*
 * Each thread doing work on a (client) side of blktrace will have one
 * of these. The ios array contains input/output information, pfds holds
 * poll() data. The volatile members provide flags to/from the main
 * executing thread.
 */
struct tracer {
        struct list_head head;
        struct io_info *ios;
        struct pollfd *pfds;
        pthread_t thread;
        int cpu, nios;
        volatile int status, is_done;
};

/*
 * Networking stuff follows. We include a magic number so we know whether
 * to do endianness conversion or not.
 *
 * The len field is overloaded:
 *      0 - Indicates an "open" - allowing the server to set up for a dev/cpu
 *      1 - Indicates a "close" - shut the connection down in an orderly manner
 *
 * The cpu field is overloaded on close: it will contain the number of drops.
 */
struct blktrace_net_hdr {
        u32 magic;              /* same as trace magic */
        char buts_name[32];     /* trace name */
        u32 cpu;                /* for which cpu */
        u32 max_cpus;
        u32 len;                /* length of following trace data */
        u32 cl_id;              /* id for set of client per-cpu connections */
        u32 buf_size;           /* client buf_size for this trace */
        u32 buf_nr;             /* client buf_nr for this trace */
        u32 page_size;          /* client page_size for this trace */
};
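
/*
 * Illustrative exchange using the overloaded len field: an "open" for
 * CPU 2 of trace "sdb" carries { buts_name = "sdb", cpu = 2, len = 0 },
 * which the server acks with len = 2 (see ack_open_close() below); a
 * "close" carries len = 1, with the device's drop count overloaded into
 * the cpu field.
 */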

/*
 * Each host encountered has one of these. The head is used to link this
 * on to the network server's ch_list. Connections associated with this
 * host are linked on conn_list, and any devices traced on that host
 * are linked on the devpaths list.
 */
struct cl_host {
        struct list_head head;
        struct list_head conn_list;
        struct list_head devpaths;
        struct net_server_s *ns;
        char *hostname;
        struct in_addr cl_in_addr;
        int connects, ndevs, cl_opens;
};

/*
 * Each connection (client to server socket ('fd')) has one of these. A
 * back reference to the host ('ch'), and list heads (for the host's
 * connection list and the network server's conn_list) are also included.
 */
struct cl_conn {
        struct list_head ch_head, ns_head;
        struct cl_host *ch;
        int fd, ncpus;
        time_t connect_time;
};

/*
 * The network server requires some poll structures to be maintained -
 * one per connection currently on conn_list. The nchs/ch_list values
 * are for each host connected to this server. The addr field is used
 * for scratch as new connections are established.
 */
struct net_server_s {
        struct list_head conn_list;
        struct list_head ch_list;
        struct pollfd *pfds;
        int listen_fd, connects, nchs;
        struct sockaddr_in addr;
};

/*
 * This structure is (generically) used to provide information
 * for a read-to-write set of values.
 *
 * ifn & ifd represent input information
 *
 * ofn, ofd, ofp, obuf & mmap_info are (optionally) used for the output file.
 */
struct io_info {
        struct devpath *dpp;
        FILE *ofp;
        char *obuf;
        struct cl_conn *nc;     /* Server network connection */

        /*
         * mmap controlled output files
         */
        struct mmap_info mmap_info;

        /*
         * Client network fields
         */
        unsigned int ready;
        unsigned long long data_queued;

        /*
         * Input/output file descriptors & names
         */
        int ifd, ofd;
        char ifn[MAXPATHLEN + 64];
        char ofn[MAXPATHLEN + 64];
};

static char blktrace_version[] = "2.0.0";

/*
 * Linkage to blktrace helper routines (trace conversions)
 */
int data_is_native = -1;

static int ndevs;
static int ncpus;
static int pagesize;
static int act_mask = ~0U;
static int kill_running_trace;
static int stop_watch;
static int piped_output;

static char *debugfs_path = "/sys/kernel/debug";
static char *output_name;
static char *output_dir;

static unsigned long buf_size = BUF_SIZE;
static unsigned long buf_nr = BUF_NR;

static FILE *pfp;

static LIST_HEAD(devpaths);
static LIST_HEAD(tracers);

static volatile int done;

/*
 * Tracer threads add entries, the main thread takes them off and processes
 * them. These protect the dp_entries variable.
 */
static pthread_cond_t dp_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile int dp_entries;

/*
 * These synchronize master / thread interactions.
 */
static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile int nthreads_running;
static volatile int nthreads_leaving;
static volatile int nthreads_error;
static volatile int tracers_run;

/*
 * network cmd line params
 */
static struct sockaddr_in hostname_addr;
static char hostname[MAXHOSTNAMELEN];
static int net_port = TRACE_NET_PORT;
static int net_use_sendfile = 1;
static int net_mode;
static int *cl_fds;
static int (*handle_pfds)(struct tracer *, int, int);
static int (*handle_list)(struct tracer_devpath_head *, struct list_head *);

#define S_OPTS  "d:a:A:r:o:kw:vVb:n:D:lh:p:sI:"
static struct option l_opts[] = {
        {
                .name = "dev",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'd'
        },
        {
                .name = "input-devs",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'I'
        },
        {
                .name = "act-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'a'
        },
        {
                .name = "set-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'A'
        },
        {
                .name = "relay",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'r'
        },
        {
                .name = "output",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'o'
        },
        {
                .name = "kill",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'k'
        },
        {
                .name = "stopwatch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'w'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'v'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'V'
        },
        {
                .name = "buffer-size",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'b'
        },
        {
                .name = "num-sub-buffers",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'n'
        },
        {
                .name = "output-dir",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'D'
        },
        {
                .name = "listen",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'l'
        },
        {
                .name = "host",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'h'
        },
        {
                .name = "port",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'p'
        },
        {
                .name = "no-sendfile",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 's'
        },
        {
                .name = NULL,
        }
};

static char usage_str[] = "\n\n" \
        "-d <dev>             | --dev=<dev>\n" \
        "[ -r <debugfs path>  | --relay=<debugfs path> ]\n" \
        "[ -o <file>          | --output=<file>]\n" \
        "[ -D <dir>           | --output-dir=<dir>]\n" \
        "[ -w <time>          | --stopwatch=<time>]\n" \
        "[ -a <action field>  | --act-mask=<action field>]\n" \
        "[ -A <action mask>   | --set-mask=<action mask>]\n" \
        "[ -b <size>          | --buffer-size=<size>]\n" \
        "[ -n <number>        | --num-sub-buffers=<number>]\n" \
        "[ -l                 | --listen]\n" \
        "[ -h <hostname>      | --host=<hostname>]\n" \
        "[ -p <port number>   | --port=<port number>]\n" \
        "[ -s                 | --no-sendfile]\n" \
        "[ -I <devs file>     | --input-devs=<devs file>]\n" \
        "[ -v                 | --version]\n" \
        "[ -V                 | --version]\n" \

        "\t-d Use specified device. May also be given last after options\n" \
        "\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
        "\t-o File(s) to send output to\n" \
        "\t-D Directory to prepend to output file names\n" \
        "\t-w Stop after defined time, in seconds\n" \
        "\t-a Only trace specified actions. See documentation\n" \
        "\t-A Give trace mask as a single value. See documentation\n" \
        "\t-b Sub buffer size in KiB (default 512)\n" \
        "\t-n Number of sub buffers (default 4)\n" \
        "\t-l Run in network listen mode (blktrace server)\n" \
        "\t-h Run in network client mode, connecting to the given host\n" \
        "\t-p Network port to use (default 8462)\n" \
        "\t-s Make the network client NOT use sendfile() to transfer data\n" \
        "\t-I Add devices found in <devs file>\n" \
        "\t-v Print program version info\n" \
        "\t-V Print program version info\n\n";
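
/*
 * Typical invocations (illustrative):
 *
 *      blktrace -d /dev/sda -o sda -D /tmp     trace to /tmp/sda.blktrace.<cpu>
 *      blktrace -l                             run as a network server
 *      blktrace -d /dev/sda -h <server>        ship traces to a remote server
 */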

static void clear_events(struct pollfd *pfd)
{
        pfd->events = 0;
        pfd->revents = 0;
}

static inline int net_client_use_sendfile(void)
{
        return net_mode == Net_client && net_use_sendfile;
}

static inline int net_client_use_send(void)
{
        return net_mode == Net_client && !net_use_sendfile;
}

static inline int use_tracer_devpaths(void)
{
        return piped_output || net_client_use_send();
}

static inline int in_addr_eq(struct in_addr a, struct in_addr b)
{
        return a.s_addr == b.s_addr;
}

static inline void pdc_dr_update(struct devpath *dpp, int cpu, int data_read)
{
        dpp->stats[cpu].data_read += data_read;
}

static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
{
        dpp->stats[cpu].nevents += nevents;
}

static void show_usage(char *prog)
{
        fprintf(stderr, "Usage: %s %s", prog, usage_str);
}

/*
 * Create a timespec 'msec' milliseconds into the future
 */
static inline void make_timespec(struct timespec *tsp, long delta_msec)
{
        struct timeval now;

        gettimeofday(&now, NULL);
        tsp->tv_sec = now.tv_sec;
        tsp->tv_nsec = 1000L * now.tv_usec;

        tsp->tv_nsec += (delta_msec * 1000000L);
        if (tsp->tv_nsec >= 1000000000L) {
                long secs = tsp->tv_nsec / 1000000000L;

                tsp->tv_sec += secs;
                tsp->tv_nsec -= (secs * 1000000000L);
        }
}
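
/*
 * For example, make_timespec(&ts, 50) yields an absolute deadline 50ms
 * from now; t_pthread_cond_wait() below uses it to bound each wait so
 * that no wait can block indefinitely.
 */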

/*
 * Add a timer to ensure wait ends
 */
static void t_pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
{
        struct timespec ts;

        make_timespec(&ts, 50);
        pthread_cond_timedwait(cond, mutex, &ts);
}

static void unblock_tracers(void)
{
        pthread_mutex_lock(&mt_mutex);
        tracers_run = 1;
        pthread_cond_broadcast(&mt_cond);
        pthread_mutex_unlock(&mt_mutex);
}

static void tracer_wait_unblock(struct tracer *tp)
{
        pthread_mutex_lock(&mt_mutex);
        while (!tp->is_done && !tracers_run)
                pthread_cond_wait(&mt_cond, &mt_mutex);
        pthread_mutex_unlock(&mt_mutex);
}

static void tracer_signal_ready(struct tracer *tp,
                                enum thread_status th_status,
                                int status)
{
        pthread_mutex_lock(&mt_mutex);
        tp->status = status;

        if (th_status == Th_running)
                nthreads_running++;
        else if (th_status == Th_error)
                nthreads_error++;
        else
                nthreads_leaving++;

        pthread_cond_signal(&mt_cond);
        pthread_mutex_unlock(&mt_mutex);
}

static void wait_tracers_ready(int ncpus_started)
{
        pthread_mutex_lock(&mt_mutex);
        while ((nthreads_running + nthreads_error) < ncpus_started)
                t_pthread_cond_wait(&mt_cond, &mt_mutex);
        pthread_mutex_unlock(&mt_mutex);
}

static void wait_tracers_leaving(void)
{
        pthread_mutex_lock(&mt_mutex);
        while (nthreads_leaving < nthreads_running)
                t_pthread_cond_wait(&mt_cond, &mt_mutex);
        pthread_mutex_unlock(&mt_mutex);
}

static void init_mmap_info(struct mmap_info *mip)
{
        mip->buf_size = buf_size;
        mip->buf_nr = buf_nr;
        mip->pagesize = pagesize;
}

static void net_close_connection(int *fd)
{
        shutdown(*fd, SHUT_RDWR);
        close(*fd);
        *fd = -1;
}

static void dpp_free(struct devpath *dpp)
{
        if (dpp->stats)
                free(dpp->stats);
        if (dpp->ios)
                free(dpp->ios);
        if (dpp->path)
                free(dpp->path);
        if (dpp->buts_name)
                free(dpp->buts_name);
        free(dpp);
}

static int lock_on_cpu(int cpu)
{
        cpu_set_t cpu_mask;

        CPU_ZERO(&cpu_mask);
        CPU_SET(cpu, &cpu_mask);
        if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) < 0)
                return errno;

        return 0;
}

static int increase_limit(int resource, rlim_t increase)
{
        struct rlimit rlim;
        int save_errno = errno;

        if (!getrlimit(resource, &rlim)) {
                rlim.rlim_cur += increase;
                if (rlim.rlim_cur >= rlim.rlim_max)
                        rlim.rlim_max = rlim.rlim_cur + increase;

                if (!setrlimit(resource, &rlim))
                        return 1;
        }

        errno = save_errno;
        return 0;
}

static int handle_open_failure(void)
{
        if (errno == ENFILE || errno == EMFILE)
                return increase_limit(RLIMIT_NOFILE, 16);
        return 0;
}
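
/*
 * Example: if open() fails with EMFILE or ENFILE, handle_open_failure()
 * raises RLIMIT_NOFILE by 16 and the my_*() wrappers below retry the
 * failing call.
 */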

static int handle_mem_failure(size_t length)
{
        if (errno == ENFILE)
                return handle_open_failure();
        else if (errno == ENOMEM)
                return increase_limit(RLIMIT_MEMLOCK, 2 * length);
        return 0;
}

static FILE *my_fopen(const char *path, const char *mode)
{
        FILE *fp;

        do {
                fp = fopen(path, mode);
        } while (fp == NULL && handle_open_failure());

        return fp;
}

static int my_open(const char *path, int flags)
{
        int fd;

        do {
                fd = open(path, flags);
        } while (fd < 0 && handle_open_failure());

        return fd;
}

static int my_socket(int domain, int type, int protocol)
{
        int fd;

        do {
                fd = socket(domain, type, protocol);
        } while (fd < 0 && handle_open_failure());

        return fd;
}

static int my_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
{
        int fd;

        do {
                fd = accept(sockfd, addr, addrlen);
        } while (fd < 0 && handle_open_failure());

        return fd;
}

static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
                     off_t offset)
{
        void *new;

        do {
                new = mmap(addr, length, prot, flags, fd, offset);
        } while (new == MAP_FAILED && handle_mem_failure(length));

        return new;
}

static int my_mlock(struct tracer *tp,
                    const void *addr, size_t len)
{
        int ret, retry = 0;

        do {
                ret = mlock(addr, len);
                if ((retry >= 10) && tp && tp->is_done)
                        break;
                retry++;
        } while (ret < 0 && handle_mem_failure(len));

        return ret;
}

static int setup_mmap(int fd, unsigned int maxlen,
                      struct mmap_info *mip,
                      struct tracer *tp)
{
        if (mip->fs_off + maxlen > mip->fs_buf_len) {
                unsigned long nr = max(16, mip->buf_nr);

                if (mip->fs_buf) {
                        munlock(mip->fs_buf, mip->fs_buf_len);
                        munmap(mip->fs_buf, mip->fs_buf_len);
                        mip->fs_buf = NULL;
                }

                mip->fs_off = mip->fs_size & (mip->pagesize - 1);
                mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
                mip->fs_max_size += mip->fs_buf_len;

                if (ftruncate(fd, mip->fs_max_size) < 0) {
                        perror("setup_mmap: ftruncate");
                        return 1;
                }

                mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
                                      MAP_SHARED, fd,
                                      mip->fs_size - mip->fs_off);
                if (mip->fs_buf == MAP_FAILED) {
                        perror("setup_mmap: mmap");
                        return 1;
                }
                if (my_mlock(tp, mip->fs_buf, mip->fs_buf_len) < 0) {
                        perror("setup_mmap: mlock");
                        return 1;
                }
        }

        return 0;
}
777 static int __stop_trace(int fd)
778 {
779         /*
780          * Should be stopped, don't complain if it isn't
781          */
782         ioctl(fd, BLKTRACESTOP);
783         return ioctl(fd, BLKTRACETEARDOWN);
784 }
785
786 static int write_data(char *buf, int len)
787 {
788         int ret;
789
790 rewrite:
791         ret = fwrite(buf, len, 1, pfp);
792         if (ferror(pfp) || ret != 1) {
793                 if (errno == EINTR) {
794                         clearerr(pfp);
795                         goto rewrite;
796                 }
797
798                 if (!piped_output || (errno != EPIPE && errno != EBADF)) {
799                         fprintf(stderr, "write(%d) failed: %d/%s\n",
800                                 len, errno, strerror(errno));
801                 }
802                 goto err;
803         }
804
805         fflush(pfp);
806         return 0;
807
808 err:
809         clearerr(pfp);
810         return 1;
811 }
812
813 /*
814  * Returns the number of bytes read (successfully)
815  */
816 static int __net_recv_data(int fd, void *buf, unsigned int len)
817 {
818         unsigned int bytes_left = len;
819
820         while (bytes_left && !done) {
821                 int ret = recv(fd, buf, bytes_left, MSG_WAITALL);
822
823                 if (ret == 0)
824                         break;
825                 else if (ret < 0) {
826                         if (errno == EAGAIN) {
827                                 usleep(50);
828                                 continue;
829                         }
830                         perror("server: net_recv_data: recv failed");
831                         break;
832                 } else {
833                         buf += ret;
834                         bytes_left -= ret;
835                 }
836         }
837
838         return len - bytes_left;
839 }

static int net_recv_data(int fd, void *buf, unsigned int len)
{
        return __net_recv_data(fd, buf, len);
}

/*
 * Returns number of bytes written
 */
static int net_send_data(int fd, void *buf, unsigned int buf_len)
{
        int ret;
        unsigned int bytes_left = buf_len;

        while (bytes_left) {
                ret = send(fd, buf, bytes_left, 0);
                if (ret < 0) {
                        perror("send");
                        break;
                }

                buf += ret;
                bytes_left -= ret;
        }

        return buf_len - bytes_left;
}

static int net_send_header(int fd, int cpu, char *buts_name, int len)
{
        struct blktrace_net_hdr hdr;

        memset(&hdr, 0, sizeof(hdr));

        hdr.magic = BLK_IO_TRACE_MAGIC;
        memset(hdr.buts_name, 0, sizeof(hdr.buts_name));
        strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
        hdr.buts_name[sizeof(hdr.buts_name) - 1] = '\0';
        hdr.cpu = cpu;
        hdr.max_cpus = ncpus;
        hdr.len = len;
        hdr.cl_id = getpid();
        hdr.buf_size = buf_size;
        hdr.buf_nr = buf_nr;
        hdr.page_size = pagesize;

        return net_send_data(fd, &hdr, sizeof(hdr)) != sizeof(hdr);
}

static void net_send_open_close(int fd, int cpu, char *buts_name, int len)
{
        struct blktrace_net_hdr ret_hdr;

        net_send_header(fd, cpu, buts_name, len);
        net_recv_data(fd, &ret_hdr, sizeof(ret_hdr));
}

static void net_send_open(int fd, int cpu, char *buts_name)
{
        net_send_open_close(fd, cpu, buts_name, 0);
}

static void net_send_close(int fd, char *buts_name, int drops)
{
        /*
         * Overload CPU w/ number of drops
         *
         * XXX: Need to clear/set done around call - done=1 (which
         * is true here) stops reads from happening... :-(
         */
        done = 0;
        net_send_open_close(fd, drops, buts_name, 1);
        done = 1;
}

static void ack_open_close(int fd, char *buts_name)
{
        net_send_header(fd, 0, buts_name, 2);
}

static void net_send_drops(int fd)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                net_send_close(fd, dpp->buts_name, dpp->drops);
        }
}

/*
 * Returns:
 *       0: "EOF"
 *       1: OK
 *      -1: Error
 */
static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
{
        int bytes_read;
        int fl = fcntl(nc->fd, F_GETFL);

        fcntl(nc->fd, F_SETFL, fl | O_NONBLOCK);
        bytes_read = __net_recv_data(nc->fd, bnh, sizeof(*bnh));
        fcntl(nc->fd, F_SETFL, fl & ~O_NONBLOCK);

        if (bytes_read == sizeof(*bnh))
                return 1;
        else if (bytes_read == 0)
                return 0;
        else
                return -1;
}

static int net_setup_addr(void)
{
        struct sockaddr_in *addr = &hostname_addr;

        memset(addr, 0, sizeof(*addr));
        addr->sin_family = AF_INET;
        addr->sin_port = htons(net_port);

        if (inet_aton(hostname, &addr->sin_addr) != 1) {
                struct hostent *hent;
retry:
                hent = gethostbyname(hostname);
                if (!hent) {
                        if (h_errno == TRY_AGAIN) {
                                usleep(100);
                                goto retry;
                        } else if (h_errno == NO_RECOVERY) {
                                fprintf(stderr, "gethostbyname(%s): "
                                        "non-recoverable error encountered\n",
                                        hostname);
                        } else {
                                /*
                                 * HOST_NOT_FOUND, NO_ADDRESS or NO_DATA
                                 */
                                fprintf(stderr, "Host %s not found\n",
                                        hostname);
                        }
                        return 1;
                }

                memcpy(&addr->sin_addr, hent->h_addr, 4);
                memset(hostname, 0, sizeof(hostname));
                strncpy(hostname, hent->h_name, sizeof(hostname));
                hostname[sizeof(hostname) - 1] = '\0';
        }

        return 0;
}

static int net_setup_client(void)
{
        int fd;
        struct sockaddr_in *addr = &hostname_addr;

        fd = my_socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0) {
                perror("client: socket");
                return -1;
        }

        if (connect(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0) {
                if (errno == ECONNREFUSED)
                        fprintf(stderr,
                                "\nclient: Connection to %s refused, "
                                "perhaps the server is not started?\n\n",
                                hostname);
                else
                        perror("client: connect");

                close(fd);
                return -1;
        }

        return fd;
}

static int open_client_connections(void)
{
        int cpu;

        cl_fds = calloc(ncpus, sizeof(*cl_fds));
        for (cpu = 0; cpu < ncpus; cpu++) {
                cl_fds[cpu] = net_setup_client();
                if (cl_fds[cpu] < 0)
                        goto err;
        }
        return 0;

err:
        while (cpu > 0)
                close(cl_fds[--cpu]);   /* close only the fds opened so far */
        free(cl_fds);
        return 1;
}

static void close_client_connections(void)
{
        if (cl_fds) {
                int cpu, *fdp;

                for (cpu = 0, fdp = cl_fds; cpu < ncpus; cpu++, fdp++) {
                        if (*fdp >= 0) {
                                net_send_drops(*fdp);
                                net_close_connection(fdp);
                        }
                }
                free(cl_fds);
        }
}

static void setup_buts(void)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct blk_user_trace_setup buts;
                struct devpath *dpp = list_entry(p, struct devpath, head);

                memset(&buts, 0, sizeof(buts));
                buts.buf_size = buf_size;
                buts.buf_nr = buf_nr;
                buts.act_mask = act_mask;

                if (ioctl(dpp->fd, BLKTRACESETUP, &buts) >= 0) {
                        dpp->ncpus = ncpus;
                        dpp->buts_name = strdup(buts.name);
                        if (dpp->stats)
                                free(dpp->stats);
                        dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
                        memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
                } else
                        fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
                                dpp->path, errno, strerror(errno));
        }
}

static void start_buts(void)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
                        fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
                                dpp->path, errno, strerror(errno));
                }
        }
}

static int get_drops(struct devpath *dpp)
{
        int fd, drops = 0;
        char fn[MAXPATHLEN + 64], tmp[256];
        ssize_t n;

        snprintf(fn, sizeof(fn), "%s/block/%s/dropped", debugfs_path,
                 dpp->buts_name);

        fd = my_open(fn, O_RDONLY);
        if (fd < 0) {
                /*
                 * This may be ok: the kernel may not support
                 * dropped counts.
                 */
                if (errno != ENOENT)
                        fprintf(stderr, "Could not open %s: %d/%s\n",
                                fn, errno, strerror(errno));
                return 0;
        }

        n = read(fd, tmp, sizeof(tmp) - 1);
        if (n < 0)
                fprintf(stderr, "Could not read %s: %d/%s\n",
                        fn, errno, strerror(errno));
        else {
                tmp[n] = '\0';  /* NUL-terminate before handing to atoi() */
                drops = atoi(tmp);
        }
        close(fd);

        return drops;
}

static void get_all_drops(void)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                dpp->drops = get_drops(dpp);
        }
}

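/*
 * A trace_buf and its data area come from a single allocation: the
 * struct sits first, and tbp->buf points just past it (tbp + 1).
 */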
static inline struct trace_buf *alloc_trace_buf(int cpu, int bufsize)
{
        struct trace_buf *tbp;

        tbp = malloc(sizeof(*tbp) + bufsize);
        INIT_LIST_HEAD(&tbp->head);
        tbp->len = 0;
        tbp->buf = (void *)(tbp + 1);
        tbp->cpu = cpu;
        tbp->dpp = NULL;        /* Will be set when tbp is added */

        return tbp;
}

static void free_tracer_heads(struct devpath *dpp)
{
        int cpu;
        struct tracer_devpath_head *hd;

        for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
                if (hd->prev)
                        free(hd->prev);

                pthread_mutex_destroy(&hd->mutex);
        }
        free(dpp->heads);
}

static int setup_tracer_devpaths(void)
{
        struct list_head *p;

        if (net_client_use_send())
                if (open_client_connections())
                        return 1;

        __list_for_each(p, &devpaths) {
                int cpu;
                struct tracer_devpath_head *hd;
                struct devpath *dpp = list_entry(p, struct devpath, head);

                dpp->heads = calloc(ncpus, sizeof(struct tracer_devpath_head));
                for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
                        INIT_LIST_HEAD(&hd->head);
                        pthread_mutex_init(&hd->mutex, NULL);
                        hd->prev = NULL;
                }
        }

        return 0;
}

static inline void add_trace_buf(struct devpath *dpp, int cpu,
                                                struct trace_buf **tbpp)
{
        struct trace_buf *tbp = *tbpp;
        struct tracer_devpath_head *hd = &dpp->heads[cpu];

        tbp->dpp = dpp;

        pthread_mutex_lock(&hd->mutex);
        list_add_tail(&tbp->head, &hd->head);
        pthread_mutex_unlock(&hd->mutex);

        *tbpp = alloc_trace_buf(cpu, buf_size);
}

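/*
 * The main thread sleeps only while dp_entries is zero, so a signal is
 * needed only on the empty -> non-empty transition.
 */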
static inline void incr_entries(int entries_handled)
{
        pthread_mutex_lock(&dp_mutex);
        if (dp_entries == 0)
                pthread_cond_signal(&dp_cond);
        dp_entries += entries_handled;
        pthread_mutex_unlock(&dp_mutex);
}

static void decr_entries(int handled)
{
        pthread_mutex_lock(&dp_mutex);
        dp_entries -= handled;
        pthread_mutex_unlock(&dp_mutex);
}

static int wait_empty_entries(void)
{
        pthread_mutex_lock(&dp_mutex);
        while (!done && dp_entries == 0)
                t_pthread_cond_wait(&dp_cond, &dp_mutex);
        pthread_mutex_unlock(&dp_mutex);

        return !done;
}

static int add_devpath(char *path)
{
        int fd;
        struct devpath *dpp;
        struct list_head *p;

        /*
         * Verify device is not duplicated
         */
        __list_for_each(p, &devpaths) {
                struct devpath *tmp = list_entry(p, struct devpath, head);
                if (!strcmp(tmp->path, path))
                        return 0;
        }
        /*
         * Verify device is valid before going too far
         */
        fd = my_open(path, O_RDONLY | O_NONBLOCK);
        if (fd < 0) {
                fprintf(stderr, "Invalid path %s specified: %d/%s\n",
                        path, errno, strerror(errno));
                return 1;
        }

        dpp = malloc(sizeof(*dpp));
        memset(dpp, 0, sizeof(*dpp));
        dpp->path = strdup(path);
        dpp->fd = fd;
        ndevs++;
        list_add_tail(&dpp->head, &devpaths);

        return 0;
}


static void rel_devpaths(void)
{
        struct list_head *p, *q;

        list_for_each_safe(p, q, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                list_del(&dpp->head);
                __stop_trace(dpp->fd);
                close(dpp->fd);

                if (dpp->heads)
                        free_tracer_heads(dpp);

                dpp_free(dpp);
                ndevs--;
        }
}

static int flush_subbuf_net(struct trace_buf *tbp)
{
        int fd = cl_fds[tbp->cpu];
        struct devpath *dpp = tbp->dpp;

        if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
                return 1;
        else if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
                return 1;

        return 0;
}

static int
handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
                struct list_head *list)
{
        struct trace_buf *tbp;
        struct list_head *p, *q;
        int entries_handled = 0;

        list_for_each_safe(p, q, list) {
                tbp = list_entry(p, struct trace_buf, head);

                list_del(&tbp->head);
                entries_handled++;

                if (cl_fds[tbp->cpu] >= 0) {
                        if (flush_subbuf_net(tbp)) {
                                close(cl_fds[tbp->cpu]);
                                cl_fds[tbp->cpu] = -1;
                        }
                }

                free(tbp);
        }

        return entries_handled;
}

/*
 * Tack 'tbp's buf onto the tail of 'prev's buf
 */
static struct trace_buf *tb_combine(struct trace_buf *prev,
                                    struct trace_buf *tbp)
{
        unsigned long tot_len;

        tot_len = prev->len + tbp->len;
        if (tot_len > buf_size) {
                /*
                 * tbp->head isn't connected (it was 'prev' and so had
                 * already been taken off of the list). Therefore, we
                 * can realloc the whole structure, as the other fields
                 * are "static".
                 */
                prev = realloc(prev, sizeof(*prev) + tot_len);
                prev->buf = (void *)(prev + 1);
        }

        memcpy(prev->buf + prev->len, tbp->buf, tbp->len);
        prev->len = tot_len;

        free(tbp);
        return prev;
}

static int handle_list_file(struct tracer_devpath_head *hd,
                            struct list_head *list)
{
        int off, t_len, nevents;
        struct blk_io_trace *t;
        struct list_head *p, *q;
        int entries_handled = 0;
        struct trace_buf *tbp, *prev;

        prev = hd->prev;
        list_for_each_safe(p, q, list) {
                tbp = list_entry(p, struct trace_buf, head);
                list_del(&tbp->head);
                entries_handled++;

                /*
                 * If there was some leftover before, tack this new
                 * entry onto the tail of the previous one.
                 */
                if (prev)
                        tbp = tb_combine(prev, tbp);

                /*
                 * See how many whole traces there are - send them
                 * all out in one go.
                 */
                off = 0;
                nevents = 0;
                while (off + (int)sizeof(*t) <= tbp->len) {
                        t = (struct blk_io_trace *)(tbp->buf + off);
                        t_len = sizeof(*t) + t->pdu_len;
                        if (off + t_len > tbp->len)
                                break;

                        off += t_len;
                        nevents++;
                }
                if (nevents)
                        pdc_nev_update(tbp->dpp, tbp->cpu, nevents);

                /*
                 * Write any full set of traces, any remaining data is kept
                 * for the next pass.
                 */
                if (off) {
                        if (write_data(tbp->buf, off) || off == tbp->len) {
                                free(tbp);
                                prev = NULL;
                        }
                        else {
                                /*
                                 * Move valid data to beginning of buffer
                                 */
                                tbp->len -= off;
                                memmove(tbp->buf, tbp->buf + off, tbp->len);
                                prev = tbp;
                        }
                } else
                        prev = tbp;
        }
        hd->prev = prev;

        return entries_handled;
}

static void __process_trace_bufs(void)
{
        int cpu;
        struct list_head *p;
        struct list_head list;
        int handled = 0;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);
                struct tracer_devpath_head *hd = dpp->heads;

                for (cpu = 0; cpu < ncpus; cpu++, hd++) {
                        pthread_mutex_lock(&hd->mutex);
                        if (list_empty(&hd->head)) {
                                pthread_mutex_unlock(&hd->mutex);
                                continue;
                        }

                        list_replace_init(&hd->head, &list);
                        pthread_mutex_unlock(&hd->mutex);

                        handled += handle_list(hd, &list);
                }
        }

        if (handled)
                decr_entries(handled);
}

static void process_trace_bufs(void)
{
        while (wait_empty_entries())
                __process_trace_bufs();
}

static void clean_trace_bufs(void)
{
        /*
         * No mutex needed here: we're only reading from the lists,
         * tracers are done
         */
        while (dp_entries)
                __process_trace_bufs();
}

static inline void read_err(int cpu, char *ifn)
{
        if (errno != EAGAIN)
                fprintf(stderr, "Thread %d failed read of %s: %d/%s\n",
                        cpu, ifn, errno, strerror(errno));
}

static int net_sendfile(struct io_info *iop)
{
        int ret;

        ret = sendfile(iop->ofd, iop->ifd, NULL, iop->ready);
        if (ret < 0) {
                perror("sendfile");
                return 1;
        } else if (ret < (int)iop->ready) {
                fprintf(stderr, "short sendfile send (%d of %d)\n",
                        ret, iop->ready);
                return 1;
        }

        return 0;
}

static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
{
        struct devpath *dpp = iop->dpp;

        if (net_send_header(iop->ofd, tp->cpu, dpp->buts_name, iop->ready))
                return 1;
        return net_sendfile(iop);
}

static int fill_ofname(struct io_info *iop, int cpu)
{
        int len;
        struct stat sb;
        char *dst = iop->ofn;

        if (output_dir)
                len = snprintf(iop->ofn, sizeof(iop->ofn), "%s/", output_dir);
        else
                len = snprintf(iop->ofn, sizeof(iop->ofn), "./");

        if (net_mode == Net_server) {
                struct cl_conn *nc = iop->nc;

                len += snprintf(dst + len, sizeof(iop->ofn) - len, "%s-",
                                nc->ch->hostname);
                len += strftime(dst + len, sizeof(iop->ofn) - len, "%F-%T/",
                                gmtime(&iop->dpp->cl_connect_time));
        }

        if (stat(iop->ofn, &sb) < 0) {
                if (errno != ENOENT) {
                        fprintf(stderr,
                                "Destination dir %s stat failed: %d/%s\n",
                                iop->ofn, errno, strerror(errno));
                        return 1;
                }
                /*
                 * There is no synchronization between multiple threads
                 * trying to create the directory at once.  It's harmless
                 * to let them try, so just detect the problem and move on.
                 */
                if (mkdir(iop->ofn, 0755) < 0 && errno != EEXIST) {
                        fprintf(stderr,
                                "Destination dir %s can't be made: %d/%s\n",
                                iop->ofn, errno, strerror(errno));
                        return 1;
                }
        }

        if (output_name)
                snprintf(iop->ofn + len, sizeof(iop->ofn) - len,
                         "%s.blktrace.%d", output_name, cpu);
        else
                snprintf(iop->ofn + len, sizeof(iop->ofn) - len,
                         "%s.blktrace.%d", iop->dpp->buts_name, cpu);

        return 0;
}
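
/*
 * fill_ofname() examples (illustrative): locally, CPU 0 of device sda
 * yields ./sda.blktrace.0 (or <dir>/sda.blktrace.0 with -D <dir>); in
 * server mode a per-host, per-connection "<host>-YYYY-MM-DD-HH:MM:SS/"
 * subdirectory is inserted first.
 */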

static int set_vbuf(struct io_info *iop, int mode, size_t size)
{
        iop->obuf = malloc(size);
        if (setvbuf(iop->ofp, iop->obuf, mode, size)) {
                /* setvbuf() returns non-zero (not necessarily < 0) on error */
                fprintf(stderr, "setvbuf(%s, %d) failed: %d/%s\n",
                        iop->dpp->path, (int)size, errno,
                        strerror(errno));
                free(iop->obuf);
                return 1;
        }

        return 0;
}

static int iop_open(struct io_info *iop, int cpu)
{
        iop->ofd = -1;
        if (fill_ofname(iop, cpu))
                return 1;

        iop->ofp = my_fopen(iop->ofn, "w+");
        if (iop->ofp == NULL) {
                fprintf(stderr, "Open output file %s failed: %d/%s\n",
                        iop->ofn, errno, strerror(errno));
                return 1;
        }

        if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
                fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
                        iop->ofn, errno, strerror(errno));
                fclose(iop->ofp);
                return 1;
        }

        iop->ofd = fileno(iop->ofp);
        return 0;
}

static void close_iop(struct io_info *iop)
{
        struct mmap_info *mip = &iop->mmap_info;

        if (mip->fs_buf)
                munmap(mip->fs_buf, mip->fs_buf_len);

        if (!piped_output) {
                if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
                        fprintf(stderr,
                                "Ignoring err: ftruncate(%s): %d/%s\n",
                                iop->ofn, errno, strerror(errno));
                }
        }

        if (iop->ofp)
                fclose(iop->ofp);
        if (iop->obuf)
                free(iop->obuf);
}

static void close_ios(struct tracer *tp)
{
        while (tp->nios > 0) {
                struct io_info *iop = &tp->ios[--tp->nios];

                iop->dpp->drops = get_drops(iop->dpp);
                if (iop->ifd >= 0)
                        close(iop->ifd);

                if (iop->ofp)
                        close_iop(iop);
                else if (iop->ofd >= 0) {
                        struct devpath *dpp = iop->dpp;

                        net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
                        net_close_connection(&iop->ofd);
                }
        }

        free(tp->ios);
        free(tp->pfds);
}

static int open_ios(struct tracer *tp)
{
        struct pollfd *pfd;
        struct io_info *iop;
        struct list_head *p;

        tp->ios = calloc(ndevs, sizeof(struct io_info));
        memset(tp->ios, 0, ndevs * sizeof(struct io_info));

        tp->pfds = calloc(ndevs, sizeof(struct pollfd));
        memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));

        tp->nios = 0;
        iop = tp->ios;
        pfd = tp->pfds;
        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                iop->dpp = dpp;
                iop->ofd = -1;
                snprintf(iop->ifn, sizeof(iop->ifn), "%s/block/%s/trace%d",
                        debugfs_path, dpp->buts_name, tp->cpu);

                iop->ifd = my_open(iop->ifn, O_RDONLY | O_NONBLOCK);
                if (iop->ifd < 0) {
                        fprintf(stderr, "Thread %d failed open %s: %d/%s\n",
                                tp->cpu, iop->ifn, errno, strerror(errno));
                        return 1;
                }

                init_mmap_info(&iop->mmap_info);

                pfd->fd = iop->ifd;
                pfd->events = POLLIN;

                if (piped_output)
                        ;
                else if (net_client_use_sendfile()) {
                        iop->ofd = net_setup_client();
                        if (iop->ofd < 0)
                                goto err;
                        net_send_open(iop->ofd, tp->cpu, dpp->buts_name);
                } else if (net_mode == Net_none) {
                        if (iop_open(iop, tp->cpu))
                                goto err;
                } else {
                        /*
                         * This ensures that the server knows about all
                         * connections & devices before _any_ closes
                         */
                        net_send_open(cl_fds[tp->cpu], tp->cpu, dpp->buts_name);
                }

                pfd++;
                iop++;
                tp->nios++;
        }

        return 0;

err:
        close(iop->ifd);        /* tp->nios _not_ bumped */
        close_ios(tp);
        return 1;
}

static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
{
        struct mmap_info *mip;
        int i, ret, nentries = 0;
        struct pollfd *pfd = tp->pfds;
        struct io_info *iop = tp->ios;

        for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++) {
                if (pfd->revents & POLLIN || force_read) {
                        mip = &iop->mmap_info;

                        ret = setup_mmap(iop->ofd, buf_size, mip, tp);
                        if (ret) {      /* setup_mmap() returns 1 on error */
                                pfd->events = 0;
                                break;
                        }

                        ret = read(iop->ifd, mip->fs_buf + mip->fs_off,
                                   buf_size);
                        if (ret > 0) {
                                pdc_dr_update(iop->dpp, tp->cpu, ret);
                                mip->fs_size += ret;
                                mip->fs_off += ret;
                                nentries++;
                        } else if (ret == 0) {
                                /*
                                 * A zero-length read after we're done
                                 * means we can stop trying reads.
                                 */
                                if (tp->is_done)
                                        clear_events(pfd);
                        } else {
                                read_err(tp->cpu, iop->ifn);
                                if (errno != EAGAIN || tp->is_done)
                                        clear_events(pfd);
                        }
                        nevs--;
                }
        }

        return nentries;
}
1729
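/*
 * Network client w/ sendfile: the relay file only grows, so fstat() tells
 * us how much new data is queued beyond what we have already shipped, and
 * net_sendfile_data() pushes exactly that range to the server.
 */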
static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
{
	struct stat sb;
	int i, nentries = 0;
	struct pollfd *pfd = tp->pfds;
	struct io_info *iop = tp->ios;

	for (i = 0; i < ndevs; i++, pfd++, iop++) {
		if (pfd->revents & POLLIN || force_read) {
			if (fstat(iop->ifd, &sb) < 0) {
				perror(iop->ifn);
				pfd->events = 0;
			} else if (sb.st_size > (off_t)iop->data_queued) {
				iop->ready = sb.st_size - iop->data_queued;
				iop->data_queued = sb.st_size;

				if (!net_sendfile_data(tp, iop)) {
					pdc_dr_update(iop->dpp, tp->cpu,
						      iop->ready);
					nentries++;
				} else
					clear_events(pfd);
			}
			if (--nevs == 0)
				break;
		}
	}

	if (nentries)
		incr_entries(nentries);

	return nentries;
}

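/*
 * Piped output and network "send" mode: each read lands in a freshly
 * allocated trace_buf which add_trace_buf() hands off (and replaces) so
 * the processing thread can consume it; the last, unused buffer is freed
 * on the way out.
 */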
static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
{
	int i, nentries = 0;
	struct trace_buf *tbp;
	struct pollfd *pfd = tp->pfds;
	struct io_info *iop = tp->ios;

	tbp = alloc_trace_buf(tp->cpu, buf_size);
	for (i = 0; i < ndevs; i++, pfd++, iop++) {
		if (pfd->revents & POLLIN || force_read) {
			tbp->len = read(iop->ifd, tbp->buf, buf_size);
			if (tbp->len > 0) {
				pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
				add_trace_buf(iop->dpp, tp->cpu, &tbp);
				nentries++;
			} else if (tbp->len == 0) {
				/*
				 * A zero-length read after we're done means
				 * EOF: stop trying further reads.
				 */
				if (tp->is_done)
					clear_events(pfd);
			} else {
				read_err(tp->cpu, iop->ifn);
				if (errno != EAGAIN || tp->is_done)
					clear_events(pfd);
			}
			if (!piped_output && --nevs == 0)
				break;
		}
	}
	free(tbp);

	if (nentries)
		incr_entries(nentries);

	return nentries;
}

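/*
 * Per-CPU tracer thread: pin to the CPU, open the relay file for every
 * traced device, then poll until told to stop. A short poll timeout is
 * used for piped output so partial buffers are pushed out promptly.
 */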
static void *thread_main(void *arg)
{
	int ret, ndone, to_val;
	struct tracer *tp = arg;

	ret = lock_on_cpu(tp->cpu);
	if (ret)
		goto err;

	ret = open_ios(tp);
	if (ret)
		goto err;

	if (piped_output)
		to_val = 50;		/* Frequent partial handles */
	else
		to_val = 500;		/* 1/2 second intervals */

	tracer_signal_ready(tp, Th_running, 0);
	tracer_wait_unblock(tp);

	while (!tp->is_done) {
		ndone = poll(tp->pfds, ndevs, to_val);
		if (ndone < 0) {
			if (errno != EINTR)
				fprintf(stderr,
					"Thread %d poll failed: %d/%s\n",
					tp->cpu, errno, strerror(errno));
		} else if (ndone || piped_output)
			(void)handle_pfds(tp, ndone, piped_output);
	}

	/*
	 * Trace is stopped, pull data until we get a short read
	 */
	while (handle_pfds(tp, ndevs, 1) > 0)
		;

	close_ios(tp);
	tracer_signal_ready(tp, Th_leaving, 0);
	return NULL;

err:
	tracer_signal_ready(tp, Th_error, ret);
	return NULL;
}

static int start_tracer(int cpu)
{
	int ret;
	struct tracer *tp;

	tp = malloc(sizeof(*tp));
	memset(tp, 0, sizeof(*tp));

	INIT_LIST_HEAD(&tp->head);
	tp->status = 0;
	tp->cpu = cpu;

	ret = pthread_create(&tp->thread, NULL, thread_main, tp);
	if (ret) {
		/*
		 * pthread_create() returns the error code directly;
		 * it does not set errno.
		 */
		fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
			cpu, ret, strerror(ret));
		free(tp);
		return 1;
	}

	list_add_tail(&tp->head, &tracers);
	return 0;
}

static void start_tracers(void)
{
	int cpu;
	struct list_head *p;

	for (cpu = 0; cpu < ncpus; cpu++)
		if (start_tracer(cpu))
			break;

	wait_tracers_ready(cpu);

	__list_for_each(p, &tracers) {
		struct tracer *tp = list_entry(p, struct tracer, head);
		if (tp->status)
			fprintf(stderr,
				"FAILED to start thread on CPU %d: %d/%s\n",
				tp->cpu, tp->status, strerror(tp->status));
	}
}

static void stop_tracers(void)
{
	struct list_head *p;

	/*
	 * Stop the tracing - makes the tracer threads clean up quicker.
	 */
	__list_for_each(p, &devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);
		(void)ioctl(dpp->fd, BLKTRACESTOP);
	}

	/*
	 * Tell each tracer to quit
	 */
	__list_for_each(p, &tracers) {
		struct tracer *tp = list_entry(p, struct tracer, head);
		tp->is_done = 1;
	}
}

static void del_tracers(void)
{
	struct list_head *p, *q;

	list_for_each_safe(p, q, &tracers) {
		struct tracer *tp = list_entry(p, struct tracer, head);

		list_del(&tp->head);
		free(tp);
	}
}

static void wait_tracers(void)
{
	struct list_head *p;

	if (use_tracer_devpaths())
		process_trace_bufs();

	wait_tracers_leaving();

	__list_for_each(p, &tracers) {
		int ret;
		struct tracer *tp = list_entry(p, struct tracer, head);

		ret = pthread_join(tp->thread, NULL);
		if (ret)
			fprintf(stderr, "Thread join %d failed %d\n",
				tp->cpu, ret);
	}

	if (use_tracer_devpaths())
		clean_trace_bufs();

	get_all_drops();
}

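/*
 * Registered via atexit() in run_tracers(): signals are ignored first so
 * a late SIGINT/SIGTERM cannot fire handle_sigint() in the middle of the
 * teardown sequence below.
 */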
static void exit_tracing(void)
{
	signal(SIGINT, SIG_IGN);
	signal(SIGHUP, SIG_IGN);
	signal(SIGTERM, SIG_IGN);
	signal(SIGALRM, SIG_IGN);

	stop_tracers();
	wait_tracers();
	del_tracers();
	rel_devpaths();
}

static void handle_sigint(__attribute__((__unused__)) int sig)
{
	done = 1;
	stop_tracers();
}

static void show_stats(struct list_head *devpaths)
{
	FILE *ofp;
	struct list_head *p;
	unsigned long long nevents, data_read;
	unsigned long long total_drops = 0;
	unsigned long long total_events = 0;

	if (piped_output)
		ofp = my_fopen("/dev/null", "w");
	else
		ofp = stdout;

	__list_for_each(p, devpaths) {
		int cpu;
		struct pdc_stats *sp;
		struct devpath *dpp = list_entry(p, struct devpath, head);

		if (net_mode == Net_server)
			printf("server: end of run for %s:%s\n",
				dpp->ch->hostname, dpp->buts_name);

		data_read = 0;
		nevents = 0;

		fprintf(ofp, "=== %s ===\n", dpp->buts_name);
		for (cpu = 0, sp = dpp->stats; cpu < dpp->ncpus; cpu++, sp++) {
			/*
			 * Estimate events if not known...
			 */
			if (sp->nevents == 0) {
				sp->nevents = sp->data_read /
						sizeof(struct blk_io_trace);
			}

			fprintf(ofp,
				"  CPU%3d: %20llu events, %8llu KiB data\n",
				cpu, sp->nevents, (sp->data_read + 1023) >> 10);

			data_read += sp->data_read;
			nevents += sp->nevents;
		}

		fprintf(ofp, "  Total:  %20llu events (dropped %llu),"
			     " %8llu KiB data\n", nevents,
			     dpp->drops, (data_read + 1023) >> 10);

		total_drops += dpp->drops;
		total_events += (nevents + dpp->drops);
	}

	fflush(ofp);
	if (piped_output)
		fclose(ofp);

	if (total_drops) {
		double drops_ratio = 1.0;

		if (total_events)
			drops_ratio = (double)total_drops/(double)total_events;

		fprintf(stderr, "\nYou have %llu (%5.1lf%%) dropped events\n"
				"Consider using a larger buffer size (-b) "
				"and/or more buffers (-n)\n",
			total_drops, 100.0 * drops_ratio);
	}
}

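/*
 * Command line parsing. Illustrative invocations (device names and hosts
 * are examples only):
 *
 *   blktrace -d /dev/sda -o trace              local run, named output
 *   blktrace -d /dev/sda -o - | blkparse -i -  piped straight to blkparse
 *   blktrace -l                                run as network server
 *   blktrace -d /dev/sda -h server.example.com ship traces to a server
 */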
static int handle_args(int argc, char *argv[])
{
	int c, i;
	struct statfs st;
	int act_mask_tmp = 0;

	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
		switch (c) {
		case 'a':
			i = find_mask_map(optarg);
			if (i < 0) {
				fprintf(stderr, "Invalid action mask %s\n",
					optarg);
				return 1;
			}
			act_mask_tmp |= i;
			break;

		case 'A':
			if ((sscanf(optarg, "%x", &i) != 1) ||
							!valid_act_opt(i)) {
				fprintf(stderr,
					"Invalid set action mask %s/0x%x\n",
					optarg, i);
				return 1;
			}
			act_mask_tmp = i;
			break;

		case 'd':
			if (add_devpath(optarg) != 0)
				return 1;
			break;

		case 'I': {
			char dev_line[256];
			FILE *ifp = my_fopen(optarg, "r");

			if (!ifp) {
				fprintf(stderr,
					"Invalid file for devices %s\n",
					optarg);
				return 1;
			}

			/*
			 * Bound the conversion so an over-long line
			 * cannot overflow dev_line.
			 */
			while (fscanf(ifp, "%255s", dev_line) == 1) {
				if (add_devpath(dev_line) != 0) {
					fclose(ifp);
					return 1;
				}
			}
			fclose(ifp);
			break;
		}

		case 'r':
			debugfs_path = optarg;
			break;

		case 'o':
			output_name = optarg;
			break;
		case 'k':
			kill_running_trace = 1;
			break;
		case 'w':
			stop_watch = atoi(optarg);
			if (stop_watch <= 0) {
				fprintf(stderr,
					"Invalid stopwatch value (%d secs)\n",
					stop_watch);
				return 1;
			}
			break;
		case 'V':
		case 'v':
			printf("%s version %s\n", argv[0], blktrace_version);
			exit(0);
			/*NOTREACHED*/
		case 'b':
			buf_size = strtoul(optarg, NULL, 10);
			if (buf_size == 0 || buf_size > 16*1024) {
				fprintf(stderr, "Invalid buffer size (%lu)\n",
					buf_size);
				return 1;
			}
			buf_size <<= 10;
			break;
		case 'n':
			buf_nr = strtoul(optarg, NULL, 10);
			if (buf_nr == 0) {
				fprintf(stderr,
					"Invalid buffer nr (%lu)\n", buf_nr);
				return 1;
			}
			break;
		case 'D':
			output_dir = optarg;
			break;
		case 'h':
			net_mode = Net_client;
			memset(hostname, 0, sizeof(hostname));
			strncpy(hostname, optarg, sizeof(hostname));
			hostname[sizeof(hostname) - 1] = '\0';
			break;
		case 'l':
			net_mode = Net_server;
			break;
		case 'p':
			net_port = atoi(optarg);
			break;
		case 's':
			net_use_sendfile = 0;
			break;
		default:
			show_usage(argv[0]);
			exit(1);
			/*NOTREACHED*/
		}
	}

	while (optind < argc)
		if (add_devpath(argv[optind++]) != 0)
			return 1;

	if (net_mode != Net_server && ndevs == 0) {
		show_usage(argv[0]);
		return 1;
	}

	if (statfs(debugfs_path, &st) < 0) {
		fprintf(stderr, "Invalid debug path %s: %d/%s\n",
			debugfs_path, errno, strerror(errno));
		return 1;
	}

	if (st.f_type != (long)DEBUGFS_TYPE) {
		fprintf(stderr, "Debugfs is not mounted at %s\n", debugfs_path);
		return 1;
	}

	if (act_mask_tmp != 0)
		act_mask = act_mask_tmp;

	if (net_mode == Net_client && net_setup_addr())
		return 1;

	/*
	 * Set up for appropriate PFD handler based upon output name.
	 */
	if (net_client_use_sendfile())
		handle_pfds = handle_pfds_netclient;
	else if (net_client_use_send())
		handle_pfds = handle_pfds_entries;
	else if (output_name && (strcmp(output_name, "-") == 0)) {
		piped_output = 1;
		handle_pfds = handle_pfds_entries;
		pfp = stdout;
		if (setvbuf(pfp, NULL, _IONBF, 0)) {
			perror("setvbuf stdout");
			return 1;
		}
	} else
		handle_pfds = handle_pfds_file;
	return 0;
}

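/*
 * Server-side connection bookkeeping: ns->pfds always holds one slot for
 * the listening socket plus one per active connection, so it is resized
 * to connects + 1 whenever a connection comes or goes.
 */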
static void ch_add_connection(struct net_server_s *ns, struct cl_host *ch,
			      int fd)
{
	struct cl_conn *nc;

	nc = malloc(sizeof(*nc));
	memset(nc, 0, sizeof(*nc));

	time(&nc->connect_time);
	nc->ch = ch;
	nc->fd = fd;
	nc->ncpus = -1;

	list_add_tail(&nc->ch_head, &ch->conn_list);
	ch->connects++;

	list_add_tail(&nc->ns_head, &ns->conn_list);
	ns->connects++;
	ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
}

static void ch_rem_connection(struct net_server_s *ns, struct cl_host *ch,
			      struct cl_conn *nc)
{
	net_close_connection(&nc->fd);

	list_del(&nc->ch_head);
	ch->connects--;

	list_del(&nc->ns_head);
	ns->connects--;
	ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));

	free(nc);
}

static struct cl_host *net_find_client_host(struct net_server_s *ns,
					    struct in_addr cl_in_addr)
{
	struct list_head *p;

	__list_for_each(p, &ns->ch_list) {
		struct cl_host *ch = list_entry(p, struct cl_host, head);

		if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
			return ch;
	}

	return NULL;
}

static struct cl_host *net_add_client_host(struct net_server_s *ns,
					   struct sockaddr_in *addr)
{
	struct cl_host *ch;

	ch = malloc(sizeof(*ch));
	memset(ch, 0, sizeof(*ch));

	ch->ns = ns;
	ch->cl_in_addr = addr->sin_addr;
	list_add_tail(&ch->head, &ns->ch_list);
	ns->nchs++;

	ch->hostname = strdup(inet_ntoa(addr->sin_addr));
	printf("server: connection from %s\n", ch->hostname);

	INIT_LIST_HEAD(&ch->conn_list);
	INIT_LIST_HEAD(&ch->devpaths);

	return ch;
}

static void device_done(struct devpath *dpp, int ncpus)
{
	int cpu;
	struct io_info *iop;

	for (cpu = 0, iop = dpp->ios; cpu < ncpus; cpu++, iop++)
		close_iop(iop);

	list_del(&dpp->head);
	dpp_free(dpp);
}

static void net_ch_remove(struct cl_host *ch, int ncpus)
{
	struct list_head *p, *q;
	struct net_server_s *ns = ch->ns;

	list_for_each_safe(p, q, &ch->devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);
		device_done(dpp, ncpus);
	}

	list_for_each_safe(p, q, &ch->conn_list) {
		struct cl_conn *nc = list_entry(p, struct cl_conn, ch_head);

		ch_rem_connection(ns, ch, nc);
	}

	list_del(&ch->head);
	ns->nchs--;

	free(ch->hostname);	/* free(NULL) is a no-op */
	free(ch);
}

static void net_add_connection(struct net_server_s *ns)
{
	int fd;
	struct cl_host *ch;
	socklen_t socklen = sizeof(ns->addr);

	fd = my_accept(ns->listen_fd, (struct sockaddr *)&ns->addr, &socklen);
	if (fd < 0) {
		/*
		 * This is OK: we just won't accept this connection,
		 * nothing fatal.
		 */
		perror("accept");
	} else {
		ch = net_find_client_host(ns, ns->addr.sin_addr);
		if (!ch)
			ch = net_add_client_host(ns, &ns->addr);

		ch_add_connection(ns, ch, fd);
	}
}

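/*
 * First data for an unknown device on this connection: build the devpath
 * on the server side, one io_info (and output file) per client CPU. On
 * any open failure, everything already opened is unwound.
 */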
static struct devpath *nc_add_dpp(struct cl_conn *nc,
				  struct blktrace_net_hdr *bnh,
				  time_t connect_time)
{
	int cpu;
	struct io_info *iop;
	struct devpath *dpp;

	dpp = malloc(sizeof(*dpp));
	memset(dpp, 0, sizeof(*dpp));

	dpp->buts_name = strdup(bnh->buts_name);
	dpp->path = strdup(bnh->buts_name);
	dpp->fd = -1;
	dpp->ch = nc->ch;
	dpp->cl_id = bnh->cl_id;
	dpp->cl_connect_time = connect_time;
	dpp->ncpus = nc->ncpus;
	dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
	memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));

	list_add_tail(&dpp->head, &nc->ch->devpaths);
	nc->ch->ndevs++;

	dpp->ios = calloc(nc->ncpus, sizeof(*iop));
	memset(dpp->ios, 0, nc->ncpus * sizeof(*iop));

	for (cpu = 0, iop = dpp->ios; cpu < nc->ncpus; cpu++, iop++) {
		iop->dpp = dpp;
		iop->nc = nc;
		init_mmap_info(&iop->mmap_info);

		if (iop_open(iop, cpu))
			goto err;
	}

	return dpp;

err:
	/*
	 * Need to unravel what's been done...
	 */
	while (cpu >= 0)
		close_iop(&dpp->ios[cpu--]);
	dpp_free(dpp);

	return NULL;
}

static struct devpath *nc_find_dpp(struct cl_conn *nc,
				   struct blktrace_net_hdr *bnh)
{
	struct list_head *p;
	time_t connect_time = nc->connect_time;

	__list_for_each(p, &nc->ch->devpaths) {
		struct devpath *dpp = list_entry(p, struct devpath, head);

		if (!strcmp(dpp->buts_name, bnh->buts_name))
			return dpp;

		if (dpp->cl_id == bnh->cl_id)
			connect_time = dpp->cl_connect_time;
	}

	return nc_add_dpp(nc, bnh, connect_time);
}

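/*
 * Payload for an existing devpath: grow the mmap()ed window of the
 * per-CPU output file enough for bnh->len bytes, then receive straight
 * into it.
 */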
static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
				 struct blktrace_net_hdr *bnh)
{
	int ret;
	struct io_info *iop = &dpp->ios[bnh->cpu];
	struct mmap_info *mip = &iop->mmap_info;

	if (setup_mmap(iop->ofd, bnh->len, mip, NULL)) {
		fprintf(stderr, "ncd(%s:%d): mmap failed\n",
			nc->ch->hostname, nc->fd);
		exit(1);
	}

	ret = net_recv_data(nc->fd, mip->fs_buf + mip->fs_off, bnh->len);
	if (ret > 0) {
		pdc_dr_update(dpp, bnh->cpu, ret);
		mip->fs_size += ret;
		mip->fs_off += ret;
	} else if (ret < 0)
		exit(1);
}

/*
 * Returns 1 if we closed a host - invalidates other polling information
 * that may be present.
 */
static int net_client_data(struct cl_conn *nc)
{
	int ret;
	struct devpath *dpp;
	struct blktrace_net_hdr bnh;

	ret = net_get_header(nc, &bnh);
	if (ret == 0)
		return 0;

	if (ret < 0) {
		fprintf(stderr, "ncd(%d): header read failed\n", nc->fd);
		exit(1);
	}

	if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
		fprintf(stderr, "ncd(%d): received data is bad\n", nc->fd);
		exit(1);
	}

	if (!data_is_native) {
		bnh.magic = be32_to_cpu(bnh.magic);
		bnh.cpu = be32_to_cpu(bnh.cpu);
		bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
		bnh.len = be32_to_cpu(bnh.len);
		bnh.cl_id = be32_to_cpu(bnh.cl_id);
		bnh.buf_size = be32_to_cpu(bnh.buf_size);
		bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
		bnh.page_size = be32_to_cpu(bnh.page_size);
	}

	if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
		fprintf(stderr, "ncd(%s:%d): bad data magic\n",
			nc->ch->hostname, nc->fd);
		exit(1);
	}

	if (nc->ncpus == -1)
		nc->ncpus = bnh.max_cpus;

	/*
	 * len == 0 means the other end is sending us a new connection/dpp
	 * len == 1 means that the other end signalled end-of-run
	 */
	dpp = nc_find_dpp(nc, &bnh);
	if (bnh.len == 0) {
		/*
		 * Just adding in the dpp above is enough
		 */
		ack_open_close(nc->fd, dpp->buts_name);
		nc->ch->cl_opens++;
	} else if (bnh.len == 1) {
		/*
		 * overload cpu count with dropped events
		 */
		dpp->drops = bnh.cpu;

		ack_open_close(nc->fd, dpp->buts_name);
		if (--nc->ch->cl_opens == 0) {
			show_stats(&nc->ch->devpaths);
			net_ch_remove(nc->ch, nc->ncpus);
			return 1;
		}
	} else
		net_client_read_data(nc, dpp, &bnh);

	return 0;
}

static void handle_client_data(struct net_server_s *ns, int events)
{
	struct cl_conn *nc;
	struct pollfd *pfd;
	struct list_head *p, *q;

	pfd = &ns->pfds[1];
	list_for_each_safe(p, q, &ns->conn_list) {
		if (pfd->revents & POLLIN) {
			nc = list_entry(p, struct cl_conn, ns_head);

			if (net_client_data(nc) || --events == 0)
				break;
		}
		pfd++;
	}
}

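/*
 * Rebuild the poll set before every poll(): slot 0 is the listening
 * socket, and the remaining slots parallel ns->conn_list in order, which
 * handle_client_data() above depends on when walking both together.
 */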
static void net_setup_pfds(struct net_server_s *ns)
{
	struct pollfd *pfd;
	struct list_head *p;

	ns->pfds[0].fd = ns->listen_fd;
	ns->pfds[0].events = POLLIN;

	pfd = &ns->pfds[1];
	__list_for_each(p, &ns->conn_list) {
		struct cl_conn *nc = list_entry(p, struct cl_conn, ns_head);

		pfd->fd = nc->fd;
		pfd->events = POLLIN;
		pfd++;
	}
}

static int net_server_handle_connections(struct net_server_s *ns)
{
	int events;

	printf("server: waiting for connections...\n");

	while (!done) {
		net_setup_pfds(ns);
		events = poll(ns->pfds, ns->connects + 1, -1);
		if (events < 0) {
			if (errno != EINTR) {
				perror("FATAL: poll error");
				return 1;
			}
		} else if (events > 0) {
			if (ns->pfds[0].revents & POLLIN) {
				net_add_connection(ns);
				events--;
			}

			if (events)
				handle_client_data(ns, events);
		}
	}

	return 0;
}

static int net_server(void)
{
	int fd, opt;
	int ret = 1;
	struct net_server_s net_server;
	struct net_server_s *ns = &net_server;

	memset(ns, 0, sizeof(*ns));
	INIT_LIST_HEAD(&ns->ch_list);
	INIT_LIST_HEAD(&ns->conn_list);
	ns->pfds = malloc(sizeof(struct pollfd));

	fd = my_socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("server: socket");
		goto out;
	}

	opt = 1;
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
		perror("setsockopt");
		goto out;
	}

	memset(&ns->addr, 0, sizeof(ns->addr));
	ns->addr.sin_family = AF_INET;
	ns->addr.sin_addr.s_addr = htonl(INADDR_ANY);
	ns->addr.sin_port = htons(net_port);

	if (bind(fd, (struct sockaddr *) &ns->addr, sizeof(ns->addr)) < 0) {
		perror("bind");
		goto out;
	}

	if (listen(fd, 1) < 0) {
		perror("listen");
		goto out;
	}

	/*
	 * The actual server looping is done here:
	 */
	ns->listen_fd = fd;
	ret = net_server_handle_connections(ns);

	/*
	 * Clean up and return... (close the listen fd on the error
	 * paths too, so it isn't leaked)
	 */
out:
	if (fd >= 0)
		close(fd);
	free(ns->pfds);
	return ret;
}

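/*
 * Normal (non-server) operation: set up the kernel side, start one
 * tracer thread per CPU, and only arm the traces (and the optional
 * stopwatch alarm) once every thread has come up; otherwise tear down.
 */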
static int run_tracers(void)
{
	atexit(exit_tracing);
	if (net_mode == Net_client)
		printf("blktrace: connecting to %s\n", hostname);

	setup_buts();

	if (use_tracer_devpaths()) {
		if (setup_tracer_devpaths())
			return 1;

		if (piped_output)
			handle_list = handle_list_file;
		else
			handle_list = handle_list_net;
	}

	start_tracers();
	if (nthreads_running == ncpus) {
		unblock_tracers();
		start_buts();
		if (net_mode == Net_client)
			printf("blktrace: connected!\n");
		if (stop_watch)
			alarm(stop_watch);
	} else
		stop_tracers();

	wait_tracers();
	if (nthreads_running == ncpus)
		show_stats(&devpaths);
	if (net_client_use_send())
		close_client_connections();
	del_tracers();

	return 0;
}

int main(int argc, char *argv[])
{
	int ret = 0;

	setlocale(LC_NUMERIC, "en_US");
	pagesize = getpagesize();
	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus < 0) {
		fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed %d/%s\n",
			errno, strerror(errno));
		ret = 1;
		goto out;
	} else if (handle_args(argc, argv)) {
		ret = 1;
		goto out;
	}

	if (ndevs > 1 && output_name && strcmp(output_name, "-") != 0) {
		fprintf(stderr, "-o not supported with multiple devices\n");
		ret = 1;
		goto out;
	}

	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);
	signal(SIGALRM, handle_sigint);
	signal(SIGPIPE, SIG_IGN);

	if (kill_running_trace) {
		struct devpath *dpp;
		struct list_head *p;

		__list_for_each(p, &devpaths) {
			dpp = list_entry(p, struct devpath, head);
			if (__stop_trace(dpp->fd)) {
				fprintf(stderr,
					"BLKTRACETEARDOWN %s failed: %d/%s\n",
					dpp->path, errno, strerror(errno));
			}
		}
	} else if (net_mode == Net_server) {
		if (output_name) {
			fprintf(stderr, "-o ignored in server mode\n");
			output_name = NULL;
		}
		ret = net_server();
	} else
		ret = run_tracers();

out:
	if (pfp)
		fclose(pfp);
	rel_devpaths();
	return ret;
}
2708 }