blktrace: Reorganize creation of output file name
/*
 * block queue tracing application
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 * Rewrite to have a single thread per CPU (managing all devices on that CPU)
 *      Alan D. Brunelle <alan.brunelle@hp.com> - January 2009
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <getopt.h>
#include <sched.h>
#include <unistd.h>
#include <poll.h>
#include <signal.h>
#include <pthread.h>
#include <locale.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <sys/sendfile.h>

#include "btt/list.h"
#include "blktrace.h"

/*
 * You may want to increase this even more, if you are logging at a high
 * rate and see skipped/missed events
 */
#define BUF_SIZE                (512 * 1024)
#define BUF_NR                  (4)
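
/*
 * Back-of-the-envelope (illustrative): with the defaults above, the kernel
 * relay channel allocates BUF_SIZE * BUF_NR = 512KiB * 4 = 2MiB of
 * sub-buffer space per CPU for each traced device, so tracing four devices
 * on an eight-CPU machine claims roughly 4 * 8 * 2MiB = 64MiB. Scale with
 * -b/-n accordingly.
 */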

#define FILE_VBUF_SIZE          (128 * 1024)

#define DEBUGFS_TYPE            (0x64626720)
#define TRACE_NET_PORT          (8462)

enum {
        Net_none = 0,
        Net_server,
        Net_client,
};

enum thread_status {
        Th_running,
        Th_leaving,
        Th_error
};

/*
 * Generic stats collected: nevents can be _roughly_ estimated by data_read
 * (discounting pdu...)
 *
 * These fields are updated w/ pdc_dr_update & pdc_nev_update below.
 */
struct pdc_stats {
        unsigned long long data_read;
        unsigned long long nevents;
};

struct devpath {
        struct list_head head;
        char *path;                     /* path to device special file */
        char *buts_name;                /* name returned from bt kernel code */
        struct pdc_stats *stats;
        int fd, ncpus;
        unsigned long long drops;

        /*
         * For piped output only:
         *
         * Each tracer will have a tracer_devpath_head that it will add new
         * data onto. Its list is protected by its mutex
         * (tracer_devpath_head.mutex), and the tracer signals the
         * processing thread using the dp_cond, dp_mutex & dp_entries
         * variables defined below.
         */
        struct tracer_devpath_head *heads;

        /*
         * For network server mode only:
         */
        struct cl_host *ch;
        u32 cl_id;
        time_t cl_connect_time;
        struct io_info *ios;
};

/*
 * For piped output to stdout we will have each tracer thread (one per dev)
 * tack buffers read from the relay queues onto a per-device list.
 *
 * The main thread will then collect trace buffers from each of the lists
 * in turn.
 *
 * We will use a mutex to guard each of the trace_buf lists. The tracers
 * can then signal the main thread using <dp_cond,dp_mutex> and
 * dp_entries. (When a tracer adds an entry while dp_entries is 0, it
 * signals; while dp_entries is 0, the main thread waits for that
 * condition to be signalled.)
 *
 * adb: It may be better just to have a large buffer per tracer per dev,
 * and then use it as a ring-buffer. This would certainly cut down a lot
 * of malloc/free thrashing, at the cost of (potentially) more memory
 * movement.
 */
struct trace_buf {
        struct list_head head;
        struct devpath *dpp;
        void *buf;
        int cpu, len;
};
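
/*
 * A minimal sketch (illustrative only) of the hand-off described above,
 * using the helpers defined later in this file:
 *
 *      Tracer thread:                          Main thread:
 *
 *      tbp->len = read(...);                   while (wait_empty_entries())
 *      add_trace_buf(dpp, cpu, &tbp);                  __process_trace_bufs();
 *      incr_entries(1);
 *
 * incr_entries() signals dp_cond only when dp_entries was 0, which is
 * exactly the condition wait_empty_entries() sleeps on.
 */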

struct tracer_devpath_head {
        pthread_mutex_t mutex;
        struct list_head head;
        struct trace_buf *prev;
};

/*
 * Used to handle the mmap() interfaces for the output file (containing traces)
 */
struct mmap_info {
        void *fs_buf;
        unsigned long long fs_size, fs_max_size, fs_off, fs_buf_len;
        unsigned long buf_size, buf_nr;
        int pagesize;
};

/*
 * Each thread doing work on a (client) side of blktrace will have one
 * of these. The ios array contains input/output information, pfds holds
 * poll() data. The volatiles provide flags to/from the main executing
 * thread.
 */
struct tracer {
        struct list_head head;
        struct io_info *ios;
        struct pollfd *pfds;
        pthread_t thread;
        int cpu, nios;
        volatile int status, is_done;
};

/*
 * Networking stuff follows. We include a magic number so we know whether
 * endianness conversion is needed or not.
 *
 * The len field is overloaded:
 *      0 - Indicates an "open" - allowing the server to set up for a dev/cpu
 *      1 - Indicates a "close" - shut down the connection in an orderly way
 *      2 - Indicates an "ack" - the server's reply to an open or close
 *
 * The cpu field is overloaded on close: it will contain the number of drops.
 */
struct blktrace_net_hdr {
        u32 magic;              /* same as trace magic */
        char buts_name[32];     /* trace name */
        u32 cpu;                /* for which cpu */
        u32 max_cpus;
        u32 len;                /* length of following trace data */
        u32 cl_id;              /* id for set of client per-cpu connections */
        u32 buf_size;           /* client buf_size for this trace */
        u32 buf_nr;             /* client buf_nr for this trace */
        u32 page_size;          /* client page_size for this trace */
};
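
/*
 * Receiver-side sketch (illustrative; bswap_32() is from <byteswap.h>):
 * the magic field lets a peer detect a byte-swapped sender before it
 * trusts any other field:
 *
 *      if (bnh->magic == BLK_IO_TRACE_MAGIC)
 *              ...same endianness, use fields directly...
 *      else if (bswap_32(bnh->magic) == BLK_IO_TRACE_MAGIC)
 *              ...swap every u32 field with bswap_32() before use...
 *      else
 *              ...reject: not a blktrace peer...
 */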

/*
 * Each host encountered has one of these. The head is used to link this
 * on to the network server's ch_list. Connections associated with this
 * host are linked on conn_list, and any devices traced on that host
 * are linked on the devpaths list.
 */
struct cl_host {
        struct list_head head;
        struct list_head conn_list;
        struct list_head devpaths;
        struct net_server_s *ns;
        char *hostname;
        struct in_addr cl_in_addr;
        int connects, ndevs, cl_opens;
};

/*
 * Each connection (client to server socket ('fd')) has one of these. A
 * back reference to the host ('ch') and list heads (for the host's
 * list, and the network server's conn_list) are also included.
 */
struct cl_conn {
        struct list_head ch_head, ns_head;
        struct cl_host *ch;
        int fd, ncpus;
        time_t connect_time;
};

/*
 * The network server requires some poll structures to be maintained -
 * one per connection currently on conn_list. The nchs/ch_list values
 * track the hosts connected to this server. The addr field is used as
 * scratch space while new connections are established.
 */
struct net_server_s {
        struct list_head conn_list;
        struct list_head ch_list;
        struct pollfd *pfds;
        int listen_fd, connects, nchs;
        struct sockaddr_in addr;
};

/*
 * This structure is (generically) used to provide information
 * for a read-to-write set of values.
 *
 * ifn & ifd represent input information.
 *
 * ofn, ofd, ofp, obuf & mmap_info are used for the (optional) output file.
 */
struct io_info {
        struct devpath *dpp;
        FILE *ofp;
        char *obuf;
        struct cl_conn *nc;     /* Server network connection */

        /*
         * mmap controlled output files
         */
        struct mmap_info mmap_info;

        /*
         * Client network fields
         */
        unsigned int ready;
        unsigned long long data_queued;

        /*
         * Input/output file descriptors & names
         */
        int ifd, ofd;
        char ifn[MAXPATHLEN + 64];
        char ofn[MAXPATHLEN + 64];
};

static char blktrace_version[] = "2.0.0";

/*
 * Linkage to blktrace helper routines (trace conversions)
 */
int data_is_native = -1;

static int ndevs;
static int max_cpus;
static int ncpus;
static cpu_set_t *online_cpus;
static int pagesize;
static int act_mask = ~0U;
static int kill_running_trace;
static int stop_watch;
static int piped_output;

static char *debugfs_path = "/sys/kernel/debug";
static char *output_name;
static char *output_dir;

static unsigned long buf_size = BUF_SIZE;
static unsigned long buf_nr = BUF_NR;

static FILE *pfp;

static LIST_HEAD(devpaths);
static LIST_HEAD(tracers);

static volatile int done;

/*
 * tracer threads add entries, the main thread takes them off and processes
 * them. These protect the dp_entries variable.
 */
static pthread_cond_t dp_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile int dp_entries;

/*
 * These synchronize master / thread interactions.
 */
static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
static volatile int nthreads_running;
static volatile int nthreads_leaving;
static volatile int nthreads_error;
static volatile int tracers_run;

/*
 * network cmd line params
 */
static struct sockaddr_in hostname_addr;
static char hostname[MAXHOSTNAMELEN];
static int net_port = TRACE_NET_PORT;
static int net_use_sendfile = 1;
static int net_mode;
static int *cl_fds;

static int (*handle_pfds)(struct tracer *, int, int);
static int (*handle_list)(struct tracer_devpath_head *, struct list_head *);

#define S_OPTS  "d:a:A:r:o:kw:vVb:n:D:lh:p:sI:"
static struct option l_opts[] = {
        {
                .name = "dev",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'd'
        },
        {
                .name = "input-devs",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'I'
        },
        {
                .name = "act-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'a'
        },
        {
                .name = "set-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'A'
        },
        {
                .name = "relay",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'r'
        },
        {
                .name = "output",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'o'
        },
        {
                .name = "kill",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'k'
        },
        {
                .name = "stopwatch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'w'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'v'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'V'
        },
        {
                .name = "buffer-size",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'b'
        },
        {
                .name = "num-sub-buffers",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'n'
        },
        {
                .name = "output-dir",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'D'
        },
        {
                .name = "listen",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'l'
        },
        {
                .name = "host",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'h'
        },
        {
                .name = "port",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'p'
        },
        {
                .name = "no-sendfile",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 's'
        },
        {
                .name = NULL,
        }
};

static char usage_str[] = "\n\n" \
        "-d <dev>             | --dev=<dev>\n" \
        "[ -r <debugfs path>  | --relay=<debugfs path> ]\n" \
        "[ -o <file>          | --output=<file>]\n" \
        "[ -D <dir>           | --output-dir=<dir>]\n" \
        "[ -w <time>          | --stopwatch=<time>]\n" \
        "[ -a <action field>  | --act-mask=<action field>]\n" \
        "[ -A <action mask>   | --set-mask=<action mask>]\n" \
        "[ -b <size>          | --buffer-size]\n" \
        "[ -n <number>        | --num-sub-buffers=<number>]\n" \
        "[ -l                 | --listen]\n" \
        "[ -h <hostname>      | --host=<hostname>]\n" \
        "[ -p <port number>   | --port=<port number>]\n" \
        "[ -s                 | --no-sendfile]\n" \
        "[ -I <devs file>     | --input-devs=<devs file>]\n" \
        "[ -v <version>       | --version]\n" \
        "[ -V <version>       | --version]\n" \

        "\t-d Use specified device. May also be given last after options\n" \
        "\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
        "\t-o File(s) to send output to\n" \
        "\t-D Directory to prepend to output file names\n" \
        "\t-w Stop after defined time, in seconds\n" \
        "\t-a Only trace specified actions. See documentation\n" \
        "\t-A Give trace mask as a single value. See documentation\n" \
        "\t-b Sub buffer size in KiB (default 512)\n" \
        "\t-n Number of sub buffers (default 4)\n" \
        "\t-l Run in network listen mode (blktrace server)\n" \
        "\t-h Run in network client mode, connecting to the given host\n" \
        "\t-p Network port to use (default 8462)\n" \
        "\t-s Make the network client NOT use sendfile() to transfer data\n" \
        "\t-I Add devices found in <devs file>\n" \
        "\t-v Print program version info\n" \
        "\t-V Print program version info\n\n";

static void clear_events(struct pollfd *pfd)
{
        pfd->events = 0;
        pfd->revents = 0;
}

static inline int net_client_use_sendfile(void)
{
        return net_mode == Net_client && net_use_sendfile;
}

static inline int net_client_use_send(void)
{
        return net_mode == Net_client && !net_use_sendfile;
}

static inline int use_tracer_devpaths(void)
{
        return piped_output || net_client_use_send();
}

static inline int in_addr_eq(struct in_addr a, struct in_addr b)
{
        return a.s_addr == b.s_addr;
}

static inline void pdc_dr_update(struct devpath *dpp, int cpu, int data_read)
{
        dpp->stats[cpu].data_read += data_read;
}

static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
{
        dpp->stats[cpu].nevents += nevents;
}

static void show_usage(char *prog)
{
        fprintf(stderr, "Usage: %s %s", prog, usage_str);
}

/*
 * Create a timespec 'msec' milliseconds into the future
 */
static inline void make_timespec(struct timespec *tsp, long delta_msec)
{
        struct timeval now;

        gettimeofday(&now, NULL);
        tsp->tv_sec = now.tv_sec;
        tsp->tv_nsec = 1000L * now.tv_usec;

        tsp->tv_nsec += (delta_msec * 1000000L);
        if (tsp->tv_nsec >= 1000000000L) {
                long secs = tsp->tv_nsec / 1000000000L;

                tsp->tv_sec += secs;
                tsp->tv_nsec -= (secs * 1000000000L);
        }
}

/*
 * Add a timer to ensure wait ends
 */
static void t_pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
{
        struct timespec ts;

        make_timespec(&ts, 50);
        pthread_cond_timedwait(cond, mutex, &ts);
}

static void unblock_tracers(void)
{
        pthread_mutex_lock(&mt_mutex);
        tracers_run = 1;
        pthread_cond_broadcast(&mt_cond);
        pthread_mutex_unlock(&mt_mutex);
}

static void tracer_wait_unblock(struct tracer *tp)
{
        pthread_mutex_lock(&mt_mutex);
        while (!tp->is_done && !tracers_run)
                pthread_cond_wait(&mt_cond, &mt_mutex);
        pthread_mutex_unlock(&mt_mutex);
}

static void tracer_signal_ready(struct tracer *tp,
                                enum thread_status th_status,
                                int status)
{
        pthread_mutex_lock(&mt_mutex);
        tp->status = status;

        if (th_status == Th_running)
                nthreads_running++;
        else if (th_status == Th_error)
                nthreads_error++;
        else
                nthreads_leaving++;

        pthread_cond_signal(&mt_cond);
        pthread_mutex_unlock(&mt_mutex);
}

static void wait_tracers_ready(int ncpus_started)
{
        pthread_mutex_lock(&mt_mutex);
        while ((nthreads_running + nthreads_error) < ncpus_started)
                t_pthread_cond_wait(&mt_cond, &mt_mutex);
        pthread_mutex_unlock(&mt_mutex);
}

static void wait_tracers_leaving(void)
{
        pthread_mutex_lock(&mt_mutex);
        while (nthreads_leaving < nthreads_running)
                t_pthread_cond_wait(&mt_cond, &mt_mutex);
        pthread_mutex_unlock(&mt_mutex);
}

static void init_mmap_info(struct mmap_info *mip)
{
        mip->buf_size = buf_size;
        mip->buf_nr = buf_nr;
        mip->pagesize = pagesize;
}

static void net_close_connection(int *fd)
{
        shutdown(*fd, SHUT_RDWR);
        close(*fd);
        *fd = -1;
}

static void dpp_free(struct devpath *dpp)
{
        if (dpp->stats)
                free(dpp->stats);
        if (dpp->ios)
                free(dpp->ios);
        if (dpp->path)
                free(dpp->path);
        if (dpp->buts_name)
                free(dpp->buts_name);
        free(dpp);
}

static int lock_on_cpu(int cpu)
{
        cpu_set_t *cpu_mask;
        size_t size;

        cpu_mask = CPU_ALLOC(max_cpus);
        size = CPU_ALLOC_SIZE(max_cpus);

        CPU_ZERO_S(size, cpu_mask);
        CPU_SET_S(cpu, size, cpu_mask);
        if (sched_setaffinity(0, size, cpu_mask) < 0) {
                CPU_FREE(cpu_mask);
                return errno;
        }

        CPU_FREE(cpu_mask);
        return 0;
}

static int increase_limit(int resource, rlim_t increase)
{
        struct rlimit rlim;
        int save_errno = errno;

        if (!getrlimit(resource, &rlim)) {
                rlim.rlim_cur += increase;
                if (rlim.rlim_cur >= rlim.rlim_max)
                        rlim.rlim_max = rlim.rlim_cur + increase;

                if (!setrlimit(resource, &rlim))
                        return 1;
        }

        errno = save_errno;
        return 0;
}

static int handle_open_failure(void)
{
        if (errno == ENFILE || errno == EMFILE)
                return increase_limit(RLIMIT_NOFILE, 16);
        return 0;
}

static int handle_mem_failure(size_t length)
{
        if (errno == ENFILE)
                return handle_open_failure();
        else if (errno == ENOMEM)
                return increase_limit(RLIMIT_MEMLOCK, 2 * length);
        return 0;
}
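
/*
 * The my_*() wrappers below lean on the two handlers above: a failed call
 * is simply retried for as long as the handler can buy more headroom, e.g.:
 *
 *      do {
 *              fd = open(path, flags);
 *      } while (fd < 0 && handle_open_failure());
 *
 * handle_open_failure() returns non-zero only when it actually managed to
 * raise RLIMIT_NOFILE, so the loops cannot spin forever on a hard failure.
 */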

static FILE *my_fopen(const char *path, const char *mode)
{
        FILE *fp;

        do {
                fp = fopen(path, mode);
        } while (fp == NULL && handle_open_failure());

        return fp;
}

static int my_open(const char *path, int flags)
{
        int fd;

        do {
                fd = open(path, flags);
        } while (fd < 0 && handle_open_failure());

        return fd;
}

static int my_socket(int domain, int type, int protocol)
{
        int fd;

        do {
                fd = socket(domain, type, protocol);
        } while (fd < 0 && handle_open_failure());

        return fd;
}

static int my_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
{
        int fd;

        do {
                fd = accept(sockfd, addr, addrlen);
        } while (fd < 0 && handle_open_failure());

        return fd;
}

static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
                     off_t offset)
{
        void *new;

        do {
                new = mmap(addr, length, prot, flags, fd, offset);
        } while (new == MAP_FAILED && handle_mem_failure(length));

        return new;
}

static int my_mlock(struct tracer *tp,
                    const void *addr, size_t len)
{
        int ret, retry = 0;

        do {
                ret = mlock(addr, len);
                if ((retry >= 10) && tp && tp->is_done)
                        break;
                retry++;
        } while (ret < 0 && handle_mem_failure(len));

        return ret;
}

static int setup_mmap(int fd, unsigned int maxlen,
                      struct mmap_info *mip,
                      struct tracer *tp)
{
        if (mip->fs_off + maxlen > mip->fs_buf_len) {
                unsigned long nr = max(16, mip->buf_nr);

                if (mip->fs_buf) {
                        munlock(mip->fs_buf, mip->fs_buf_len);
                        munmap(mip->fs_buf, mip->fs_buf_len);
                        mip->fs_buf = NULL;
                }

                mip->fs_off = mip->fs_size & (mip->pagesize - 1);
                mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
                mip->fs_max_size += mip->fs_buf_len;

                if (ftruncate(fd, mip->fs_max_size) < 0) {
                        perror("setup_mmap: ftruncate");
                        return 1;
                }

                mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
                                      MAP_SHARED, fd,
                                      mip->fs_size - mip->fs_off);
                if (mip->fs_buf == MAP_FAILED) {
                        perror("setup_mmap: mmap");
                        return 1;
                }
                if (my_mlock(tp, mip->fs_buf, mip->fs_buf_len) < 0) {
                        perror("setup_mmap: mlock");
                        return 1;
                }
        }

        return 0;
}
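
/*
 * Worked example (illustrative) of the window arithmetic above, assuming
 * pagesize = 4096, buf_size = 512KiB, buf_nr = 4 and a fresh output file
 * (fs_size = 0): nr = max(16, 4) = 16, fs_off = 0 & 4095 = 0, so an 8MiB
 * window (16 * 512KiB) is ftruncate()d, mapped and locked. Once writes
 * push fs_size past that window, the next call unmaps it and maps a fresh
 * 8MiB window starting at fs_size rounded down to a page boundary, with
 * fs_off carrying the remainder so every mmap() offset stays page-aligned.
 */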

static int __stop_trace(int fd)
{
        /*
         * Should be stopped, don't complain if it isn't
         */
        ioctl(fd, BLKTRACESTOP);
        return ioctl(fd, BLKTRACETEARDOWN);
}

static int write_data(char *buf, int len)
{
        int ret;

rewrite:
        ret = fwrite(buf, len, 1, pfp);
        if (ferror(pfp) || ret != 1) {
                if (errno == EINTR) {
                        clearerr(pfp);
                        goto rewrite;
                }

                if (!piped_output || (errno != EPIPE && errno != EBADF)) {
                        fprintf(stderr, "write(%d) failed: %d/%s\n",
                                len, errno, strerror(errno));
                }
                goto err;
        }

        fflush(pfp);
        return 0;

err:
        clearerr(pfp);
        return 1;
}

/*
 * Returns the number of bytes read (successfully)
 */
static int __net_recv_data(int fd, void *buf, unsigned int len)
{
        unsigned int bytes_left = len;

        while (bytes_left && !done) {
                int ret = recv(fd, buf, bytes_left, MSG_WAITALL);

                if (ret == 0)
                        break;
                else if (ret < 0) {
                        if (errno == EAGAIN) {
                                usleep(50);
                                continue;
                        }
                        perror("server: net_recv_data: recv failed");
                        break;
                } else {
                        buf += ret;
                        bytes_left -= ret;
                }
        }

        return len - bytes_left;
}

static int net_recv_data(int fd, void *buf, unsigned int len)
{
        return __net_recv_data(fd, buf, len);
}

/*
 * Returns number of bytes written
 */
static int net_send_data(int fd, void *buf, unsigned int buf_len)
{
        int ret;
        unsigned int bytes_left = buf_len;

        while (bytes_left) {
                ret = send(fd, buf, bytes_left, 0);
                if (ret < 0) {
                        perror("send");
                        break;
                }

                buf += ret;
                bytes_left -= ret;
        }

        return buf_len - bytes_left;
}

static int net_send_header(int fd, int cpu, char *buts_name, int len)
{
        struct blktrace_net_hdr hdr;

        memset(&hdr, 0, sizeof(hdr));

        hdr.magic = BLK_IO_TRACE_MAGIC;
        memset(hdr.buts_name, 0, sizeof(hdr.buts_name));
        strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
        hdr.buts_name[sizeof(hdr.buts_name) - 1] = '\0';
        hdr.cpu = cpu;
        hdr.max_cpus = max_cpus;
        hdr.len = len;
        hdr.cl_id = getpid();
        hdr.buf_size = buf_size;
        hdr.buf_nr = buf_nr;
        hdr.page_size = pagesize;

        return net_send_data(fd, &hdr, sizeof(hdr)) != sizeof(hdr);
}

static void net_send_open_close(int fd, int cpu, char *buts_name, int len)
{
        struct blktrace_net_hdr ret_hdr;

        net_send_header(fd, cpu, buts_name, len);
        net_recv_data(fd, &ret_hdr, sizeof(ret_hdr));
}

static void net_send_open(int fd, int cpu, char *buts_name)
{
        net_send_open_close(fd, cpu, buts_name, 0);
}

static void net_send_close(int fd, char *buts_name, int drops)
{
        /*
         * Overload CPU w/ number of drops
         *
         * XXX: Need to clear/set done around call - done=1 (which
         * is true here) stops reads from happening... :-(
         */
        done = 0;
        net_send_open_close(fd, drops, buts_name, 1);
        done = 1;
}

static void ack_open_close(int fd, char *buts_name)
{
        net_send_header(fd, 0, buts_name, 2);
}
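
/*
 * Control-message summary (as implemented above): a header with len == 0
 * opens a dev/cpu stream, len == 1 closes it (with the cpu field carrying
 * the drop count), and the server acks either one with len == 2. A full
 * client-side exchange for one device on cpu 0 thus looks like:
 *
 *      net_send_open(fd, 0, buts_name);        header out, ack back
 *      ... trace data ...
 *      net_send_close(fd, buts_name, drops);   header out, ack back
 */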

static void net_send_drops(int fd)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                net_send_close(fd, dpp->buts_name, dpp->drops);
        }
}

/*
 * Returns:
 *       0: "EOF"
 *       1: OK
 *      -1: Error
 */
static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
{
        int bytes_read;
        int fl = fcntl(nc->fd, F_GETFL);

        fcntl(nc->fd, F_SETFL, fl | O_NONBLOCK);
        bytes_read = __net_recv_data(nc->fd, bnh, sizeof(*bnh));
        fcntl(nc->fd, F_SETFL, fl & ~O_NONBLOCK);

        if (bytes_read == sizeof(*bnh))
                return 1;
        else if (bytes_read == 0)
                return 0;
        else
                return -1;
}

static int net_setup_addr(void)
{
        struct sockaddr_in *addr = &hostname_addr;

        memset(addr, 0, sizeof(*addr));
        addr->sin_family = AF_INET;
        addr->sin_port = htons(net_port);

        if (inet_aton(hostname, &addr->sin_addr) != 1) {
                struct hostent *hent;
retry:
                hent = gethostbyname(hostname);
                if (!hent) {
                        if (h_errno == TRY_AGAIN) {
                                usleep(100);
                                goto retry;
                        } else if (h_errno == NO_RECOVERY) {
                                fprintf(stderr, "gethostbyname(%s): "
                                        "non-recoverable error encountered\n",
                                        hostname);
                        } else {
                                /*
                                 * HOST_NOT_FOUND, NO_ADDRESS or NO_DATA
                                 */
                                fprintf(stderr, "Host %s not found\n",
                                        hostname);
                        }
                        return 1;
                }

                memcpy(&addr->sin_addr, hent->h_addr, 4);
                memset(hostname, 0, sizeof(hostname));
                strncpy(hostname, hent->h_name, sizeof(hostname));
                hostname[sizeof(hostname) - 1] = '\0';
        }

        return 0;
}

static int net_setup_client(void)
{
        int fd;
        struct sockaddr_in *addr = &hostname_addr;

        fd = my_socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0) {
                perror("client: socket");
                return -1;
        }

        if (connect(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0) {
                if (errno == ECONNREFUSED)
                        fprintf(stderr,
                                "\nclient: Connection to %s refused, "
                                "perhaps the server is not started?\n\n",
                                hostname);
                else
                        perror("client: connect");

                close(fd);
                return -1;
        }

        return fd;
}

static int open_client_connections(void)
{
        int cpu;
        size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);

        cl_fds = calloc(max_cpus, sizeof(*cl_fds));
        for (cpu = 0; cpu < max_cpus; cpu++) {
                if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
                        continue;
                cl_fds[cpu] = net_setup_client();
                if (cl_fds[cpu] < 0)
                        goto err;
        }
        return 0;

err:
        while (cpu-- > 0) {
                if (CPU_ISSET_S(cpu, alloc_size, online_cpus) &&
                    cl_fds[cpu] >= 0)
                        close(cl_fds[cpu]);
        }
        free(cl_fds);
        return 1;
}

static void close_client_connections(void)
{
        if (cl_fds) {
                int cpu, *fdp;
                size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);

                for (cpu = 0, fdp = cl_fds; cpu < max_cpus; cpu++, fdp++) {
                        if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
                                continue;
                        if (*fdp >= 0) {
                                net_send_drops(*fdp);
                                net_close_connection(fdp);
                        }
                }
                free(cl_fds);
        }
}

static void setup_buts(void)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct blk_user_trace_setup buts;
                struct devpath *dpp = list_entry(p, struct devpath, head);

                memset(&buts, 0, sizeof(buts));
                buts.buf_size = buf_size;
                buts.buf_nr = buf_nr;
                buts.act_mask = act_mask;

                if (ioctl(dpp->fd, BLKTRACESETUP, &buts) >= 0) {
                        dpp->ncpus = max_cpus;
                        dpp->buts_name = strdup(buts.name);
                        if (dpp->stats)
                                free(dpp->stats);
                        dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
                        memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
                } else
                        fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
                                dpp->path, errno, strerror(errno));
        }
}

static void start_buts(void)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
                        fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
                                dpp->path, errno, strerror(errno));
                }
        }
}
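
/*
 * Kernel-side lifecycle driven by this file (see also __stop_trace()):
 *
 *      ioctl(fd, BLKTRACESETUP, &buts);        allocate relay buffers
 *      ioctl(fd, BLKTRACESTART);               begin emitting events
 *      ioctl(fd, BLKTRACESTOP);                quiesce
 *      ioctl(fd, BLKTRACETEARDOWN);            free kernel state
 *
 * setup_buts() and start_buts() perform the first two for every device on
 * the devpaths list; rel_devpaths() performs the last two at exit.
 */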

static int get_drops(struct devpath *dpp)
{
        int fd, drops = 0;
        char fn[MAXPATHLEN + 64], tmp[256] = { 0 };

        snprintf(fn, sizeof(fn), "%s/block/%s/dropped", debugfs_path,
                 dpp->buts_name);

        fd = my_open(fn, O_RDONLY);
        if (fd < 0) {
                /*
                 * This may be ok: the kernel may not support
                 * dropped counts.
                 */
                if (errno != ENOENT)
                        fprintf(stderr, "Could not open %s: %d/%s\n",
                                fn, errno, strerror(errno));
                return 0;
        } else if (read(fd, tmp, sizeof(tmp) - 1) < 0) {
                fprintf(stderr, "Could not read %s: %d/%s\n",
                        fn, errno, strerror(errno));
        } else
                drops = atoi(tmp);
        close(fd);

        return drops;
}

static void get_all_drops(void)
{
        struct list_head *p;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                dpp->drops = get_drops(dpp);
        }
}

static inline struct trace_buf *alloc_trace_buf(int cpu, int bufsize)
{
        struct trace_buf *tbp;

        tbp = malloc(sizeof(*tbp) + bufsize);
        INIT_LIST_HEAD(&tbp->head);
        tbp->len = 0;
        tbp->buf = (void *)(tbp + 1);
        tbp->cpu = cpu;
        tbp->dpp = NULL;        /* Will be set when tbp is added */

        return tbp;
}

static void free_tracer_heads(struct devpath *dpp)
{
        int cpu;
        struct tracer_devpath_head *hd;

        for (cpu = 0, hd = dpp->heads; cpu < max_cpus; cpu++, hd++) {
                if (hd->prev)
                        free(hd->prev);

                pthread_mutex_destroy(&hd->mutex);
        }
        free(dpp->heads);
}

static int setup_tracer_devpaths(void)
{
        struct list_head *p;

        if (net_client_use_send())
                if (open_client_connections())
                        return 1;

        __list_for_each(p, &devpaths) {
                int cpu;
                struct tracer_devpath_head *hd;
                struct devpath *dpp = list_entry(p, struct devpath, head);

                dpp->heads = calloc(max_cpus, sizeof(struct tracer_devpath_head));
                for (cpu = 0, hd = dpp->heads; cpu < max_cpus; cpu++, hd++) {
                        INIT_LIST_HEAD(&hd->head);
                        pthread_mutex_init(&hd->mutex, NULL);
                        hd->prev = NULL;
                }
        }

        return 0;
}

static inline void add_trace_buf(struct devpath *dpp, int cpu,
                                                struct trace_buf **tbpp)
{
        struct trace_buf *tbp = *tbpp;
        struct tracer_devpath_head *hd = &dpp->heads[cpu];

        tbp->dpp = dpp;

        pthread_mutex_lock(&hd->mutex);
        list_add_tail(&tbp->head, &hd->head);
        pthread_mutex_unlock(&hd->mutex);

        *tbpp = alloc_trace_buf(cpu, buf_size);
}

static inline void incr_entries(int entries_handled)
{
        pthread_mutex_lock(&dp_mutex);
        if (dp_entries == 0)
                pthread_cond_signal(&dp_cond);
        dp_entries += entries_handled;
        pthread_mutex_unlock(&dp_mutex);
}

static void decr_entries(int handled)
{
        pthread_mutex_lock(&dp_mutex);
        dp_entries -= handled;
        pthread_mutex_unlock(&dp_mutex);
}

static int wait_empty_entries(void)
{
        pthread_mutex_lock(&dp_mutex);
        while (!done && dp_entries == 0)
                t_pthread_cond_wait(&dp_cond, &dp_mutex);
        pthread_mutex_unlock(&dp_mutex);

        return !done;
}

static int add_devpath(char *path)
{
        int fd;
        struct devpath *dpp;
        struct list_head *p;

        /*
         * Verify device is not duplicated
         */
        __list_for_each(p, &devpaths) {
                struct devpath *tmp = list_entry(p, struct devpath, head);
                if (!strcmp(tmp->path, path))
                        return 0;
        }
        /*
         * Verify device is valid before going too far
         */
        fd = my_open(path, O_RDONLY | O_NONBLOCK);
        if (fd < 0) {
                fprintf(stderr, "Invalid path %s specified: %d/%s\n",
                        path, errno, strerror(errno));
                return 1;
        }

        dpp = malloc(sizeof(*dpp));
        memset(dpp, 0, sizeof(*dpp));
        dpp->path = strdup(path);
        dpp->fd = fd;
        ndevs++;
        list_add_tail(&dpp->head, &devpaths);

        return 0;
}

static void rel_devpaths(void)
{
        struct list_head *p, *q;

        list_for_each_safe(p, q, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                list_del(&dpp->head);
                __stop_trace(dpp->fd);
                close(dpp->fd);

                if (dpp->heads)
                        free_tracer_heads(dpp);

                dpp_free(dpp);
                ndevs--;
        }
}

static int flush_subbuf_net(struct trace_buf *tbp)
{
        int fd = cl_fds[tbp->cpu];
        struct devpath *dpp = tbp->dpp;

        if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
                return 1;
        else if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
                return 1;

        return 0;
}

static int
handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
                struct list_head *list)
{
        struct trace_buf *tbp;
        struct list_head *p, *q;
        int entries_handled = 0;

        list_for_each_safe(p, q, list) {
                tbp = list_entry(p, struct trace_buf, head);

                list_del(&tbp->head);
                entries_handled++;

                if (cl_fds[tbp->cpu] >= 0) {
                        if (flush_subbuf_net(tbp)) {
                                close(cl_fds[tbp->cpu]);
                                cl_fds[tbp->cpu] = -1;
                        }
                }

                free(tbp);
        }

        return entries_handled;
}

/*
 * Tack 'tbp's buf onto the tail of 'prev's buf
 */
static struct trace_buf *tb_combine(struct trace_buf *prev,
                                    struct trace_buf *tbp)
{
        unsigned long tot_len;

        tot_len = prev->len + tbp->len;
        if (tot_len > buf_size) {
                /*
                 * prev->head isn't connected (it was taken
                 * off of the list when it became 'prev').
                 * Therefore, we can realloc the whole
                 * structure, as the other fields are
                 * "static".
                 */
                prev = realloc(prev, sizeof(*prev) + tot_len);
                prev->buf = (void *)(prev + 1);
        }

        memcpy(prev->buf + prev->len, tbp->buf, tbp->len);
        prev->len = tot_len;

        free(tbp);
        return prev;
}
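
/*
 * Example (illustrative, with the default 512KiB buf_size): combining a
 * 300KiB leftover with a 300KiB new buffer yields 600KiB, more than
 * alloc_trace_buf() sized the allocation for, so 'prev' is realloc()ed
 * before the memcpy(). 'prev' is always off-list here, so moving it is
 * safe.
 */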

static int handle_list_file(struct tracer_devpath_head *hd,
                            struct list_head *list)
{
        int off, t_len, nevents;
        struct blk_io_trace *t;
        struct list_head *p, *q;
        int entries_handled = 0;
        struct trace_buf *tbp, *prev;

        prev = hd->prev;
        list_for_each_safe(p, q, list) {
                tbp = list_entry(p, struct trace_buf, head);
                list_del(&tbp->head);
                entries_handled++;

                /*
                 * If there was some leftover before, tack this new
                 * entry onto the tail of the previous one.
                 */
                if (prev)
                        tbp = tb_combine(prev, tbp);

                /*
                 * See how many whole traces there are - send them
                 * all out in one go.
                 */
                off = 0;
                nevents = 0;
                while (off + (int)sizeof(*t) <= tbp->len) {
                        t = (struct blk_io_trace *)(tbp->buf + off);
                        t_len = sizeof(*t) + t->pdu_len;
                        if (off + t_len > tbp->len)
                                break;

                        off += t_len;
                        nevents++;
                }
                if (nevents)
                        pdc_nev_update(tbp->dpp, tbp->cpu, nevents);

                /*
                 * Write out any full set of traces; any remaining data
                 * is kept for the next pass.
                 */
                if (off) {
                        if (write_data(tbp->buf, off) || off == tbp->len) {
                                free(tbp);
                                prev = NULL;
                        }
                        else {
                                /*
                                 * Move valid data to beginning of buffer
                                 */
                                tbp->len -= off;
                                memmove(tbp->buf, tbp->buf + off, tbp->len);
                                prev = tbp;
                        }
                } else
                        prev = tbp;
        }
        hd->prev = prev;

        return entries_handled;
}
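
/*
 * Framing example (illustrative): each event in a buffer occupies
 * sizeof(struct blk_io_trace) + t->pdu_len bytes. If a buffer holds two
 * complete events plus the first half of a third, the scan above leaves
 * 'off' at the start of the partial event; write_data() flushes bytes
 * [0, off) and the tail is memmove()d to the front and carried as 'prev'
 * into the next pass.
 */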

static void __process_trace_bufs(void)
{
        int cpu;
        struct list_head *p;
        struct list_head list;
        int handled = 0;

        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);
                struct tracer_devpath_head *hd = dpp->heads;

                for (cpu = 0; cpu < max_cpus; cpu++, hd++) {
                        pthread_mutex_lock(&hd->mutex);
                        if (list_empty(&hd->head)) {
                                pthread_mutex_unlock(&hd->mutex);
                                continue;
                        }

                        list_replace_init(&hd->head, &list);
                        pthread_mutex_unlock(&hd->mutex);

                        handled += handle_list(hd, &list);
                }
        }

        if (handled)
                decr_entries(handled);
}

static void process_trace_bufs(void)
{
        while (wait_empty_entries())
                __process_trace_bufs();
}

static void clean_trace_bufs(void)
{
        /*
         * No mutex needed here: we're only reading from the lists,
         * tracers are done
         */
        while (dp_entries)
                __process_trace_bufs();
}

static inline void read_err(int cpu, char *ifn)
{
        if (errno != EAGAIN)
                fprintf(stderr, "Thread %d failed read of %s: %d/%s\n",
                        cpu, ifn, errno, strerror(errno));
}

static int net_sendfile(struct io_info *iop)
{
        int ret;

        ret = sendfile(iop->ofd, iop->ifd, NULL, iop->ready);
        if (ret < 0) {
                perror("sendfile");
                return 1;
        } else if (ret < (int)iop->ready) {
                fprintf(stderr, "short sendfile send (%d of %d)\n",
                        ret, iop->ready);
                return 1;
        }

        return 0;
}

static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
{
        struct devpath *dpp = iop->dpp;

        if (net_send_header(iop->ofd, tp->cpu, dpp->buts_name, iop->ready))
                return 1;
        return net_sendfile(iop);
}
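
/*
 * In sendfile mode the client never copies trace data through user space:
 * a blktrace_net_hdr announcing iop->ready bytes goes out via send(), then
 * sendfile() splices exactly that many bytes from the per-cpu relay file
 * (iop->ifd) straight to the socket (iop->ofd).
 */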

static int fill_ofname(char *dst, int dstlen, char *subdir, char *buts_name,
                       int cpu)
{
        int len;
        struct stat sb;

        if (output_dir)
                len = snprintf(dst, dstlen, "%s/", output_dir);
        else
                len = snprintf(dst, dstlen, "./");

        if (subdir)
                len += snprintf(dst + len, dstlen - len, "%s", subdir);

        if (stat(dst, &sb) < 0) {
                if (errno != ENOENT) {
                        fprintf(stderr,
                                "Destination dir %s stat failed: %d/%s\n",
                                dst, errno, strerror(errno));
                        return 1;
                }
                /*
                 * There is no synchronization between multiple threads
                 * trying to create the directory at once.  It's harmless
                 * to let them try, so just detect the problem and move on.
                 */
                if (mkdir(dst, 0755) < 0 && errno != EEXIST) {
                        fprintf(stderr,
                                "Destination dir %s can't be made: %d/%s\n",
                                dst, errno, strerror(errno));
                        return 1;
                }
        }

        if (output_name)
                snprintf(dst + len, dstlen - len, "%s.blktrace.%d",
                         output_name, cpu);
        else
                snprintf(dst + len, dstlen - len, "%s.blktrace.%d",
                         buts_name, cpu);

        return 0;
}
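
/*
 * Resulting names (illustrative), for a trace of /dev/sda with -D /var/tmp:
 *
 *      local:  /var/tmp/sda.blktrace.<cpu>
 *      server: /var/tmp/<host>-<YYYY-MM-DD-HH:MM:SS>/sda.blktrace.<cpu>
 *
 * -o replaces the "sda" stem; without -D the files land under "./". The
 * server-mode subdir comes from the hostdir built in iop_open() below.
 */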

static int set_vbuf(struct io_info *iop, int mode, size_t size)
{
        iop->obuf = malloc(size);
        if (setvbuf(iop->ofp, iop->obuf, mode, size) < 0) {
                fprintf(stderr, "setvbuf(%s, %d) failed: %d/%s\n",
                        iop->dpp->path, (int)size, errno,
                        strerror(errno));
                free(iop->obuf);
                return 1;
        }

        return 0;
}

static int iop_open(struct io_info *iop, int cpu)
{
        char hostdir[MAXPATHLEN + 64];

        iop->ofd = -1;
        if (net_mode == Net_server) {
                struct cl_conn *nc = iop->nc;
                int len;

                len = snprintf(hostdir, sizeof(hostdir), "%s-",
                               nc->ch->hostname);
                len += strftime(hostdir + len, sizeof(hostdir) - len, "%F-%T/",
                                gmtime(&iop->dpp->cl_connect_time));
        } else {
                hostdir[0] = 0;
        }

        if (fill_ofname(iop->ofn, sizeof(iop->ofn), hostdir,
                        iop->dpp->buts_name, cpu))
                return 1;

        iop->ofp = my_fopen(iop->ofn, "w+");
        if (iop->ofp == NULL) {
                fprintf(stderr, "Open output file %s failed: %d/%s\n",
                        iop->ofn, errno, strerror(errno));
                return 1;
        }

        if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
                fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
                        iop->ofn, errno, strerror(errno));
                fclose(iop->ofp);
                return 1;
        }

        iop->ofd = fileno(iop->ofp);
        return 0;
}

static void close_iop(struct io_info *iop)
{
        struct mmap_info *mip = &iop->mmap_info;

        if (mip->fs_buf)
                munmap(mip->fs_buf, mip->fs_buf_len);

        if (!piped_output) {
                if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
                        fprintf(stderr,
                                "Ignoring err: ftruncate(%s): %d/%s\n",
                                iop->ofn, errno, strerror(errno));
                }
        }

        if (iop->ofp)
                fclose(iop->ofp);
        if (iop->obuf)
                free(iop->obuf);
}

static void close_ios(struct tracer *tp)
{
        while (tp->nios > 0) {
                struct io_info *iop = &tp->ios[--tp->nios];

                iop->dpp->drops = get_drops(iop->dpp);
                if (iop->ifd >= 0)
                        close(iop->ifd);

                if (iop->ofp)
                        close_iop(iop);
                else if (iop->ofd >= 0) {
                        struct devpath *dpp = iop->dpp;

                        net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
                        net_close_connection(&iop->ofd);
                }
        }

        free(tp->ios);
        free(tp->pfds);
}

static int open_ios(struct tracer *tp)
{
        struct pollfd *pfd;
        struct io_info *iop;
        struct list_head *p;

        tp->ios = calloc(ndevs, sizeof(struct io_info));
        memset(tp->ios, 0, ndevs * sizeof(struct io_info));

        tp->pfds = calloc(ndevs, sizeof(struct pollfd));
        memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));

        tp->nios = 0;
        iop = tp->ios;
        pfd = tp->pfds;
        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);

                iop->dpp = dpp;
                iop->ofd = -1;
                snprintf(iop->ifn, sizeof(iop->ifn), "%s/block/%s/trace%d",
                        debugfs_path, dpp->buts_name, tp->cpu);

                iop->ifd = my_open(iop->ifn, O_RDONLY | O_NONBLOCK);
                if (iop->ifd < 0) {
                        fprintf(stderr, "Thread %d failed open %s: %d/%s\n",
                                tp->cpu, iop->ifn, errno, strerror(errno));
                        return 1;
                }

                init_mmap_info(&iop->mmap_info);

                pfd->fd = iop->ifd;
                pfd->events = POLLIN;

                if (piped_output)
                        ;
                else if (net_client_use_sendfile()) {
                        iop->ofd = net_setup_client();
                        if (iop->ofd < 0)
                                goto err;
                        net_send_open(iop->ofd, tp->cpu, dpp->buts_name);
                } else if (net_mode == Net_none) {
                        if (iop_open(iop, tp->cpu))
                                goto err;
                } else {
                        /*
                         * This ensures that the server knows about all
                         * connections & devices before _any_ closes
                         */
                        net_send_open(cl_fds[tp->cpu], tp->cpu, dpp->buts_name);
                }

                pfd++;
                iop++;
                tp->nios++;
        }

        return 0;

err:
        close(iop->ifd);        /* tp->nios _not_ bumped */
        close_ios(tp);
        return 1;
}

1712 static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
1713 {
1714         struct mmap_info *mip;
1715         int i, ret, nentries = 0;
1716         struct pollfd *pfd = tp->pfds;
1717         struct io_info *iop = tp->ios;
1718
1719         for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++) {
1720                 if (pfd->revents & POLLIN || force_read) {
1721                         mip = &iop->mmap_info;
1722
1723                         ret = setup_mmap(iop->ofd, buf_size, mip, tp);
1724                         if (ret < 0) {
1725                                 pfd->events = 0;
1726                                 break;
1727                         }
1728
1729                         ret = read(iop->ifd, mip->fs_buf + mip->fs_off,
1730                                    buf_size);
1731                         if (ret > 0) {
1732                                 pdc_dr_update(iop->dpp, tp->cpu, ret);
1733                                 mip->fs_size += ret;
1734                                 mip->fs_off += ret;
1735                                 nentries++;
1736                         } else if (ret == 0) {
1737                                 /*
1738                                  * A zero-length read after we're done
1739                                  * means we've drained the buffer.
1740                                  */
1741                                 if (tp->is_done)
1742                                         clear_events(pfd);
1743                         } else {
1744                                 read_err(tp->cpu, iop->ifn);
1745                                 if (errno != EAGAIN || tp->is_done)
1746                                         clear_events(pfd);
1747                         }
1748                         nevs--;
1749                 }
1750         }
1751
1752         return nentries;
1753 }
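
/*
 * Editorial sketch, not part of blktrace: handle_pfds_file() reads relay
 * data directly into a window that setup_mmap() keeps mapped over the tail
 * of the per-CPU output file (setup_mmap() is defined earlier in this
 * file).  A simplified version of that grow-and-map step, assuming the
 * current size is already page aligned, might look like:
 */
#if 0   /* illustrative only, not compiled */
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

/* Extend the file by 'window' bytes and map the new tail for writing. */
static void *map_output_tail(int fd, off_t cur_size, size_t window)
{
        if (ftruncate(fd, cur_size + window) < 0)
                return MAP_FAILED;
        return mmap(NULL, window, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, cur_size);
}
#endif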
1754
1755 static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
1756 {
1757         struct stat sb;
1758         int i, nentries = 0;
1759         struct pollfd *pfd = tp->pfds;
1760         struct io_info *iop = tp->ios;
1761
1762         for (i = 0; i < ndevs; i++, pfd++, iop++) {
1763                 if (pfd->revents & POLLIN || force_read) {
1764                         if (fstat(iop->ifd, &sb) < 0) {
1765                                 perror(iop->ifn);
1766                                 pfd->events = 0;
1767                         } else if (sb.st_size > (off_t)iop->data_queued) {
1768                                 iop->ready = sb.st_size - iop->data_queued;
1769                                 iop->data_queued = sb.st_size;
1770
1771                                 if (!net_sendfile_data(tp, iop)) {
1772                                         pdc_dr_update(iop->dpp, tp->cpu,
1773                                                       iop->ready);
1774                                         nentries++;
1775                                 } else
1776                                         clear_events(pfd);
1777                         }
1778                         if (--nevs == 0)
1779                                 break;
1780                 }
1781         }
1782
1783         if (nentries)
1784                 incr_entries(nentries);
1785
1786         return nentries;
1787 }
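
/*
 * Editorial sketch, not part of blktrace: in sendfile mode the relay data
 * never passes through user space; net_sendfile_data(), defined elsewhere
 * in this file, pushes it from the trace fd to the socket.  The core of
 * such a zero-copy transfer is roughly the loop below; the names are
 * assumptions for the example.
 */
#if 0   /* illustrative only, not compiled */
#include <sys/sendfile.h>
#include <sys/types.h>

/* Ship 'count' bytes starting at 'off' from in_fd to the socket out_fd. */
static int ship_relay_data(int out_fd, int in_fd, off_t off, size_t count)
{
        while (count) {
                ssize_t n = sendfile(out_fd, in_fd, &off, count);

                if (n <= 0)
                        return -1;
                count -= n;
        }
        return 0;
}
#endif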
1788
1789 static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
1790 {
1791         int i, nentries = 0;
1792         struct trace_buf *tbp;
1793         struct pollfd *pfd = tp->pfds;
1794         struct io_info *iop = tp->ios;
1795
1796         tbp = alloc_trace_buf(tp->cpu, buf_size);
1797         for (i = 0; i < ndevs; i++, pfd++, iop++) {
1798                 if (pfd->revents & POLLIN || force_read) {
1799                         tbp->len = read(iop->ifd, tbp->buf, buf_size);
1800                         if (tbp->len > 0) {
1801                                 pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
1802                                 add_trace_buf(iop->dpp, tp->cpu, &tbp);
1803                                 nentries++;
1804                         } else if (tbp->len == 0) {
1805                                 /*
1806                                  * A zero-length read after we're done
1807                                  * means we've drained the buffer.
1808                                  */
1809                                 if (tp->is_done)
1810                                         clear_events(pfd);
1811                         } else {
1812                                 read_err(tp->cpu, iop->ifn);
1813                                 if (errno != EAGAIN || tp->is_done)
1814                                         clear_events(pfd);
1815                         }
1816                         if (!piped_output && --nevs == 0)
1817                                 break;
1818                 }
1819         }
1820         free(tbp);
1821
1822         if (nentries)
1823                 incr_entries(nentries);
1824
1825         return nentries;
1826 }
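
/*
 * Editorial sketch, not part of blktrace: alloc_trace_buf() and
 * add_trace_buf() are defined elsewhere in this file.  The usual shape for
 * such a carrier is a header plus a flexible array member sized at
 * allocation time; the struct and helper below are an assumed, simplified
 * equivalent, not blktrace's actual definitions.
 */
#if 0   /* illustrative only, not compiled */
#include <stdlib.h>
#include <sys/types.h>

/* One chunk of trace data handed from a reader thread to the writer. */
struct chunk {
        int cpu;                /* CPU the data was read on */
        ssize_t len;            /* bytes of payload in use */
        char buf[];             /* payload follows the header */
};

static struct chunk *alloc_chunk(int cpu, size_t bufsize)
{
        struct chunk *c = malloc(sizeof(*c) + bufsize);

        if (c) {
                c->cpu = cpu;
                c->len = 0;
        }
        return c;
}
#endif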
1827
1828 static void *thread_main(void *arg)
1829 {
1830         int ret, ndone, to_val;
1831         struct tracer *tp = arg;
1832
1833         ret = lock_on_cpu(tp->cpu);
1834         if (ret)
1835                 goto err;
1836
1837         ret = open_ios(tp);
1838         if (ret)
1839                 goto err;
1840
1841         if (piped_output)
1842                 to_val = 50;            /* poll frequently for partial buffers */
1843         else
1844                 to_val = 500;           /* 1/2 second intervals */
1845
1846
1847         tracer_signal_ready(tp, Th_running, 0);
1848         tracer_wait_unblock(tp);
1849
1850         while (!tp->is_done) {
1851                 ndone = poll(tp->pfds, ndevs, to_val);
1852                 if (ndone < 0 && errno != EINTR)
1853                         fprintf(stderr, "Thread %d poll failed: %d/%s\n",
1854                                 tp->cpu, errno, strerror(errno));
1855                 else if (ndone > 0 || piped_output)
1856                         (void)handle_pfds(tp, ndone, piped_output);
1857         }
1858
1859         /*
1860          * The trace is stopped; pull data until we get a short read
1861          */
1862         while (handle_pfds(tp, ndevs, 1) > 0)
1863                 ;
1864
1865         close_ios(tp);
1866         tracer_signal_ready(tp, Th_leaving, 0);
1867         return NULL;
1868
1869 err:
1870         tracer_signal_ready(tp, Th_error, ret);
1871         return NULL;
1872 }
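
/*
 * Editorial sketch, not part of blktrace: thread_main() first calls
 * lock_on_cpu(), defined earlier in this file, so that each tracer thread
 * runs on the CPU whose relay channel it reads.  One common way to express
 * such pinning with the pthreads API, shown purely as an assumed
 * equivalent, is:
 */
#if 0   /* illustrative only, not compiled */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Restrict the calling thread to a single CPU. */
static int pin_self_to_cpu(int cpu)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(cpu, &mask);
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
#endif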
1873
1874 static int start_tracer(int cpu)
1875 {
1876         struct tracer *tp;
1877
1878         tp = malloc(sizeof(*tp));
1879         memset(tp, 0, sizeof(*tp));
1880
1881         INIT_LIST_HEAD(&tp->head);
1882         tp->status = 0;
1883         tp->cpu = cpu;
1884
1885         if (pthread_create(&tp->thread, NULL, thread_main, tp)) {
1886                 fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
1887                         cpu, errno, strerror(errno));
1888                 free(tp);
1889                 return 1;
1890         }
1891
1892         list_add_tail(&tp->head, &tracers);
1893         return 0;
1894 }
1895
1896 static void start_tracers(void)
1897 {
1898         int cpu, started = 0;
1899         struct list_head *p;
1900         size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
1901
1902         for (cpu = 0; cpu < max_cpus; cpu++) {
1903                 if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
1904                         continue;
1905                 if (start_tracer(cpu))
1906                         break;
1907                 started++;
1908         }
1909
1910         wait_tracers_ready(started);
1911
1912         __list_for_each(p, &tracers) {
1913                 struct tracer *tp = list_entry(p, struct tracer, head);
1914                 if (tp->status)
1915                         fprintf(stderr,
1916                                 "FAILED to start thread on CPU %d: %d/%s\n",
1917                                 tp->cpu, tp->status, strerror(tp->status));
1918         }
1919 }
1920
1921 static void stop_tracers(void)
1922 {
1923         struct list_head *p;
1924
1925         /*
1926          * Stop the tracing; the tracer threads then clean up quicker.
1927          */
1928         __list_for_each(p, &devpaths) {
1929                 struct devpath *dpp = list_entry(p, struct devpath, head);
1930                 (void)ioctl(dpp->fd, BLKTRACESTOP);
1931         }
1932
1933         /*
1934          * Tell each tracer to quit
1935          */
1936         __list_for_each(p, &tracers) {
1937                 struct tracer *tp = list_entry(p, struct tracer, head);
1938                 tp->is_done = 1;
1939         }
1940         pthread_cond_broadcast(&mt_cond);
1941 }
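
/*
 * Editorial sketch, not part of blktrace: BLKTRACESTOP above is one of the
 * four blktrace ioctls (SETUP, START, STOP, TEARDOWN) from <linux/fs.h>.
 * A stand-alone "stop whatever trace is running on this device" helper,
 * similar in spirit to the __stop_trace() used later in main(), could read
 * as follows; the helper name is an assumption.
 */
#if 0   /* illustrative only, not compiled */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>           /* BLKTRACESTOP, BLKTRACETEARDOWN */

static int stop_device_trace(const char *devpath)
{
        int fd = open(devpath, O_RDONLY | O_NONBLOCK);

        if (fd < 0)
                return -1;
        ioctl(fd, BLKTRACESTOP);        /* idle the trace */
        ioctl(fd, BLKTRACETEARDOWN);    /* then free the kernel resources */
        close(fd);
        return 0;
}
#endif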
1942
1943 static void del_tracers(void)
1944 {
1945         struct list_head *p, *q;
1946
1947         list_for_each_safe(p, q, &tracers) {
1948                 struct tracer *tp = list_entry(p, struct tracer, head);
1949
1950                 list_del(&tp->head);
1951                 free(tp);
1952         }
1953 }
1954
1955 static void wait_tracers(void)
1956 {
1957         struct list_head *p;
1958
1959         if (use_tracer_devpaths())
1960                 process_trace_bufs();
1961
1962         wait_tracers_leaving();
1963
1964         __list_for_each(p, &tracers) {
1965                 int ret;
1966                 struct tracer *tp = list_entry(p, struct tracer, head);
1967
1968                 ret = pthread_join(tp->thread, NULL);
1969                 if (ret)
1970                         fprintf(stderr, "Thread %d join failed: %d\n",
1971                                 tp->cpu, ret);
1972         }
1973
1974         if (use_tracer_devpaths())
1975                 clean_trace_bufs();
1976
1977         get_all_drops();
1978 }
1979
1980 static void exit_tracing(void)
1981 {
1982         signal(SIGINT, SIG_IGN);
1983         signal(SIGHUP, SIG_IGN);
1984         signal(SIGTERM, SIG_IGN);
1985         signal(SIGALRM, SIG_IGN);
1986
1987         stop_tracers();
1988         wait_tracers();
1989         del_tracers();
1990         rel_devpaths();
1991 }
1992
1993 static void handle_sigint(__attribute__((__unused__)) int sig)
1994 {
1995         done = 1;
1996         stop_tracers();
1997 }
1998
1999 static void show_stats(struct list_head *devpaths)
2000 {
2001         FILE *ofp;
2002         struct list_head *p;
2003         unsigned long long nevents, data_read;
2004         unsigned long long total_drops = 0;
2005         unsigned long long total_events = 0;
2006
2007         if (piped_output)
2008                 ofp = my_fopen("/dev/null", "w");
2009         else
2010                 ofp = stdout;
2011
2012         __list_for_each(p, devpaths) {
2013                 int cpu;
2014                 struct pdc_stats *sp;
2015                 struct devpath *dpp = list_entry(p, struct devpath, head);
2016
2017                 if (net_mode == Net_server)
2018                         printf("server: end of run for %s:%s\n",
2019                                 dpp->ch->hostname, dpp->buts_name);
2020
2021                 data_read = 0;
2022                 nevents = 0;
2023
2024                 fprintf(ofp, "=== %s ===\n", dpp->buts_name);
2025                 for (cpu = 0, sp = dpp->stats; cpu < dpp->ncpus; cpu++, sp++) {
2026                         /*
2027                          * Estimate events if not known...
2028                          */
2029                         if (sp->nevents == 0) {
2030                                 sp->nevents = sp->data_read /
2031                                                 sizeof(struct blk_io_trace);
2032                         }
2033
2034                         fprintf(ofp,
2035                                 "  CPU%3d: %20llu events, %8llu KiB data\n",
2036                                 cpu, sp->nevents, (sp->data_read + 1023) >> 10);
2037
2038                         data_read += sp->data_read;
2039                         nevents += sp->nevents;
2040                 }
2041
2042                 fprintf(ofp, "  Total:  %20llu events (dropped %llu),"
2043                              " %8llu KiB data\n", nevents,
2044                              dpp->drops, (data_read + 1023) >> 10);
2045
2046                 total_drops += dpp->drops;
2047                 total_events += (nevents + dpp->drops);
2048         }
2049
2050         fflush(ofp);
2051         if (piped_output)
2052                 fclose(ofp);
2053
2054         if (total_drops) {
2055                 double drops_ratio = 1.0;
2056
2057                 if (total_events)
2058                         drops_ratio = (double)total_drops/(double)total_events;
2059
2060                 fprintf(stderr, "\nYou have %llu (%5.1lf%%) dropped events\n"
2061                                 "Consider using a larger buffer size (-b) "
2062                                 "and/or more buffers (-n)\n",
2063                         total_drops, 100.0 * drops_ratio);
2064         }
2065 }
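
/*
 * Editorial note: the per-CPU and total lines above round a byte count up
 * to whole KiB with (bytes + 1023) >> 10.  A tiny worked example of that
 * expression; the helper name is an assumption:
 */
#if 0   /* illustrative only, not compiled */
static unsigned long long to_kib(unsigned long long bytes)
{
        return (bytes + 1023) >> 10;    /* == ceil(bytes / 1024) */
}
/* to_kib(0) == 0, to_kib(1) == 1, to_kib(1024) == 1, to_kib(1025) == 2 */
#endif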
2066
2067 static int handle_args(int argc, char *argv[])
2068 {
2069         int c, i;
2070         struct statfs st;
2071         int act_mask_tmp = 0;
2072
2073         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
2074                 switch (c) {
2075                 case 'a':
2076                         i = find_mask_map(optarg);
2077                         if (i < 0) {
2078                                 fprintf(stderr, "Invalid action mask %s\n",
2079                                         optarg);
2080                                 return 1;
2081                         }
2082                         act_mask_tmp |= i;
2083                         break;
2084
2085                 case 'A':
2086                         if ((sscanf(optarg, "%x", &i) != 1) ||
2087                                                         !valid_act_opt(i)) {
2088                                 fprintf(stderr,
2089                                         "Invalid set action mask %s/0x%x\n",
2090                                         optarg, i);
2091                                 return 1;
2092                         }
2093                         act_mask_tmp = i;
2094                         break;
2095
2096                 case 'd':
2097                         if (add_devpath(optarg) != 0)
2098                                 return 1;
2099                         break;
2100
2101                 case 'I': {
2102                         char dev_line[256];
2103                         FILE *ifp = my_fopen(optarg, "r");
2104
2105                         if (!ifp) {
2106                                 fprintf(stderr,
2107                                         "Invalid file for devices %s\n",
2108                                         optarg);
2109                                 return 1;
2110                         }
2111
2112                         while (fscanf(ifp, "%255s\n", dev_line) == 1) {
2113                                 if (add_devpath(dev_line) != 0) {
2114                                         fclose(ifp);
2115                                         return 1;
2116                                 }
2117                         }
2118                         fclose(ifp);
2119                         break;
2120                 }
2121
2122                 case 'r':
2123                         debugfs_path = optarg;
2124                         break;
2125
2126                 case 'o':
2127                         output_name = optarg;
2128                         break;
2129                 case 'k':
2130                         kill_running_trace = 1;
2131                         break;
2132                 case 'w':
2133                         stop_watch = atoi(optarg);
2134                         if (stop_watch <= 0) {
2135                                 fprintf(stderr,
2136                                         "Invalid stopwatch value (%d secs)\n",
2137                                         stop_watch);
2138                                 return 1;
2139                         }
2140                         break;
2141                 case 'V':
2142                 case 'v':
2143                         printf("%s version %s\n", argv[0], blktrace_version);
2144                         exit(0);
2145                         /*NOTREACHED*/
2146                 case 'b':
2147                         buf_size = strtoul(optarg, NULL, 10);
2148                         if (buf_size == 0 || buf_size > 16*1024) {
2149                                 fprintf(stderr, "Invalid buffer size (%lu)\n",
2150                                         buf_size);
2151                                 return 1;
2152                         }
2153                         buf_size <<= 10;
2154                         break;
2155                 case 'n':
2156                         buf_nr = strtoul(optarg, NULL, 10);
2157                         if (buf_nr == 0) {
2158                                 fprintf(stderr,
2159                                         "Invalid buffer nr (%lu)\n", buf_nr);
2160                                 return 1;
2161                         }
2162                         break;
2163                 case 'D':
2164                         output_dir = optarg;
2165                         break;
2166                 case 'h':
2167                         net_mode = Net_client;
2168                         memset(hostname, 0, sizeof(hostname));
2169                         strncpy(hostname, optarg, sizeof(hostname) - 1);
2170                         hostname[sizeof(hostname) - 1] = '\0';
2171                         break;
2172                 case 'l':
2173                         net_mode = Net_server;
2174                         break;
2175                 case 'p':
2176                         net_port = atoi(optarg);
2177                         break;
2178                 case 's':
2179                         net_use_sendfile = 0;
2180                         break;
2181                 default:
2182                         show_usage(argv[0]);
2183                         exit(1);
2184                         /*NOTREACHED*/
2185                 }
2186         }
2187
2188         while (optind < argc)
2189                 if (add_devpath(argv[optind++]) != 0)
2190                         return 1;
2191
2192         if (net_mode != Net_server && ndevs == 0) {
2193                 show_usage(argv[0]);
2194                 return 1;
2195         }
2196
2197         if (statfs(debugfs_path, &st) < 0) {
2198                 fprintf(stderr, "Invalid debug path %s: %d/%s\n",
2199                         debugfs_path, errno, strerror(errno));
2200                 return 1;
2201         }
2202
2203         if (st.f_type != (long)DEBUGFS_TYPE) {
2204                 fprintf(stderr, "Debugfs is not mounted at %s\n", debugfs_path);
2205                 return 1;
2206         }
2207
2208         if (act_mask_tmp != 0)
2209                 act_mask = act_mask_tmp;
2210
2211         if (net_mode == Net_client && net_setup_addr())
2212                 return 1;
2213
2214         /*
2215          * Set up for appropriate PFD handler based upon output name.
2216          */
2217         if (net_client_use_sendfile())
2218                 handle_pfds = handle_pfds_netclient;
2219         else if (net_client_use_send())
2220                 handle_pfds = handle_pfds_entries;
2221         else if (output_name && (strcmp(output_name, "-") == 0)) {
2222                 piped_output = 1;
2223                 handle_pfds = handle_pfds_entries;
2224                 pfp = stdout;
2225                 if (setvbuf(pfp, NULL, _IONBF, 0)) {
2226                         perror("setvbuf stdout");
2227                         return 1;
2228                 }
2229         } else
2230                 handle_pfds = handle_pfds_file;
2231         return 0;
2232 }
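
/*
 * Editorial sketch, not part of blktrace: handle_args() validates the -r
 * path by comparing the statfs() f_type against the DEBUGFS_TYPE magic used
 * above.  Stand-alone, the check looks like the helper below; its name is
 * an assumption.
 */
#if 0   /* illustrative only, not compiled */
#include <sys/vfs.h>

#define DEBUGFS_MAGIC   0x64626720      /* "dbg ", see <linux/magic.h> */

static int is_debugfs(const char *path)
{
        struct statfs st;

        return statfs(path, &st) == 0 && st.f_type == (long)DEBUGFS_MAGIC;
}
#endif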
2233
2234 static void ch_add_connection(struct net_server_s *ns, struct cl_host *ch,
2235                               int fd)
2236 {
2237         struct cl_conn *nc;
2238
2239         nc = malloc(sizeof(*nc));
2240         memset(nc, 0, sizeof(*nc));
2241
2242         time(&nc->connect_time);
2243         nc->ch = ch;
2244         nc->fd = fd;
2245         nc->ncpus = -1;
2246
2247         list_add_tail(&nc->ch_head, &ch->conn_list);
2248         ch->connects++;
2249
2250         list_add_tail(&nc->ns_head, &ns->conn_list);
2251         ns->connects++;
2252         ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
2253 }
2254
2255 static void ch_rem_connection(struct net_server_s *ns, struct cl_host *ch,
2256                               struct cl_conn *nc)
2257 {
2258         net_close_connection(&nc->fd);
2259
2260         list_del(&nc->ch_head);
2261         ch->connects--;
2262
2263         list_del(&nc->ns_head);
2264         ns->connects--;
2265         ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
2266
2267         free(nc);
2268 }
2269
2270 static struct cl_host *net_find_client_host(struct net_server_s *ns,
2271                                             struct in_addr cl_in_addr)
2272 {
2273         struct list_head *p;
2274
2275         __list_for_each(p, &ns->ch_list) {
2276                 struct cl_host *ch = list_entry(p, struct cl_host, head);
2277
2278                 if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
2279                         return ch;
2280         }
2281
2282         return NULL;
2283 }
2284
2285 static struct cl_host *net_add_client_host(struct net_server_s *ns,
2286                                            struct sockaddr_in *addr)
2287 {
2288         struct cl_host *ch;
2289
2290         ch = malloc(sizeof(*ch));
2291         memset(ch, 0, sizeof(*ch));
2292
2293         ch->ns = ns;
2294         ch->cl_in_addr = addr->sin_addr;
2295         list_add_tail(&ch->head, &ns->ch_list);
2296         ns->nchs++;
2297
2298         ch->hostname = strdup(inet_ntoa(addr->sin_addr));
2299         printf("server: connection from %s\n", ch->hostname);
2300
2301         INIT_LIST_HEAD(&ch->conn_list);
2302         INIT_LIST_HEAD(&ch->devpaths);
2303
2304         return ch;
2305 }
2306
2307 static void device_done(struct devpath *dpp, int ncpus)
2308 {
2309         int cpu;
2310         struct io_info *iop;
2311
2312         for (cpu = 0, iop = dpp->ios; cpu < ncpus; cpu++, iop++)
2313                 close_iop(iop);
2314
2315         list_del(&dpp->head);
2316         dpp_free(dpp);
2317 }
2318
2319 static void net_ch_remove(struct cl_host *ch, int ncpus)
2320 {
2321         struct list_head *p, *q;
2322         struct net_server_s *ns = ch->ns;
2323
2324         list_for_each_safe(p, q, &ch->devpaths) {
2325                 struct devpath *dpp = list_entry(p, struct devpath, head);
2326                 device_done(dpp, ncpus);
2327         }
2328
2329         list_for_each_safe(p, q, &ch->conn_list) {
2330                 struct cl_conn *nc = list_entry(p, struct cl_conn, ch_head);
2331
2332                 ch_rem_connection(ns, ch, nc);
2333         }
2334
2335         list_del(&ch->head);
2336         ns->nchs--;
2337
2338         if (ch->hostname)
2339                 free(ch->hostname);
2340         free(ch);
2341 }
2342
2343 static void net_add_connection(struct net_server_s *ns)
2344 {
2345         int fd;
2346         struct cl_host *ch;
2347         socklen_t socklen = sizeof(ns->addr);
2348
2349         fd = my_accept(ns->listen_fd, (struct sockaddr *)&ns->addr, &socklen);
2350         if (fd < 0) {
2351                 /*
2352                  * This is OK: we just won't accept this connection,
2353                  * nothing fatal.
2354                  */
2355                 perror("accept");
2356         } else {
2357                 ch = net_find_client_host(ns, ns->addr.sin_addr);
2358                 if (!ch)
2359                         ch = net_add_client_host(ns, &ns->addr);
2360
2361                 ch_add_connection(ns, ch, fd);
2362         }
2363 }
2364
2365 static struct devpath *nc_add_dpp(struct cl_conn *nc,
2366                                   struct blktrace_net_hdr *bnh,
2367                                   time_t connect_time)
2368 {
2369         int cpu;
2370         struct io_info *iop;
2371         struct devpath *dpp;
2372
2373         dpp = malloc(sizeof(*dpp));
2374         memset(dpp, 0, sizeof(*dpp));
2375
2376         dpp->buts_name = strdup(bnh->buts_name);
2377         dpp->path = strdup(bnh->buts_name);
2378         dpp->fd = -1;
2379         dpp->ch = nc->ch;
2380         dpp->cl_id = bnh->cl_id;
2381         dpp->cl_connect_time = connect_time;
2382         dpp->ncpus = nc->ncpus;
2383         dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
2384         memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
2385
2386         list_add_tail(&dpp->head, &nc->ch->devpaths);
2387         nc->ch->ndevs++;
2388
2389         dpp->ios = calloc(nc->ncpus, sizeof(*iop));
2390         memset(dpp->ios, 0, nc->ncpus * sizeof(*iop));
2391
2392         for (cpu = 0, iop = dpp->ios; cpu < nc->ncpus; cpu++, iop++) {
2393                 iop->dpp = dpp;
2394                 iop->nc = nc;
2395                 init_mmap_info(&iop->mmap_info);
2396
2397                 if (iop_open(iop, cpu))
2398                         goto err;
2399         }
2400
2401         return dpp;
2402
2403 err:
2404         /*
2405          * Need to unravel what's been done...
2406          */
2407         while (cpu >= 0)
2408                 close_iop(&dpp->ios[cpu--]);
2409         dpp_free(dpp);
2410
2411         return NULL;
2412 }
2413
2414 static struct devpath *nc_find_dpp(struct cl_conn *nc,
2415                                    struct blktrace_net_hdr *bnh)
2416 {
2417         struct list_head *p;
2418         time_t connect_time = nc->connect_time;
2419
2420         __list_for_each(p, &nc->ch->devpaths) {
2421                 struct devpath *dpp = list_entry(p, struct devpath, head);
2422
2423                 if (!strcmp(dpp->buts_name, bnh->buts_name))
2424                         return dpp;
2425
2426                 if (dpp->cl_id == bnh->cl_id)
2427                         connect_time = dpp->cl_connect_time;
2428         }
2429
2430         return nc_add_dpp(nc, bnh, connect_time);
2431 }
2432
2433 static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
2434                                  struct blktrace_net_hdr *bnh)
2435 {
2436         int ret;
2437         struct io_info *iop = &dpp->ios[bnh->cpu];
2438         struct mmap_info *mip = &iop->mmap_info;
2439
2440         if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info, NULL)) {
2441                 fprintf(stderr, "ncd(%s:%d): mmap failed\n",
2442                         nc->ch->hostname, nc->fd);
2443                 exit(1);
2444         }
2445
2446         ret = net_recv_data(nc->fd, mip->fs_buf + mip->fs_off, bnh->len);
2447         if (ret > 0) {
2448                 pdc_dr_update(dpp, bnh->cpu, ret);
2449                 mip->fs_size += ret;
2450                 mip->fs_off += ret;
2451         } else if (ret < 0)
2452                 exit(1);
2453 }
2454
2455 /*
2456  * Returns 1 if we closed a host; this invalidates any other polling
2457  * information that may be present.
2458  */
2459 static int net_client_data(struct cl_conn *nc)
2460 {
2461         int ret;
2462         struct devpath *dpp;
2463         struct blktrace_net_hdr bnh;
2464
2465         ret = net_get_header(nc, &bnh);
2466         if (ret == 0)
2467                 return 0;
2468
2469         if (ret < 0) {
2470                 fprintf(stderr, "ncd(%d): header read failed\n", nc->fd);
2471                 exit(1);
2472         }
2473
2474         if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
2475                 fprintf(stderr, "ncd(%d): received data is bad\n", nc->fd);
2476                 exit(1);
2477         }
2478
2479         if (!data_is_native) {
2480                 bnh.magic = be32_to_cpu(bnh.magic);
2481                 bnh.cpu = be32_to_cpu(bnh.cpu);
2482                 bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
2483                 bnh.len = be32_to_cpu(bnh.len);
2484                 bnh.cl_id = be32_to_cpu(bnh.cl_id);
2485                 bnh.buf_size = be32_to_cpu(bnh.buf_size);
2486                 bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
2487                 bnh.page_size = be32_to_cpu(bnh.page_size);
2488         }
2489
2490         if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2491                 fprintf(stderr, "ncd(%s:%d): bad data magic\n",
2492                         nc->ch->hostname, nc->fd);
2493                 exit(1);
2494         }
2495
2496         if (nc->ncpus == -1)
2497                 nc->ncpus = bnh.max_cpus;
2498
2499         /*
2500          * len == 0 means the other end is sending us a new connection/dpp
2501          * len == 1 means that the other end signalled end-of-run
2502          */
2503         dpp = nc_find_dpp(nc, &bnh);
2504         if (bnh.len == 0) {
2505                 /*
2506                  * Just adding in the dpp above is enough
2507                  */
2508                 ack_open_close(nc->fd, dpp->buts_name);
2509                 nc->ch->cl_opens++;
2510         } else if (bnh.len == 1) {
2511                 /*
2512                  * overload cpu count with dropped events
2513                  */
2514                 dpp->drops = bnh.cpu;
2515
2516                 ack_open_close(nc->fd, dpp->buts_name);
2517                 if (--nc->ch->cl_opens == 0) {
2518                         show_stats(&nc->ch->devpaths);
2519                         net_ch_remove(nc->ch, nc->ncpus);
2520                         return 1;
2521                 }
2522         } else
2523                 net_client_read_data(nc, dpp, &bnh);
2524
2525         return 0;
2526 }
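
/*
 * Editorial sketch, not part of blktrace: the endianness handling above
 * hinges on the header magic.  check_data_endianness() and be32_to_cpu()
 * are blktrace's own helpers; an assumed, simplified version of the
 * detection step, using glibc's bswap_32(), would be:
 */
#if 0   /* illustrative only, not compiled */
#include <byteswap.h>
#include <stdint.h>

#define BLK_IO_TRACE_MAGIC 0x65617400   /* from <linux/blktrace_api.h> */

/* 1: native order, 0: sender byte-swapped, -1: not a trace header */
static int header_byte_order(uint32_t magic)
{
        if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
                return 1;
        if ((bswap_32(magic) & 0xffffff00) == BLK_IO_TRACE_MAGIC)
                return 0;
        return -1;
}
#endif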
2527
2528 static void handle_client_data(struct net_server_s *ns, int events)
2529 {
2530         struct cl_conn *nc;
2531         struct pollfd *pfd;
2532         struct list_head *p, *q;
2533
2534         pfd = &ns->pfds[1];
2535         list_for_each_safe(p, q, &ns->conn_list) {
2536                 if (pfd->revents & POLLIN) {
2537                         nc = list_entry(p, struct cl_conn, ns_head);
2538
2539                         if (net_client_data(nc) || --events == 0)
2540                                 break;
2541                 }
2542                 pfd++;
2543         }
2544 }
2545
2546 static void net_setup_pfds(struct net_server_s *ns)
2547 {
2548         struct pollfd *pfd;
2549         struct list_head *p;
2550
2551         ns->pfds[0].fd = ns->listen_fd;
2552         ns->pfds[0].events = POLLIN;
2553
2554         pfd = &ns->pfds[1];
2555         __list_for_each(p, &ns->conn_list) {
2556                 struct cl_conn *nc = list_entry(p, struct cl_conn, ns_head);
2557
2558                 pfd->fd = nc->fd;
2559                 pfd->events = POLLIN;
2560                 pfd++;
2561         }
2562 }
2563
2564 static int net_server_handle_connections(struct net_server_s *ns)
2565 {
2566         int events;
2567
2568         printf("server: waiting for connections...\n");
2569
2570         while (!done) {
2571                 net_setup_pfds(ns);
2572                 events = poll(ns->pfds, ns->connects + 1, -1);
2573                 if (events < 0) {
2574                         if (errno != EINTR) {
2575                                 perror("FATAL: poll error");
2576                                 return 1;
2577                         }
2578                 } else if (events > 0) {
2579                         if (ns->pfds[0].revents & POLLIN) {
2580                                 net_add_connection(ns);
2581                                 events--;
2582                         }
2583
2584                         if (events)
2585                                 handle_client_data(ns, events);
2586                 }
2587         }
2588
2589         return 0;
2590 }
2591
2592 static int net_server(void)
2593 {
2594         int fd, opt;
2595         int ret = 1;
2596         struct net_server_s net_server;
2597         struct net_server_s *ns = &net_server;
2598
2599         memset(ns, 0, sizeof(*ns));
2600         INIT_LIST_HEAD(&ns->ch_list);
2601         INIT_LIST_HEAD(&ns->conn_list);
2602         ns->pfds = malloc(sizeof(struct pollfd));
2603
2604         fd = my_socket(AF_INET, SOCK_STREAM, 0);
2605         if (fd < 0) {
2606                 perror("server: socket");
2607                 goto out;
2608         }
2609
2610         opt = 1;
2611         if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
2612                 perror("setsockopt");
2613                 goto out;
2614         }
2615
2616         memset(&ns->addr, 0, sizeof(ns->addr));
2617         ns->addr.sin_family = AF_INET;
2618         ns->addr.sin_addr.s_addr = htonl(INADDR_ANY);
2619         ns->addr.sin_port = htons(net_port);
2620
2621         if (bind(fd, (struct sockaddr *) &ns->addr, sizeof(ns->addr)) < 0) {
2622                 perror("bind");
2623                 goto out;
2624         }
2625
2626         if (listen(fd, 1) < 0) {
2627                 perror("listen");
2628                 goto out;
2629         }
2630
2631         /*
2632          * The actual server looping is done here:
2633          */
2634         ns->listen_fd = fd;
2635         ret = net_server_handle_connections(ns);
2636
2637         /*
2638          * Clean up and return...
2639          */
2640 out:
2641         free(ns->pfds);
2642         return ret;
2643 }
2644
2645 static int run_tracers(void)
2646 {
2647         atexit(exit_tracing);
2648         if (net_mode == Net_client)
2649                 printf("blktrace: connecting to %s\n", hostname);
2650
2651         setup_buts();
2652
2653         if (use_tracer_devpaths()) {
2654                 if (setup_tracer_devpaths())
2655                         return 1;
2656
2657                 if (piped_output)
2658                         handle_list = handle_list_file;
2659                 else
2660                         handle_list = handle_list_net;
2661         }
2662
2663         start_tracers();
2664         if (nthreads_running == ncpus) {
2665                 unblock_tracers();
2666                 start_buts();
2667                 if (net_mode == Net_client)
2668                         printf("blktrace: connected!\n");
2669                 if (stop_watch)
2670                         alarm(stop_watch);
2671         } else
2672                 stop_tracers();
2673
2674         wait_tracers();
2675         if (nthreads_running == ncpus)
2676                 show_stats(&devpaths);
2677         if (net_client_use_send())
2678                 close_client_connections();
2679         del_tracers();
2680
2681         return 0;
2682 }
2683
2684 static cpu_set_t *get_online_cpus(void)
2685 {
2686         FILE *cpus;
2687         cpu_set_t *set;
2688         size_t alloc_size;
2689         int cpuid, prevcpuid = -1;
2690         char nextch;
2691         int n, ncpu, curcpu = 0;
2692         int *cpu_nums;
2693
2694         ncpu = sysconf(_SC_NPROCESSORS_CONF);
2695         if (ncpu < 0)
2696                 return NULL;
2697
2698         cpu_nums = malloc(sizeof(int)*ncpu);
2699         if (!cpu_nums) {
2700                 errno = ENOMEM;
2701                 return NULL;
2702         }
2703
2704         /*
2705          * There is no easy way to get the maximum CPU number, so we
2706          * have to parse the file first to find it and then create an
2707          * appropriately sized cpuset.
2708          */
2709         cpus = my_fopen("/sys/devices/system/cpu/online", "r");
2710         for (;;) {
2711                 n = fscanf(cpus, "%d%c", &cpuid, &nextch);
2712                 if (n <= 0)
2713                         break;
2714                 if (n == 2 && nextch == '-') {
2715                         prevcpuid = cpuid;
2716                         continue;
2717                 }
2718                 if (prevcpuid == -1)
2719                         prevcpuid = cpuid;
2720                 while (prevcpuid <= cpuid) {
2721                         /* More CPUs listed than configured? */
2722                         if (curcpu >= ncpu) {
2723                                 errno = EINVAL;
2724                                 return NULL;
2725                         }
2726                         cpu_nums[curcpu++] = prevcpuid++;
2727                 }
2728                 prevcpuid = -1;
2729         }
2730         fclose(cpus);
2731
2732         ncpu = curcpu;
2733         max_cpus = cpu_nums[ncpu - 1] + 1;
2734
2735         /* Now that we have maximum cpu number, create a cpuset */
2736         set = CPU_ALLOC(max_cpus);
2737         if (!set) {
2738                 errno = ENOMEM;
2739                 return NULL;
2740         }
2741         alloc_size = CPU_ALLOC_SIZE(max_cpus);
2742         CPU_ZERO_S(alloc_size, set);
2743
2744         for (curcpu = 0; curcpu < ncpu; curcpu++)
2745                 CPU_SET_S(cpu_nums[curcpu], alloc_size, set);
2746
2747         free(cpu_nums);
2748
2749         return set;
2750 }
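
/*
 * Editorial sketch, not part of blktrace: because max_cpus is only known at
 * run time, the set returned above is a dynamically sized cpu_set_t, and
 * every later access goes through the _S macro variants together with
 * CPU_ALLOC_SIZE(), as in start_tracers() and main().  A small, assumed
 * usage example:
 */
#if 0   /* illustrative only, not compiled */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static void dump_cpu_set(const cpu_set_t *set, int max_cpus)
{
        int cpu;
        size_t sz = CPU_ALLOC_SIZE(max_cpus);

        printf("%d CPUs online:", CPU_COUNT_S(sz, set));
        for (cpu = 0; cpu < max_cpus; cpu++)
                if (CPU_ISSET_S(cpu, sz, set))
                        printf(" %d", cpu);
        printf("\n");
}
#endif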
2751
2752 int main(int argc, char *argv[])
2753 {
2754         int ret = 0;
2755
2756         setlocale(LC_NUMERIC, "en_US");
2757         pagesize = getpagesize();
2758         online_cpus = get_online_cpus();
2759         if (!online_cpus) {
2760                 fprintf(stderr, "cannot get online cpus %d/%s\n",
2761                         errno, strerror(errno));
2762                 ret = 1;
2763                 goto out;
2764         } else if (handle_args(argc, argv)) {
2765                 ret = 1;
2766                 goto out;
2767         }
2768
2769         ncpus = CPU_COUNT_S(CPU_ALLOC_SIZE(max_cpus), online_cpus);
2770         if (ndevs > 1 && output_name && strcmp(output_name, "-") != 0) {
2771                 fprintf(stderr, "-o not supported with multiple devices\n");
2772                 ret = 1;
2773                 goto out;
2774         }
2775
2776         signal(SIGINT, handle_sigint);
2777         signal(SIGHUP, handle_sigint);
2778         signal(SIGTERM, handle_sigint);
2779         signal(SIGALRM, handle_sigint);
2780         signal(SIGPIPE, SIG_IGN);
2781
2782         if (kill_running_trace) {
2783                 struct devpath *dpp;
2784                 struct list_head *p;
2785
2786                 __list_for_each(p, &devpaths) {
2787                         dpp = list_entry(p, struct devpath, head);
2788                         if (__stop_trace(dpp->fd)) {
2789                                 fprintf(stderr,
2790                                         "BLKTRACETEARDOWN %s failed: %d/%s\n",
2791                                         dpp->path, errno, strerror(errno));
2792                         }
2793                 }
2794         } else if (net_mode == Net_server) {
2795                 if (output_name) {
2796                         fprintf(stderr, "-o ignored in server mode\n");
2797                         output_name = NULL;
2798                 }
2799                 ret = net_server();
2800         } else
2801                 ret = run_tracers();
2802
2803 out:
2804         if (pfp)
2805                 fclose(pfp);
2806         rel_devpaths();
2807         return ret;
2808 }