Added accept as a system call needing resource increases
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
6  *
7  * Rewrite to have a single thread per CPU (managing all devices on that CPU)
8  *      Alan D. Brunelle <alan.brunelle@hp.com> - January 2009
9  *
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU General Public License as published by
12  *  the Free Software Foundation; either version 2 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU General Public License for more details.
19  *
20  *  You should have received a copy of the GNU General Public License
21  *  along with this program; if not, write to the Free Software
22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  *
24  */
25
26 #include <errno.h>
27 #include <stdarg.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <fcntl.h>
32 #include <getopt.h>
33 #include <sched.h>
34 #include <unistd.h>
35 #include <poll.h>
36 #include <signal.h>
37 #include <pthread.h>
38 #include <locale.h>
39 #include <sys/ioctl.h>
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <sys/vfs.h>
43 #include <sys/mman.h>
44 #include <sys/param.h>
45 #include <sys/time.h>
46 #include <sys/resource.h>
47 #include <sys/socket.h>
48 #include <netinet/in.h>
49 #include <arpa/inet.h>
50 #include <netdb.h>
51 #include <sys/sendfile.h>
52
53 #include "btt/list.h"
54 #include "blktrace.h"
55
/*
 * Default relay sub-buffer geometry. If events are being skipped/missed
 * while logging at a high rate, consider raising these further.
 */
#define BUF_SIZE	(512 * 1024)
#define BUF_NR		(4)

/* NOTE(review): presumably the stdio buffer size for output files - confirm */
#define FILE_VBUF_SIZE	(128 * 1024)

/* NOTE(review): looks like the debugfs filesystem magic ("dbg ") - confirm */
#define DEBUGFS_TYPE	(0x64626720)
/* Default TCP port for client/server mode (see the -p usage text) */
#define TRACE_NET_PORT	(8462)
67
/* Values taken by net_mode: stand-alone, network server, or network client. */
enum {
	Net_none	= 0,
	Net_server	= 1,
	Net_client	= 2,
};
73
/*
 * Generic per-device, per-CPU counters. The event count can only be
 * _roughly_ derived from data_read (pdu payloads are not discounted).
 *
 * Updated through pdc_dr_update() and pdc_nev_update() below.
 */
struct pdc_stats {
	unsigned long long data_read;	/* accumulated by pdc_dr_update() */
	unsigned long long nevents;	/* accumulated by pdc_nev_update() */
};
84
85 struct devpath {
86         struct list_head head;
87         char *path;                     /* path to device special file */
88         char *buts_name;                /* name returned from bt kernel code */
89         struct pdc_stats *stats;
90         int fd, idx, ncpus;
91         unsigned long long drops;
92
93         /*
94          * For piped output only:
95          *
96          * Each tracer will have a tracer_devpath_head that it will add new
97          * data onto. It's list is protected above (tracer_devpath_head.mutex)
98          * and it will signal the processing thread using the dp_cond,
99          * dp_mutex & dp_entries variables above.
100          */
101         struct tracer_devpath_head *heads;
102
103         /*
104          * For network server mode only:
105          */
106         struct cl_host *ch;
107         u32 cl_id;
108         time_t cl_connect_time;
109         struct io_info *ios;
110 };
111
112 /*
113  * For piped output to stdout we will have each tracer thread (one per dev)
114  * tack buffers read from the relay queues on a per-device list.
115  *
116  * The main thread will then collect trace buffers from each of lists in turn.
117  *
118  * We will use a mutex to guard each of the trace_buf list. The tracers
119  * can then signal the main thread using <dp_cond,dp_mutex> and
120  * dp_entries. (When dp_entries is 0, and a tracer adds an entry it will
121  * signal. When dp_entries is 0, the main thread will wait for that condition
122  * to be signalled.)
123  *
124  * adb: It may be better just to have a large buffer per tracer per dev,
125  * and then use it as a ring-buffer. This would certainly cut down a lot
126  * of malloc/free thrashing, at the cost of more memory movements (potentially).
127  */
128 struct trace_buf {
129         struct list_head head;
130         struct devpath *dpp;
131         void *buf;
132         int cpu, len;
133 };
134
135 struct tracer_devpath_head {
136         pthread_mutex_t mutex;
137         struct list_head head;
138         struct trace_buf *prev;
139 };
140
/*
 * State for the mmap() based interface to an output file (the file that
 * receives the traces).
 */
struct mmap_info {
	void *fs_buf;
	unsigned long long fs_size;
	unsigned long long fs_max_size;
	unsigned long long fs_off;
	unsigned long long fs_buf_len;
	unsigned long buf_size;		/* copied from the global buf_size */
	unsigned long buf_nr;		/* copied from the global buf_nr */
	int pagesize;			/* copied from the global pagesize */
};
150
151 /*
152  * Each thread doing work on a (client) side of blktrace will have one
153  * of these. The ios array contains input/output information, pfds holds
154  * poll() data. The volatile's provide flags to/from the main executing
155  * thread.
156  */
157 struct tracer {
158         struct list_head head;
159         struct io_info *ios;
160         struct pollfd *pfds;
161         pthread_t thread;
162         pthread_mutex_t mutex;
163         pthread_cond_t cond;
164         int cpu, nios;
165         volatile int running, status, is_done;
166 };
167
168 /*
169  * networking stuff follows. we include a magic number so we know whether
170  * to endianness convert or not.
171  *
172  * The len field is overloaded:
173  *      0 - Indicates an "open" - allowing the server to set up for a dev/cpu
174  *      1 - Indicates a "close" - Shut down connection orderly
175  *
176  * The cpu field is overloaded on close: it will contain the number of drops.
177  */
178 struct blktrace_net_hdr {
179         u32 magic;              /* same as trace magic */
180         char buts_name[32];     /* trace name */
181         u32 cpu;                /* for which cpu */
182         u32 max_cpus;
183         u32 len;                /* length of following trace data */
184         u32 cl_id;              /* id for set of client per-cpu connections */
185         u32 buf_size;           /* client buf_size for this trace  */
186         u32 buf_nr;             /* client buf_nr for this trace  */
187         u32 page_size;          /* client page_size for this trace  */
188 };
189
190 /*
191  * Each host encountered has one of these. The head is used to link this
192  * on to the network server's ch_list. Connections associated with this
193  * host are linked on conn_list, and any devices traced on that host
194  * are connected on the devpaths list.
195  */
196 struct cl_host {
197         struct list_head head;
198         struct list_head conn_list;
199         struct list_head devpaths;
200         struct net_server_s *ns;
201         char *hostname;
202         struct in_addr cl_in_addr;
203         int connects, ndevs, cl_opens;
204 };
205
206 /*
207  * Each connection (client to server socket ('fd')) has one of these. A
208  * back reference to the host ('ch'), and lists headers (for the host
209  * list, and the network server conn_list) are also included.
210  */
211 struct cl_conn {
212         struct list_head ch_head, ns_head;
213         struct cl_host *ch;
214         int fd, ncpus;
215         time_t connect_time;
216 };
217
218 /*
219  * The network server requires some poll structures to be maintained -
220  * one per connection currently on conn_list. The nchs/ch_list values
221  * are for each host connected to this server. The addr field is used
222  * for scratch as new connections are established.
223  */
224 struct net_server_s {
225         struct list_head conn_list;
226         struct list_head ch_list;
227         struct pollfd *pfds;
228         int listen_fd, connects, nchs;
229         struct sockaddr_in addr;
230 };
231
232 /*
233  * This structure is (generically) used to provide information
234  * for a read-to-write set of values.
235  *
236  * ifn & ifd represent input information
237  *
238  * ofn, ofd, ofp, obuf & mmap_info are used for output file (optionally).
239  */
240 struct io_info {
241         struct devpath *dpp;
242         FILE *ofp;
243         char *obuf;
244         struct cl_conn *nc;     /* Server network connection */
245
246         /*
247          * mmap controlled output files
248          */
249         struct mmap_info mmap_info;
250
251         /*
252          * Client network fields
253          */
254         unsigned int ready;
255         unsigned long long data_queued;
256
257         /*
258          * Input/output file descriptors & names
259          */
260         int ifd, ofd;
261         char ifn[MAXPATHLEN + 64];
262         char ofn[MAXPATHLEN + 64];
263 };
264
265 static char blktrace_version[] = "2.0.0";
266
267 /*
268  * Linkage to blktrace helper routines (trace conversions)
269  */
270 int data_is_native = -1;
271
272 static int ncpus;
273 static int pagesize;
274 static int act_mask = ~0U;
275 static char *debugfs_path = "/sys/kernel/debug";
276 static char *output_name;
277 static char *output_dir;
278 static int kill_running_trace;
279 static int stop_watch;
280 static unsigned long buf_size = BUF_SIZE;
281 static unsigned long buf_nr = BUF_NR;
282 static LIST_HEAD(devpaths);
283 static LIST_HEAD(tracers);
284 static int ndevs;
285 static volatile int done;
286 static FILE *pfp;
287 static int piped_output;
288 static int ntracers;
289
290 static pthread_cond_t dp_cond = PTHREAD_COND_INITIALIZER;
291 static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
292 static volatile int dp_entries;
293
294 /*
295  * network cmd line params
296  */
297 static char hostname[MAXHOSTNAMELEN];
298 static int net_port = TRACE_NET_PORT;
299 static int net_use_sendfile = 1;
300 static int net_mode;
301 static int *cl_fds;
302
303 static int (*handle_pfds)(struct tracer *, int, int);
304 static int (*handle_list)(struct tracer_devpath_head *, struct list_head *);
305
/* Short option string; keep in sync with the long option table below. */
#define S_OPTS  "d:a:A:r:o:kw:vVb:n:D:lh:p:sI:"

/*
 * Long options. Each entry is { name, has_arg, flag, val }; 'val' is the
 * matching short option character. The table ends with an all-zero entry.
 */
static struct option l_opts[] = {
	{ "dev",		required_argument,	NULL, 'd' },
	{ "input-devs",		required_argument,	NULL, 'I' },
	{ "act-mask",		required_argument,	NULL, 'a' },
	{ "set-mask",		required_argument,	NULL, 'A' },
	{ "relay",		required_argument,	NULL, 'r' },
	{ "output",		required_argument,	NULL, 'o' },
	{ "kill",		no_argument,		NULL, 'k' },
	{ "stopwatch",		required_argument,	NULL, 'w' },
	{ "version",		no_argument,		NULL, 'v' },
	{ "version",		no_argument,		NULL, 'V' },
	{ "buffer-size",	required_argument,	NULL, 'b' },
	{ "num-sub-buffers",	required_argument,	NULL, 'n' },
	{ "output-dir",		required_argument,	NULL, 'D' },
	{ "listen",		no_argument,		NULL, 'l' },
	{ "host",		required_argument,	NULL, 'h' },
	{ "port",		required_argument,	NULL, 'p' },
	{ "no-sendfile",	no_argument,		NULL, 's' },
	{ NULL,			0,			NULL, 0 }
};
414
/* Option summary printed by show_usage(), after program name and version. */
static char usage_str[] =
	"-d <dev> [ -r debugfs path ] [ -o <output> ] [-k ] [ -w time ]\n"
	"[ -a action ] [ -A action mask ] [ -I  <devs file> ] [ -v ]\n\n"
	"\t-d Use specified device. May also be given last after options\n"
	"\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n"
	"\t-o File(s) to send output to\n"
	"\t-D Directory to prepend to output file names\n"
	"\t-k Kill a running trace\n"
	"\t-w Stop after defined time, in seconds\n"
	"\t-a Only trace specified actions. See documentation\n"
	"\t-A Give trace mask as a single value. See documentation\n"
	"\t-b Sub buffer size in KiB\n"
	"\t-n Number of sub buffers\n"
	"\t-l Run in network listen mode (blktrace server)\n"
	"\t-h Run in network client mode, connecting to the given host\n"
	"\t-p Network port to use (default 8462)\n"
	"\t-s Make the network client NOT use sendfile() to transfer data\n"
	"\t-I Add devices found in <devs file>\n"
	"\t-V Print program version info\n\n";
434
/* Reset a poll entry so it neither requests nor reports any events. */
static void clear_events(struct pollfd *pfd)
{
	pfd->revents = 0;
	pfd->events = 0;
}
440
441 static inline int net_client_use_sendfile(void)
442 {
443         return net_mode == Net_client && net_use_sendfile;
444 }
445
446 static inline int net_client_use_send(void)
447 {
448         return net_mode == Net_client && !net_use_sendfile;
449 }
450
451 static inline int use_tracer_devpaths(void)
452 {
453         return piped_output || net_client_use_send();
454 }
455
/* Returns 1 when both addresses are the same, 0 otherwise. */
static inline int in_addr_eq(struct in_addr a, struct in_addr b)
{
	if (a.s_addr != b.s_addr)
		return 0;

	return 1;
}
460
461 static inline void pdc_dr_update(struct devpath *dpp, int cpu, int data_read)
462 {
463         dpp->stats[cpu].data_read += data_read;
464 }
465
466 static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
467 {
468         dpp->stats[cpu].nevents += nevents;
469 }
470
471 static void show_usage(char *prog)
472 {
473         fprintf(stderr, "Usage: %s %s %s", prog, blktrace_version, usage_str);
474 }
475
476 static void init_mmap_info(struct mmap_info *mip)
477 {
478         mip->buf_size = buf_size;
479         mip->buf_nr = buf_nr;
480         mip->pagesize = pagesize;
481 }
482
/*
 * Shut down both directions of the socket, close it, and overwrite the
 * caller's descriptor with -1 so it reads as closed.
 */
static void net_close_connection(int *fd)
{
	int sock = *fd;

	shutdown(sock, SHUT_RDWR);
	close(sock);

	*fd = -1;
}
489
490 static void dpp_free(struct devpath *dpp)
491 {
492         if (dpp->stats)
493                 free(dpp->stats);
494         if (dpp->ios)
495                 free(dpp->ios);
496         if (dpp->path)
497                 free(dpp->path);
498         if (dpp->buts_name)
499                 free(dpp->buts_name);
500         free(dpp);
501 }
502
/*
 * Pin the calling thread to the given CPU.
 *
 * Returns 0 on success, or the errno value from sched_setaffinity().
 */
static int lock_on_cpu(int cpu)
{
	cpu_set_t cpu_mask;

	CPU_ZERO(&cpu_mask);
	CPU_SET(cpu, &cpu_mask);

	/*
	 * Use pid 0 ("the calling thread"). Passing getpid() would address
	 * the main thread of the process, so a tracer thread would move the
	 * main thread around instead of pinning itself.
	 */
	if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) < 0)
		return errno;

	return 0;
}
514
/*
 * Fill in *tsp with the current time of day plus 'delta_msec'
 * milliseconds.
 *
 * The result is always normalized: 0 <= tv_nsec < 1000000000.
 */
static inline void make_timespec(struct timespec *tsp, long delta_msec)
{
	struct timeval now;

	gettimeofday(&now, NULL);

	/*
	 * Split the delta into whole seconds and leftover milliseconds
	 * first so the nanosecond arithmetic cannot overflow a 32-bit long.
	 */
	tsp->tv_sec = now.tv_sec + delta_msec / 1000L;
	tsp->tv_nsec = 1000L * now.tv_usec + (delta_msec % 1000L) * 1000000L;

	/* Exactly one second of nanoseconds must carry as well (>=, not >) */
	if (tsp->tv_nsec >= 1000000000L) {
		tsp->tv_sec++;
		tsp->tv_nsec -= 1000000000L;
	} else if (tsp->tv_nsec < 0) {
		/* only reachable with a negative delta */
		tsp->tv_sec--;
		tsp->tv_nsec += 1000000000L;
	}
}
534
/*
 * Try to raise the soft limit of 'resource' by 'increase'. When the new
 * soft limit reaches the hard limit, the hard limit is raised as well.
 *
 * Returns 1 if the limit was raised (the caller may retry the operation
 * that failed), 0 otherwise. On failure errno is restored to the value
 * it had on entry, so the caller still sees the original error.
 */
static int increase_limit(int resource, rlim_t increase)
{
	struct rlimit rlim;
	int save_errno = errno;

	/*
	 * A soft limit that is already unlimited cannot be raised. Adding
	 * to it would wrap around and *lower* the limit while reporting
	 * success, making the caller retry forever.
	 */
	if (!getrlimit(resource, &rlim) && rlim.rlim_cur != RLIM_INFINITY) {
		/* saturate rather than wrap */
		if (increase > RLIM_INFINITY - rlim.rlim_cur)
			rlim.rlim_cur = RLIM_INFINITY;
		else
			rlim.rlim_cur += increase;

		if (rlim.rlim_cur >= rlim.rlim_max) {
			if (increase > RLIM_INFINITY - rlim.rlim_cur)
				rlim.rlim_max = RLIM_INFINITY;
			else
				rlim.rlim_max = rlim.rlim_cur + increase;
		}

		if (!setrlimit(resource, &rlim))
			return 1;
	}

	errno = save_errno;
	return 0;
}
552
/*
 * Called after a descriptor-creating call failed. If errno says a file
 * table limit was hit, try to add room for 16 more descriptors.
 *
 * Returns non-zero when the limit was raised and a retry makes sense.
 */
static int handle_open_failure(void)
{
	switch (errno) {
	case ENFILE:
	case EMFILE:
		return increase_limit(RLIMIT_NOFILE, 16);
	default:
		return 0;
	}
}
559
/*
 * Called after a memory mapping/locking call on 'length' bytes failed.
 * ENFILE is delegated to handle_open_failure(); ENOMEM triggers an
 * attempt to raise RLIMIT_MEMLOCK by twice the requested length.
 *
 * Returns non-zero when a limit was raised and a retry makes sense.
 */
static int handle_mem_failure(size_t length)
{
	switch (errno) {
	case ENFILE:
		return handle_open_failure();
	case ENOMEM:
		return increase_limit(RLIMIT_MEMLOCK, 2 * length);
	default:
		return 0;
	}
}
568
/*
 * fopen() wrapper: keeps retrying for as long as handle_open_failure()
 * reports that it managed to raise the descriptor limit.
 */
static FILE *my_fopen(const char *path, const char *mode)
{
	for (;;) {
		FILE *fp = fopen(path, mode);

		if (fp != NULL || !handle_open_failure())
			return fp;
	}
}
579
/*
 * open() wrapper: keeps retrying for as long as handle_open_failure()
 * reports that it managed to raise the descriptor limit.
 */
static int my_open(const char *path, int flags)
{
	for (;;) {
		int fd = open(path, flags);

		if (fd >= 0 || !handle_open_failure())
			return fd;
	}
}
590
/*
 * socket() wrapper: keeps retrying for as long as handle_open_failure()
 * reports that it managed to raise the descriptor limit.
 */
static int my_socket(int domain, int type, int protocol)
{
	for (;;) {
		int fd = socket(domain, type, protocol);

		if (fd >= 0 || !handle_open_failure())
			return fd;
	}
}
601
/*
 * accept() wrapper: keeps retrying for as long as handle_open_failure()
 * reports that it managed to raise the descriptor limit.
 */
static int my_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
{
	for (;;) {
		int fd = accept(sockfd, addr, addrlen);

		if (fd >= 0 || !handle_open_failure())
			return fd;
	}
}
612
/*
 * mmap() wrapper: keeps retrying for as long as handle_mem_failure()
 * reports that it managed to raise a resource limit. Returns whatever
 * the last mmap() call returned (MAP_FAILED on failure).
 */
static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
		     off_t offset)
{
	for (;;) {
		void *map = mmap(addr, length, prot, flags, fd, offset);

		if (map != MAP_FAILED || !handle_mem_failure(length))
			return map;
	}
}
624
/*
 * mlock() wrapper: keeps retrying for as long as handle_mem_failure()
 * reports that it managed to raise a resource limit.
 */
static int my_mlock(const void *addr, size_t len)
{
	for (;;) {
		int rc = mlock(addr, len);

		if (rc >= 0 || !handle_mem_failure(len))
			return rc;
	}
}
635
636 static int __stop_trace(int fd)
637 {
638         /*
639          * Should be stopped, don't complain if it isn't
640          */
641         ioctl(fd, BLKTRACESTOP);
642         return ioctl(fd, BLKTRACETEARDOWN);
643 }
644
645 static int write_data(char *buf, int len)
646 {
647         int ret;
648
649 rewrite:
650         ret = fwrite(buf, len, 1, pfp);
651         if (ferror(pfp) || ret != 1) {
652                 if (errno == EINTR) {
653                         clearerr(pfp);
654                         goto rewrite;
655                 }
656
657                 if (!piped_output || (errno != EPIPE && errno != EBADF)) {
658                         fprintf(stderr, "write(%d) failed: %d/%s\n",
659                                 len, errno, strerror(errno));
660                 }
661                 goto err;
662         }
663
664         fflush(pfp);
665         return 0;
666
667 err:
668         clearerr(pfp);
669         return 1;
670 }
671
672 /*
673  * Returns the number of bytes read (successfully)
674  */
675 static int __net_recv_data(int fd, void *buf, unsigned int len)
676 {
677         unsigned int bytes_left = len;
678
679         while (bytes_left && !done) {
680                 int ret = recv(fd, buf, bytes_left, MSG_WAITALL);
681
682                 if (ret == 0)
683                         break;
684                 else if (ret < 0) {
685                         if (errno != EAGAIN) {
686                                 perror("server: net_recv_data: recv failed");
687                                 break;
688                         } else
689                                 break;
690                 } else {
691                         buf += ret;
692                         bytes_left -= ret;
693                 }
694         }
695
696         return len - bytes_left;
697 }
698
/* Thin wrapper around __net_recv_data(); returns the number of bytes read. */
static int net_recv_data(int fd, void *buf, unsigned int len)
{
	int nread = __net_recv_data(fd, buf, len);

	return nread;
}
703
/*
 * Send 'buf_len' bytes from 'buf' over 'fd', repeating send() until
 * everything has gone out or a send() fails (reported via perror()).
 *
 * Returns number of bytes written
 */
static int net_send_data(int fd, void *buf, unsigned int buf_len)
{
	char *cursor = buf;
	unsigned int remaining;

	for (remaining = buf_len; remaining != 0; ) {
		int sent = send(fd, cursor, remaining, 0);

		if (sent < 0) {
			perror("send");
			break;
		}

		remaining -= sent;
		cursor += sent;
	}

	return buf_len - remaining;
}
725
726 static int net_send_header(int fd, int cpu, char *buts_name, int len)
727 {
728         struct blktrace_net_hdr hdr;
729
730         memset(&hdr, 0, sizeof(hdr));
731
732         hdr.magic = BLK_IO_TRACE_MAGIC;
733         strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
734         hdr.buts_name[sizeof(hdr.buts_name)-1] = '\0';
735         hdr.cpu = cpu;
736         hdr.max_cpus = ncpus;
737         hdr.len = len;
738         hdr.cl_id = getpid();
739         hdr.buf_size = buf_size;
740         hdr.buf_nr = buf_nr;
741         hdr.page_size = pagesize;
742
743         return net_send_data(fd, &hdr, sizeof(hdr)) != sizeof(hdr);
744 }
745
746 static void net_send_open_close(int fd, int cpu, char *buts_name, int len)
747 {
748         struct blktrace_net_hdr ret_hdr;
749
750         net_send_header(fd, cpu, buts_name, len);
751         net_recv_data(fd, &ret_hdr, sizeof(ret_hdr));
752 }
753
/* Announce an "open" for this cpu/trace: a header with len == 0. */
static void net_send_open(int fd, int cpu, char *buts_name)
{
	const int open_len = 0;

	net_send_open_close(fd, cpu, buts_name, open_len);
}
758
759 static void net_send_close(int fd, char *buts_name, int drops)
760 {
761         /*
762          * Overload CPU w/ number of drops
763          *
764          * XXX: Need to clear/set done around call - done=1 (which
765          * is true here) stops reads from happening... :-(
766          */
767         done = 0;
768         net_send_open_close(fd, drops, buts_name, 1);
769         done = 1;
770 }
771
/* Acknowledge an open/close: send a header with cpu 0 and len == 2. */
static void ack_open_close(int fd, char *buts_name)
{
	const int ack_len = 2;

	net_send_header(fd, 0, buts_name, ack_len);
}
776
777 static void net_send_drops(int fd)
778 {
779         struct list_head *p;
780
781         __list_for_each(p, &devpaths) {
782                 struct devpath *dpp = list_entry(p, struct devpath, head);
783
784                 net_send_close(fd, dpp->buts_name, dpp->drops);
785         }
786 }
787
788 /*
789  * Returns:
790  *       0: "EOF"
791  *       1: OK
792  *      -1: Error
793  */
794 static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
795 {
796         int bytes_read;
797         int fl = fcntl(nc->fd, F_GETFL);
798
799         fcntl(nc->fd, F_SETFL, fl | O_NONBLOCK);
800         bytes_read = __net_recv_data(nc->fd, bnh, sizeof(*bnh));
801         fcntl(nc->fd, F_SETFL, fl & ~O_NONBLOCK);
802
803         if (bytes_read == sizeof(*bnh))
804                 return 1;
805         else if (bytes_read == 0)
806                 return 0;
807         return -1;
808 }
809
810 static int net_setup_client(void)
811 {
812         int fd;
813         struct sockaddr_in addr;
814
815         memset(&addr, 0, sizeof(addr));
816         addr.sin_family = AF_INET;
817         addr.sin_port = htons(net_port);
818
819         if (inet_aton(hostname, &addr.sin_addr) != 1) {
820                 struct hostent *hent = gethostbyname(hostname);
821                 if (!hent) {
822                         perror("gethostbyname");
823                         return 1;
824                 }
825
826                 memcpy(&addr.sin_addr, hent->h_addr, 4);
827                 strcpy(hostname, hent->h_name);
828         }
829
830         fd = my_socket(AF_INET, SOCK_STREAM, 0);
831         if (fd < 0) {
832                 perror("client: socket");
833                 return -1;
834         }
835
836         if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
837                 if (errno == ECONNREFUSED)
838                         fprintf(stderr,
839                                 "\nclient: Connection to %s refused, "
840                                 "perhaps the server is not started?\n\n",
841                                 hostname);
842                 else
843                         perror("client: connect");
844                 close(fd);
845                 return -1;
846         }
847
848         return fd;
849 }
850
851 static int open_client_connections(void)
852 {
853         int cpu;
854
855         cl_fds = calloc(ncpus, sizeof(*cl_fds));
856         for (cpu = 0; cpu < ncpus; cpu++) {
857                 cl_fds[cpu] = net_setup_client();
858                 if (cl_fds[cpu] < 0)
859                         goto err;
860         }
861         return 0;
862
863 err:
864         while (cpu > 0)
865                 close(cl_fds[cpu--]);
866         free(cl_fds);
867         return 1;
868 }
869
870 static void close_client_connections(void)
871 {
872         if (cl_fds) {
873                 int cpu, *fdp;
874
875                 for (cpu = 0, fdp = cl_fds; cpu < ncpus; cpu++, fdp++) {
876                         if (*fdp >= 0) {
877                                 net_send_drops(*fdp);
878                                 net_close_connection(fdp);
879                         }
880                 }
881                 free(cl_fds);
882         }
883 }
884
885 static void setup_buts(void)
886 {
887         struct list_head *p;
888
889         __list_for_each(p, &devpaths) {
890                 struct blk_user_trace_setup buts;
891                 struct devpath *dpp = list_entry(p, struct devpath, head);
892
893                 memset(&buts, 0, sizeof(buts));
894                 buts.buf_size = buf_size;
895                 buts.buf_nr = buf_nr;
896                 buts.act_mask = act_mask;
897
898                 if (ioctl(dpp->fd, BLKTRACESETUP, &buts) < 0) {
899                         fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
900                                 dpp->path, errno, strerror(errno));
901                         continue;
902                 } else if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
903                         fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
904                                 dpp->path, errno, strerror(errno));
905                         continue;
906                 }
907
908                 dpp->ncpus = ncpus;
909                 dpp->buts_name = strdup(buts.name);
910                 if (dpp->stats)
911                         free(dpp->stats);
912                 dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
913                 memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
914         }
915 }
916
917 static int get_drops(struct devpath *dpp)
918 {
919         int fd, drops = 0;
920         char fn[MAXPATHLEN + 64], tmp[256];
921
922         snprintf(fn, sizeof(fn), "%s/block/%s/dropped", debugfs_path,
923                  dpp->buts_name);
924
925         fd = my_open(fn, O_RDONLY);
926         if (fd < 0) {
927                 /*
928                  * This may be ok: the kernel may not support
929                  * dropped counts.
930                  */
931                 if (errno != ENOENT)
932                         fprintf(stderr, "Could not open %s: %d/%s\n",
933                                 fn, errno, strerror(errno));
934                 return 0;
935         } else if (read(fd, tmp, sizeof(tmp)) < 0) {
936                 fprintf(stderr, "Could not read %s: %d/%s\n",
937                         fn, errno, strerror(errno));
938         } else
939                 drops = atoi(tmp);
940         close(fd);
941
942         return drops;
943 }
944
945 static void get_all_drops(void)
946 {
947         struct list_head *p;
948
949         __list_for_each(p, &devpaths) {
950                 struct devpath *dpp = list_entry(p, struct devpath, head);
951                 dpp->drops = get_drops(dpp);
952         }
953 }
954
955 static inline struct trace_buf *alloc_trace_buf(int cpu, int bufsize)
956 {
957         struct trace_buf *tbp;
958
959         tbp = malloc(sizeof(*tbp) + bufsize);
960         INIT_LIST_HEAD(&tbp->head);
961         tbp->len = 0;
962         tbp->buf = (void *)(tbp + 1);
963         tbp->cpu = cpu;
964         tbp->dpp = NULL;        /* Will be set when tbp is added */
965
966         return tbp;
967 }
968
969 static void free_tracer_heads(struct devpath *dpp)
970 {
971         int cpu;
972         struct tracer_devpath_head *hd;
973
974         for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
975                 if (hd->prev)
976                         free(hd->prev);
977                 pthread_mutex_destroy(&hd->mutex);
978         }
979         free(dpp->heads);
980 }
981
982 static int setup_tracer_devpaths(void)
983 {
984         struct list_head *p;
985
986         if (net_client_use_send())
987                 if (open_client_connections())
988                         return 1;
989
990         __list_for_each(p, &devpaths) {
991                 int cpu;
992                 struct tracer_devpath_head *hd;
993                 struct devpath *dpp = list_entry(p, struct devpath, head);
994
995                 dpp->heads = calloc(ncpus, sizeof(struct tracer_devpath_head));
996                 for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
997                         INIT_LIST_HEAD(&hd->head);
998                         pthread_mutex_init(&hd->mutex, NULL);
999                         hd->prev = NULL;
1000                 }
1001         }
1002
1003         return 0;
1004 }
1005
1006 static inline void add_trace_buf(struct devpath *dpp, int cpu,
1007                                                 struct trace_buf **tbpp)
1008 {
1009         struct trace_buf *tbp = *tbpp;
1010         struct tracer_devpath_head *hd = &dpp->heads[cpu];
1011
1012         tbp->dpp = dpp;
1013
1014         pthread_mutex_lock(&hd->mutex);
1015         list_add_tail(&tbp->head, &hd->head);
1016         pthread_mutex_unlock(&hd->mutex);
1017
1018         *tbpp = alloc_trace_buf(cpu, buf_size);
1019 }
1020
1021 static inline void incr_entries(int entries_handled)
1022 {
1023         pthread_mutex_lock(&dp_mutex);
1024         if (dp_entries == 0)
1025                 pthread_cond_signal(&dp_cond);
1026         dp_entries += entries_handled;
1027         pthread_mutex_unlock(&dp_mutex);
1028 }
1029
1030 static int add_devpath(char *path)
1031 {
1032         int fd;
1033         struct devpath *dpp;
1034
1035         /*
1036          * Verify device is valid before going too far
1037          */
1038         fd = my_open(path, O_RDONLY | O_NONBLOCK);
1039         if (fd < 0) {
1040                 fprintf(stderr, "Invalid path %s specified: %d/%s\n",
1041                         path, errno, strerror(errno));
1042                 return 1;
1043         }
1044
1045         dpp = malloc(sizeof(*dpp));
1046         memset(dpp, 0, sizeof(*dpp));
1047         dpp->path = strdup(path);
1048         dpp->fd = fd;
1049         dpp->idx = ndevs++;
1050         list_add_tail(&dpp->head, &devpaths);
1051
1052         return 0;
1053 }
1054
1055 static void rel_devpaths(void)
1056 {
1057         struct list_head *p, *q;
1058
1059         list_for_each_safe(p, q, &devpaths) {
1060                 struct devpath *dpp = list_entry(p, struct devpath, head);
1061
1062                 list_del(&dpp->head);
1063                 __stop_trace(dpp->fd);
1064                 close(dpp->fd);
1065
1066                 if (dpp->heads)
1067                         free_tracer_heads(dpp);
1068
1069                 dpp_free(dpp);
1070                 ndevs--;
1071         }
1072 }
1073
1074 static int flush_subbuf_net(struct trace_buf *tbp)
1075 {
1076         int fd = cl_fds[tbp->cpu];
1077         struct devpath *dpp = tbp->dpp;
1078
1079         if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
1080                 return 1;
1081
1082         if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
1083                 return 1;
1084
1085         return 0;
1086 }
1087
1088 static int
1089 handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
1090                 struct list_head *list)
1091 {
1092         struct trace_buf *tbp;
1093         struct list_head *p, *q;
1094         int entries_handled = 0;
1095
1096         list_for_each_safe(p, q, list) {
1097                 tbp = list_entry(p, struct trace_buf, head);
1098
1099                 list_del(&tbp->head);
1100                 entries_handled++;
1101
1102                 if (cl_fds[tbp->cpu] >= 0) {
1103                         if (flush_subbuf_net(tbp)) {
1104                                 close(cl_fds[tbp->cpu]);
1105                                 cl_fds[tbp->cpu] = -1;
1106                         }
1107                 }
1108
1109                 free(tbp);
1110         }
1111
1112         return entries_handled;
1113 }
1114
1115 static int handle_list_file(struct tracer_devpath_head *hd,
1116                             struct list_head *list)
1117 {
1118         int off, t_len, nevents;
1119         struct blk_io_trace *t;
1120         struct list_head *p, *q;
1121         int entries_handled = 0;
1122         struct trace_buf *tbp, *prev;
1123
1124         prev = hd->prev;
1125         list_for_each_safe(p, q, list) {
1126                 tbp = list_entry(p, struct trace_buf, head);
1127                 list_del(&tbp->head);
1128                 entries_handled++;
1129
1130                 /*
1131                  * If there was some leftover before, tack this new
1132                  * entry onto the tail of the previous one.
1133                  */
1134                 if (prev) {
1135                         unsigned long tot_len;
1136                         struct trace_buf *tmp = tbp;
1137
1138                         tbp = prev;
1139                         prev = NULL;
1140
1141                         tot_len = tbp->len + tmp->len;
1142                         if (tot_len > buf_size) {
1143                                 /*
1144                                  * tbp->head isn't connected (it was 'prev'
1145                                  * so it had been taken off of the list
1146                                  * before). Therefore, we can realloc
1147                                  * the whole structures, as the other fields
1148                                  * are "static".
1149                                  */
1150                                 tbp = realloc(tbp->buf, sizeof(*tbp) + tot_len);
1151                                 tbp->buf = (void *)(tbp + 1);
1152                         }
1153
1154                         memcpy(tbp->buf + tbp->len, tmp->buf, tmp->len);
1155                         tbp->len = tot_len;
1156
1157                         free(tmp);
1158                 }
1159
1160                 /*
1161                  * See how many whole traces there are - send them
1162                  * all out in one go.
1163                  */
1164                 off = 0;
1165                 nevents = 0;
1166                 while (off + (int)sizeof(*t) <= tbp->len) {
1167                         t = (struct blk_io_trace *)(tbp->buf + off);
1168                         t_len = sizeof(*t) + t->pdu_len;
1169                         if (off + t_len > tbp->len)
1170                                 break;
1171
1172                         off += t_len;
1173                         nevents++;
1174                 }
1175                 if (nevents)
1176                         pdc_nev_update(tbp->dpp, tbp->cpu, nevents);
1177
1178                 /*
1179                  * Write any full set of traces, any remaining data is kept
1180                  * for the next pass.
1181                  */
1182                 if (off) {
1183                         if (write_data(tbp->buf, off) || off == tbp->len)
1184                                 free(tbp);
1185                         else {
1186                                 /*
1187                                  * Move valid data to beginning of buffer
1188                                  */
1189                                 tbp->len -= off;
1190                                 memmove(tbp->buf, tbp->buf + off, tbp->len);
1191                                 prev = tbp;
1192                         }
1193                 } else
1194                         prev = tbp;
1195         }
1196         hd->prev = prev;
1197
1198         return entries_handled;
1199 }
1200
1201 static void __process_trace_bufs(void)
1202 {
1203         int cpu;
1204         struct list_head *p;
1205         struct list_head list;
1206         int handled = 0;
1207
1208         __list_for_each(p, &devpaths) {
1209                 struct devpath *dpp = list_entry(p, struct devpath, head);
1210                 struct tracer_devpath_head *hd = dpp->heads;
1211
1212                 for (cpu = 0; cpu < ncpus; cpu++, hd++) {
1213                         pthread_mutex_lock(&hd->mutex);
1214                         if (list_empty(&hd->head)) {
1215                                 pthread_mutex_unlock(&hd->mutex);
1216                                 continue;
1217                         }
1218
1219                         list_replace_init(&hd->head, &list);
1220                         pthread_mutex_unlock(&hd->mutex);
1221
1222                         handled += handle_list(hd, &list);
1223                 }
1224         }
1225
1226         if (handled) {
1227                 pthread_mutex_lock(&dp_mutex);
1228                 dp_entries -= handled;
1229                 pthread_mutex_unlock(&dp_mutex);
1230         }
1231 }
1232
1233 static void process_trace_bufs(void)
1234 {
1235         while (!done) {
1236                 pthread_mutex_lock(&dp_mutex);
1237                 while (!done && dp_entries == 0) {
1238                         struct timespec ts;
1239
1240                         make_timespec(&ts, 50);
1241                         pthread_cond_timedwait(&dp_cond, &dp_mutex, &ts);
1242                 }
1243                 pthread_mutex_unlock(&dp_mutex);
1244
1245                 __process_trace_bufs();
1246         }
1247 }
1248
1249 static void clean_trace_bufs(void)
1250 {
1251         /*
1252          * No mutex needed here: we're only reading from the lists,
1253          * tracers are done
1254          */
1255         while (dp_entries)
1256                 __process_trace_bufs();
1257 }
1258
/*
 * Report a failed read of @ifn by the thread for @cpu on stderr, using
 * the current errno. Nothing is printed when errno is EAGAIN.
 */
static inline void read_err(int cpu, char *ifn)
{
        if (errno == EAGAIN)
                return;

        fprintf(stderr, "Thread %d failed read of %s: %d/%s\n",
                cpu, ifn, errno, strerror(errno));
}
1265
1266 static int net_sendfile(struct io_info *iop)
1267 {
1268         int ret;
1269
1270         ret = sendfile(iop->ofd, iop->ifd, NULL, iop->ready);
1271         if (ret < 0) {
1272                 perror("sendfile");
1273                 return 1;
1274         } else if (ret < (int)iop->ready) {
1275                 fprintf(stderr, "short sendfile send (%d of %d)\n",
1276                         ret, iop->ready);
1277                 return 1;
1278         }
1279
1280         return 0;
1281 }
1282
1283 static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
1284 {
1285         struct devpath *dpp = iop->dpp;
1286
1287         if (net_send_header(iop->ofd, tp->cpu, dpp->buts_name, iop->ready))
1288                 return 1;
1289         return net_sendfile(iop);
1290 }
1291
1292 static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
1293 {
1294         struct stat sb;
1295         int i, nentries = 0;
1296         struct pdc_stats *sp;
1297         struct pollfd *pfd = tp->pfds;
1298         struct io_info *iop = tp->ios;
1299
1300         for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++, sp++) {
1301                 if (pfd->revents & POLLIN || force_read) {
1302                         if (fstat(iop->ifd, &sb) < 0) {
1303                                 perror(iop->ifn);
1304                                 pfd->events = 0;
1305                         } else if (sb.st_size > (off_t)iop->data_queued) {
1306                                 iop->ready = sb.st_size - iop->data_queued;
1307                                 iop->data_queued = sb.st_size;
1308                                 if (!net_sendfile_data(tp, iop)) {
1309                                         pdc_dr_update(iop->dpp, tp->cpu,
1310                                                       iop->ready);
1311                                         nentries++;
1312                                 } else
1313                                         clear_events(pfd);
1314                         }
1315                         nevs--;
1316                 }
1317         }
1318
1319         if (nentries)
1320                 incr_entries(nentries);
1321
1322         return nentries;
1323 }
1324
1325 static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
1326 {
1327         int i, nentries = 0;
1328         struct trace_buf *tbp;
1329         struct pollfd *pfd = tp->pfds;
1330         struct io_info *iop = tp->ios;
1331
1332         tbp = alloc_trace_buf(tp->cpu, buf_size);
1333         for (i = 0; i < ndevs; i++, pfd++, iop++) {
1334                 if (pfd->revents & POLLIN || force_read) {
1335                         tbp->len = read(iop->ifd, tbp->buf, buf_size);
1336                         if (tbp->len > 0) {
1337                                 pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
1338                                 add_trace_buf(iop->dpp, tp->cpu, &tbp);
1339                                 nentries++;
1340                         } else if (tbp->len == 0) {
1341                                 /*
1342                                  * Short reads after we're done stop us
1343                                  * from trying reads.
1344                                  */
1345                                 if (tp->is_done)
1346                                         clear_events(pfd);
1347                         } else {
1348                                 read_err(tp->cpu, iop->ifn);
1349                                 if (errno != EAGAIN || tp->is_done)
1350                                         clear_events(pfd);
1351                         }
1352                         if (!piped_output && --nevs == 0)
1353                                 break;
1354                 }
1355         }
1356         free(tbp);
1357
1358         if (nentries)
1359                 incr_entries(nentries);
1360
1361         return nentries;
1362 }
1363
1364 static int fill_ofname(struct io_info *iop, int cpu)
1365 {
1366         int len;
1367         struct stat sb;
1368         char *dst = iop->ofn;
1369
1370         if (output_dir)
1371                 len = snprintf(iop->ofn, sizeof(iop->ofn), "%s/", output_dir);
1372         else
1373                 len = snprintf(iop->ofn, sizeof(iop->ofn), "./");
1374
1375         if (net_mode == Net_server) {
1376                 struct cl_conn *nc = iop->nc;
1377
1378                 len += sprintf(dst + len, "%s-", nc->ch->hostname);
1379                 len += strftime(dst + len, 64, "%F-%T/",
1380                                 gmtime(&iop->dpp->cl_connect_time));
1381         }
1382
1383         if (stat(iop->ofn, &sb) < 0) {
1384                 if (errno != ENOENT) {
1385                         fprintf(stderr,
1386                                 "Destination dir %s stat failed: %d/%s\n",
1387                                 iop->ofn, errno, strerror(errno));
1388                         return 1;
1389                 }
1390                 if (mkdir(iop->ofn, 0755) < 0) {
1391                         fprintf(stderr,
1392                                 "Destination dir %s can't be made: %d/%s\n",
1393                                 iop->ofn, errno, strerror(errno));
1394                         return 1;
1395                 }
1396         }
1397
1398         if (output_name)
1399                 snprintf(iop->ofn + len, sizeof(iop->ofn), "%s.blktrace.%d",
1400                          output_name, cpu);
1401         else
1402                 snprintf(iop->ofn + len, sizeof(iop->ofn), "%s.blktrace.%d",
1403                          iop->dpp->buts_name, cpu);
1404
1405         return 0;
1406 }
1407
1408 static int set_vbuf(struct io_info *iop, int mode, size_t size)
1409 {
1410         iop->obuf = malloc(size);
1411         if (setvbuf(iop->ofp, iop->obuf, mode, size) < 0) {
1412                 fprintf(stderr, "setvbuf(%s, %d) failed: %d/%s\n",
1413                         iop->dpp->path, (int)size, errno,
1414                         strerror(errno));
1415                 free(iop->obuf);
1416                 return 1;
1417         }
1418
1419         return 0;
1420 }
1421
1422 static int iop_open(struct io_info *iop, int cpu)
1423 {
1424         iop->ofd = -1;
1425         if (fill_ofname(iop, cpu))
1426                 return 1;
1427
1428         iop->ofp = my_fopen(iop->ofn, "w+");
1429         if (iop->ofp == NULL) {
1430                 fprintf(stderr, "Open output file %s failed: %d/%s\n",
1431                         iop->ofn, errno, strerror(errno));
1432                 return 1;
1433         }
1434         if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
1435                 fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
1436                         iop->ofn, errno, strerror(errno));
1437                 fclose(iop->ofp);
1438                 return 1;
1439         }
1440
1441         iop->ofd = fileno(iop->ofp);
1442         return 0;
1443 }
1444
1445 static int open_ios(struct tracer *tp)
1446 {
1447         struct pollfd *pfd;
1448         struct io_info *iop;
1449         struct list_head *p;
1450
1451         tp->ios = calloc(ndevs, sizeof(struct io_info));
1452         tp->pfds = calloc(ndevs, sizeof(struct pollfd));
1453
1454         memset(tp->ios, 0, ndevs * sizeof(struct io_info));
1455         memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));
1456
1457         tp->nios = 0;
1458         iop = tp->ios;
1459         pfd = tp->pfds;
1460         __list_for_each(p, &devpaths) {
1461                 struct devpath *dpp = list_entry(p, struct devpath, head);
1462
1463                 iop->dpp = dpp;
1464                 iop->ofd = -1;
1465                 snprintf(iop->ifn, sizeof(iop->ifn), "%s/block/%s/trace%d",
1466                         debugfs_path, dpp->buts_name, tp->cpu);
1467
1468                 iop->ifd = my_open(iop->ifn, O_RDONLY | O_NONBLOCK);
1469                 if (iop->ifd < 0) {
1470                         fprintf(stderr, "Thread %d failed open %s: %d/%s\n",
1471                                 tp->cpu, iop->ifn, errno, strerror(errno));
1472                         return 1;
1473                 }
1474
1475                 init_mmap_info(&iop->mmap_info);
1476
1477                 pfd->fd = iop->ifd;
1478                 pfd->events = POLLIN;
1479
1480                 if (piped_output)
1481                         ;
1482                 else if (net_client_use_sendfile()) {
1483                         iop->ofd = net_setup_client();
1484                         if (iop->ofd < 0)
1485                                 goto err;
1486                         net_send_open(iop->ofd, tp->cpu, dpp->buts_name);
1487                 } else if (net_mode == Net_none) {
1488                         if (iop_open(iop, tp->cpu))
1489                                 goto err;
1490                 } else {
1491                         /*
1492                          * This ensures that the server knows about all
1493                          * connections & devices before _any_ closes
1494                          */
1495                         net_send_open(cl_fds[tp->cpu], tp->cpu, dpp->buts_name);
1496                 }
1497
1498                 pfd++;
1499                 iop++;
1500                 tp->nios++;
1501         }
1502
1503         return 0;
1504
1505 err:
1506         close(iop->ifd);        /* tp->nios _not_ bumped */
1507         return 1;
1508 }
1509
1510 static void close_iop(struct io_info *iop)
1511 {
1512         struct mmap_info *mip = &iop->mmap_info;
1513
1514         if (mip->fs_buf)
1515                 munmap(mip->fs_buf, mip->fs_buf_len);
1516
1517         if (!piped_output) {
1518                 if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
1519                         fprintf(stderr,
1520                                 "Ignoring err: ftruncate(%s): %d/%s\n",
1521                                 iop->ofn, errno, strerror(errno));
1522                 }
1523         }
1524
1525         if (iop->ofp)
1526                 fclose(iop->ofp);
1527         if (iop->obuf)
1528                 free(iop->obuf);
1529 }
1530
1531 static void close_ios(struct tracer *tp)
1532 {
1533         while (tp->nios > 0) {
1534                 struct io_info *iop = &tp->ios[--tp->nios];
1535
1536                 iop->dpp->drops = get_drops(iop->dpp);
1537                 if (iop->ifd >= 0)
1538                         close(iop->ifd);
1539
1540                 if (iop->ofp)
1541                         close_iop(iop);
1542                 else if (iop->ofd >= 0) {
1543                         struct devpath *dpp = iop->dpp;
1544
1545                         net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
1546                         net_close_connection(&iop->ofd);
1547                 }
1548         }
1549
1550         free(tp->ios);
1551         free(tp->pfds);
1552 }
1553
1554 static int setup_mmap(int fd, unsigned int maxlen, struct mmap_info *mip)
1555 {
1556         if (mip->fs_off + maxlen > mip->fs_buf_len) {
1557                 unsigned long nr = max(16, mip->buf_nr);
1558
1559                 if (mip->fs_buf) {
1560                         munlock(mip->fs_buf, mip->fs_buf_len);
1561                         munmap(mip->fs_buf, mip->fs_buf_len);
1562                         mip->fs_buf = NULL;
1563                 }
1564
1565                 mip->fs_off = mip->fs_size & (mip->pagesize - 1);
1566                 mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
1567                 mip->fs_max_size += mip->fs_buf_len;
1568
1569                 if (ftruncate(fd, mip->fs_max_size) < 0) {
1570                         perror("__setup_mmap: ftruncate");
1571                         return 1;
1572                 }
1573
1574                 mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
1575                                       MAP_SHARED, fd,
1576                                       mip->fs_size - mip->fs_off);
1577                 if (mip->fs_buf == MAP_FAILED) {
1578                         perror("__setup_mmap: mmap");
1579                         return 1;
1580                 }
1581                 my_mlock(mip->fs_buf, mip->fs_buf_len);
1582         }
1583
1584         return 0;
1585 }
1586
1587 static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
1588 {
1589         struct mmap_info *mip;
1590         int i, ret, nentries = 0;
1591         struct pollfd *pfd = tp->pfds;
1592         struct io_info *iop = tp->ios;
1593
1594         for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++) {
1595                 if (pfd->revents & POLLIN || force_read) {
1596                         mip = &iop->mmap_info;
1597
1598                         ret = setup_mmap(iop->ofd, buf_size, mip);
1599                         if (ret < 0) {
1600                                 pfd->events = 0;
1601                                 break;
1602                         }
1603
1604                         ret = read(iop->ifd, mip->fs_buf + mip->fs_off,
1605                                    buf_size);
1606                         if (ret > 0) {
1607                                 pdc_dr_update(iop->dpp, tp->cpu, ret);
1608                                 mip->fs_size += ret;
1609                                 mip->fs_off += ret;
1610                                 nentries++;
1611                         } else if (ret == 0) {
1612                                 /*
1613                                  * Short reads after we're done stop us
1614                                  * from trying reads.
1615                                  */
1616                                 if (tp->is_done)
1617                                         clear_events(pfd);
1618                         } else {
1619                                 read_err(tp->cpu, iop->ifn);
1620                                 if (errno != EAGAIN || tp->is_done)
1621                                         clear_events(pfd);
1622                         }
1623                         nevs--;
1624                 }
1625         }
1626
1627         return nentries;
1628 }
1629
1630 static void *thread_main(void *arg)
1631 {
1632         int ret, ndone;
1633         int to_val;
1634
1635         struct tracer *tp = arg;
1636
1637         ret = lock_on_cpu(tp->cpu);
1638         if (ret)
1639                 goto err;
1640
1641         ret = open_ios(tp);
1642         if (ret) {
1643                 close_ios(tp);
1644                 goto err;
1645         }
1646
1647         pthread_mutex_lock(&tp->mutex);
1648         tp->running = 1;
1649         pthread_cond_signal(&tp->cond);
1650         pthread_mutex_unlock(&tp->mutex);
1651
1652         if (piped_output)
1653                 to_val = 50;            /* Frequent partial handles */
1654         else
1655                 to_val = 500;           /* 1/2 second intervals */
1656
1657         while (!tp->is_done) {
1658                 ndone = poll(tp->pfds, ndevs, to_val);
1659                 if (ndone || piped_output)
1660                         (void)handle_pfds(tp, ndone, piped_output);
1661                 else if (ndone < 0 && errno != EINTR)
1662                         fprintf(stderr, "Thread %d poll failed: %d/%s\n",
1663                                 tp->cpu, errno, strerror(errno));
1664         }
1665
1666         /*
1667          * Trace is stopped, pull data until we get a short read
1668          */
1669         while (handle_pfds(tp, ndevs, 1) > 0)
1670                 ;
1671
1672         close_ios(tp);
1673
1674 err:
1675         pthread_mutex_lock(&tp->mutex);
1676         tp->running = 0;
1677         tp->status = ret;
1678         pthread_cond_signal(&tp->cond);
1679         pthread_mutex_unlock(&tp->mutex);
1680         return NULL;
1681 }
1682
1683 static int start_tracer(int cpu)
1684 {
1685         struct tracer *tp;
1686
1687         tp = malloc(sizeof(*tp));
1688         memset(tp, 0, sizeof(*tp));
1689
1690         INIT_LIST_HEAD(&tp->head);
1691         pthread_mutex_init(&tp->mutex, NULL);
1692         pthread_cond_init(&tp->cond, NULL);
1693         tp->running = 0;
1694         tp->status = 0;
1695         tp->cpu = cpu;
1696
1697         if (pthread_create(&tp->thread, NULL, thread_main, tp)) {
1698                 fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
1699                         cpu, errno, strerror(errno));
1700                 goto err;
1701         }
1702
1703         pthread_mutex_lock(&tp->mutex);
1704         while (!tp->running && (tp->status == 0))
1705                 pthread_cond_wait(&tp->cond, &tp->mutex);
1706         pthread_mutex_unlock(&tp->mutex);
1707
1708         if (tp->status == 0) {
1709                 list_add_tail(&tp->head, &tracers);
1710                 return 0;
1711         }
1712
1713         fprintf(stderr, "FAILED to start thread on CPU %d\n", cpu);
1714
1715 err:
1716         pthread_mutex_destroy(&tp->mutex);
1717         pthread_cond_destroy(&tp->cond);
1718         free(tp);
1719         return 1;
1720 }
1721
1722 static int start_tracers(void)
1723 {
1724         int cpu;
1725
1726         for (cpu = 0; cpu < ncpus; cpu++)
1727                 if (start_tracer(cpu))
1728                         break;
1729
1730         return cpu;
1731 }
1732
1733 static void stop_tracers(void)
1734 {
1735         struct list_head *p;
1736
1737         /*
1738          * Stop the tracing - makes the tracer threads clean up quicker.
1739          */
1740         __list_for_each(p, &devpaths) {
1741                 struct devpath *dpp = list_entry(p, struct devpath, head);
1742                 (void)ioctl(dpp->fd, BLKTRACESTOP);
1743         }
1744
1745         /*
1746          * Tell each tracer to quit
1747          */
1748         __list_for_each(p, &tracers) {
1749                 struct tracer *tp = list_entry(p, struct tracer, head);
1750                 tp->is_done = 1;
1751         }
1752 }
1753
1754 static void del_tracers(void)
1755 {
1756         struct list_head *p, *q;
1757
1758         list_for_each_safe(p, q, &tracers) {
1759                 struct tracer *tp = list_entry(p, struct tracer, head);
1760
1761                 list_del(&tp->head);
1762                 free(tp);
1763         }
1764         ntracers = 0;
1765 }
1766
1767 static void wait_tracers(void)
1768 {
1769         struct list_head *p;
1770
1771         if (use_tracer_devpaths())
1772                 process_trace_bufs();
1773
1774         __list_for_each(p, &tracers) {
1775                 int ret;
1776                 struct tracer *tp = list_entry(p, struct tracer, head);
1777
1778                 pthread_mutex_lock(&tp->mutex);
1779                 while (tp->running)
1780                         pthread_cond_wait(&tp->cond, &tp->mutex);
1781                 pthread_mutex_unlock(&tp->mutex);
1782
1783                 ret = pthread_join(tp->thread, NULL);
1784                 if (ret)
1785                         fprintf(stderr, "Thread join %d failed %d\n",
1786                                 tp->cpu, ret);
1787         }
1788
1789         if (use_tracer_devpaths())
1790                 clean_trace_bufs();
1791
1792         get_all_drops();
1793 }
1794
/*
 * Shut tracing down: ignore SIGINT, SIGHUP, SIGTERM and SIGALRM from here
 * on, then stop the tracers, wait for them, delete them and release the
 * device paths.
 */
static void exit_tracing(void)
{
        static const int ignored[] = { SIGINT, SIGHUP, SIGTERM, SIGALRM };
        unsigned int i;

        for (i = 0; i < sizeof(ignored) / sizeof(ignored[0]); i++)
                signal(ignored[i], SIG_IGN);

        stop_tracers();
        wait_tracers();
        del_tracers();
        rel_devpaths();
}
1807
1808 static void handle_sigint(__attribute__((__unused__)) int sig)
1809 {
1810         done = 1;
1811         stop_tracers();
1812 }
1813
1814 static void show_stats(struct list_head *devpaths)
1815 {
1816         FILE *ofp;
1817         struct list_head *p;
1818         unsigned long long nevents, data_read;
1819         unsigned long long total_drops = 0;
1820         unsigned long long total_events = 0;
1821
1822         if (piped_output)
1823                 ofp = my_fopen("/dev/null", "w");
1824         else
1825                 ofp = stdout;
1826
1827         __list_for_each(p, devpaths) {
1828                 int cpu;
1829                 struct pdc_stats *sp;
1830                 struct devpath *dpp = list_entry(p, struct devpath, head);
1831
1832                 if (net_mode == Net_server)
1833                         printf("server: end of run for %s:%s\n",
1834                                 dpp->ch->hostname, dpp->buts_name);
1835
1836                 data_read = 0;
1837                 nevents = 0;
1838
1839                 fprintf(ofp, "=== %s ===\n", dpp->buts_name);
1840                 for (cpu = 0, sp = dpp->stats; cpu < dpp->ncpus; cpu++, sp++) {
1841                         /*
1842                          * Estimate events if not known...
1843                          */
1844                         if (sp->nevents == 0) {
1845                                 sp->nevents = sp->data_read /
1846                                                 sizeof(struct blk_io_trace);
1847                         }
1848
1849                         fprintf(ofp,
1850                                 "  CPU%3d: %20llu events, %8llu KiB data\n",
1851                                 cpu, sp->nevents, (sp->data_read + 1023) >> 10);
1852
1853                         data_read += sp->data_read;
1854                         nevents += sp->nevents;
1855                 }
1856
1857                 fprintf(ofp, "  Total:  %20llu events (dropped %llu),"
1858                              " %8llu KiB data\n", nevents,
1859                              dpp->drops, (data_read + 1024) >> 10);
1860
1861                 total_drops += dpp->drops;
1862                 total_events += (nevents + dpp->drops);
1863         }
1864
1865         fflush(ofp);
1866         if (piped_output)
1867                 fclose(ofp);
1868
1869         if (total_drops) {
1870                 double drops_ratio = 1.0;
1871
1872                 if (total_events)
1873                         drops_ratio = (double)total_drops/(double)total_events;
1874
1875                 fprintf(stderr, "\nYou have %llu (%5.1lf%%) dropped events\n"
1876                                 "Consider using a larger buffer size (-b) "
1877                                 "and/or more buffers (-n)\n",
1878                         total_drops, 100.0 * drops_ratio);
1879         }
1880 }
1881
1882 static int handle_args(int argc, char *argv[])
1883 {
1884         int c, i;
1885         struct statfs st;
1886         int act_mask_tmp = 0;
1887
1888         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
1889                 switch (c) {
1890                 case 'a':
1891                         i = find_mask_map(optarg);
1892                         if (i < 0) {
1893                                 fprintf(stderr, "Invalid action mask %s\n",
1894                                         optarg);
1895                                 return 1;
1896                         }
1897                         act_mask_tmp |= i;
1898                         break;
1899
1900                 case 'A':
1901                         if ((sscanf(optarg, "%x", &i) != 1) ||
1902                                                         !valid_act_opt(i)) {
1903                                 fprintf(stderr,
1904                                         "Invalid set action mask %s/0x%x\n",
1905                                         optarg, i);
1906                                 return 1;
1907                         }
1908                         act_mask_tmp = i;
1909                         break;
1910
1911                 case 'd':
1912                         if (add_devpath(optarg) != 0)
1913                                 return 1;
1914                         break;
1915
1916                 case 'I': {
1917                         char dev_line[256];
1918                         FILE *ifp = my_fopen(optarg, "r");
1919
1920                         if (!ifp) {
1921                                 fprintf(stderr,
1922                                         "Invalid file for devices %s\n",
1923                                         optarg);
1924                                 return 1;
1925                         }
1926
1927                         while (fscanf(ifp, "%s\n", dev_line) == 1)
1928                                 if (add_devpath(dev_line) != 0)
1929                                         return 1;
1930                         break;
1931                 }
1932
1933                 case 'r':
1934                         debugfs_path = optarg;
1935                         break;
1936
1937                 case 'o':
1938                         output_name = optarg;
1939                         break;
1940                 case 'k':
1941                         kill_running_trace = 1;
1942                         break;
1943                 case 'w':
1944                         stop_watch = atoi(optarg);
1945                         if (stop_watch <= 0) {
1946                                 fprintf(stderr,
1947                                         "Invalid stopwatch value (%d secs)\n",
1948                                         stop_watch);
1949                                 return 1;
1950                         }
1951                         break;
1952                 case 'V':
1953                 case 'v':
1954                         printf("%s version %s\n", argv[0], blktrace_version);
1955                         exit(0);
1956                         /*NOTREACHED*/
1957                 case 'b':
1958                         buf_size = strtoul(optarg, NULL, 10);
1959                         if (buf_size <= 0 || buf_size > 16*1024) {
1960                                 fprintf(stderr, "Invalid buffer size (%lu)\n",
1961                                         buf_size);
1962                                 return 1;
1963                         }
1964                         buf_size <<= 10;
1965                         break;
1966                 case 'n':
1967                         buf_nr = strtoul(optarg, NULL, 10);
1968                         if (buf_nr <= 0) {
1969                                 fprintf(stderr,
1970                                         "Invalid buffer nr (%lu)\n", buf_nr);
1971                                 return 1;
1972                         }
1973                         break;
1974                 case 'D':
1975                         output_dir = optarg;
1976                         break;
1977                 case 'h':
1978                         net_mode = Net_client;
1979                         strcpy(hostname, optarg);
1980                         break;
1981                 case 'l':
1982                         net_mode = Net_server;
1983                         break;
1984                 case 'p':
1985                         net_port = atoi(optarg);
1986                         break;
1987                 case 's':
1988                         net_use_sendfile = 0;
1989                         break;
1990                 default:
1991                         show_usage(argv[0]);
1992                         exit(1);
1993                         /*NOTREACHED*/
1994                 }
1995         }
1996
1997         while (optind < argc)
1998                 if (add_devpath(argv[optind++]) != 0)
1999                         return 1;
2000
2001         if (net_mode != Net_server && ndevs == 0) {
2002                 show_usage(argv[0]);
2003                 return 1;
2004         }
2005
2006         if (statfs(debugfs_path, &st) < 0 || st.f_type != (long)DEBUGFS_TYPE) {
2007                 fprintf(stderr, "Invalid debug path %s: %d/%s\n",
2008                         debugfs_path, errno, strerror(errno));
2009                 return 1;
2010         }
2011
2012         if (act_mask_tmp != 0)
2013                 act_mask = act_mask_tmp;
2014
2015         /*
2016          * Set up for appropriate PFD handler based upon output name.
2017          */
2018         if (net_client_use_sendfile())
2019                 handle_pfds = handle_pfds_netclient;
2020         else if (net_client_use_send())
2021                 handle_pfds = handle_pfds_entries;
2022         else if (output_name && (strcmp(output_name, "-") == 0)) {
2023                 piped_output = 1;
2024                 handle_pfds = handle_pfds_entries;
2025                 pfp = stdout;
2026                 setvbuf(pfp, NULL, _IONBF, 0);
2027         } else
2028                 handle_pfds = handle_pfds_file;
2029         return 0;
2030 }
2031
2032 static void ch_add_connection(struct net_server_s *ns, struct cl_host *ch,
2033                               int fd)
2034 {
2035         struct cl_conn *nc;
2036
2037         nc = malloc(sizeof(*nc));
2038         memset(nc, 0, sizeof(*nc));
2039
2040         time(&nc->connect_time);
2041         nc->ch = ch;
2042         nc->fd = fd;
2043         nc->ncpus = -1;
2044
2045         list_add_tail(&nc->ch_head, &ch->conn_list);
2046         ch->connects++;
2047
2048         list_add_tail(&nc->ns_head, &ns->conn_list);
2049         ns->connects++;
2050         ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
2051 }
2052
2053 static void ch_rem_connection(struct net_server_s *ns, struct cl_host *ch,
2054                               struct cl_conn *nc)
2055 {
2056         net_close_connection(&nc->fd);
2057
2058         list_del(&nc->ch_head);
2059         ch->connects--;
2060
2061         list_del(&nc->ns_head);
2062         ns->connects--;
2063         ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
2064
2065         free(nc);
2066 }
2067
2068 static struct cl_host *net_find_client_host(struct net_server_s *ns,
2069                                             struct in_addr cl_in_addr)
2070 {
2071         struct list_head *p;
2072
2073         __list_for_each(p, &ns->ch_list) {
2074                 struct cl_host *ch = list_entry(p, struct cl_host, head);
2075
2076                 if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
2077                         return ch;
2078         }
2079
2080         return NULL;
2081 }
2082
2083 static struct cl_host *net_add_client_host(struct net_server_s *ns,
2084                                            struct sockaddr_in *addr)
2085 {
2086         struct cl_host *ch;
2087
2088         ch = malloc(sizeof(*ch));
2089         memset(ch, 0, sizeof(*ch));
2090
2091         ch->ns = ns;
2092         ch->cl_in_addr = addr->sin_addr;
2093         list_add_tail(&ch->head, &ns->ch_list);
2094         ns->nchs++;
2095
2096         ch->hostname = strdup(inet_ntoa(addr->sin_addr));
2097         printf("server: connection from %s\n", ch->hostname);
2098
2099         INIT_LIST_HEAD(&ch->conn_list);
2100         INIT_LIST_HEAD(&ch->devpaths);
2101
2102         return ch;
2103 }
2104
2105 static void device_done(struct devpath *dpp, int ncpus)
2106 {
2107         int cpu;
2108         struct io_info *iop;
2109
2110         for (cpu = 0, iop = dpp->ios; cpu < ncpus; cpu++, iop++)
2111                 close_iop(iop);
2112
2113         list_del(&dpp->head);
2114         dpp_free(dpp);
2115 }
2116
2117 static void net_ch_remove(struct cl_host *ch, int ncpus)
2118 {
2119         struct list_head *p, *q;
2120         struct net_server_s *ns = ch->ns;
2121
2122         list_for_each_safe(p, q, &ch->devpaths) {
2123                 struct devpath *dpp = list_entry(p, struct devpath, head);
2124                 device_done(dpp, ncpus);
2125         }
2126
2127         list_for_each_safe(p, q, &ch->conn_list) {
2128                 struct cl_conn *nc = list_entry(p, struct cl_conn, ch_head);
2129
2130                 ch_rem_connection(ns, ch, nc);
2131         }
2132
2133         list_del(&ch->head);
2134         ns->nchs--;
2135
2136         if (ch->hostname)
2137                 free(ch->hostname);
2138         free(ch);
2139 }
2140
2141 static void net_add_connection(struct net_server_s *ns)
2142 {
2143         int fd;
2144         struct cl_host *ch;
2145         socklen_t socklen = sizeof(ns->addr);
2146
2147         fd = my_accept(ns->listen_fd, (struct sockaddr *)&ns->addr, &socklen);
2148         if (fd < 0) {
2149                 /*
2150                  * This is OK: we just won't accept this connection,
2151                  * nothing fatal.
2152                  */
2153                 perror("accept");
2154         } else {
2155                 ch = net_find_client_host(ns, ns->addr.sin_addr);
2156                 if (!ch)
2157                         ch = net_add_client_host(ns, &ns->addr);
2158
2159                 ch_add_connection(ns, ch, fd);
2160         }
2161 }
2162
2163 static struct devpath *nc_add_dpp(struct cl_conn *nc,
2164                                   struct blktrace_net_hdr *bnh,
2165                                   time_t connect_time)
2166 {
2167         int cpu;
2168         struct io_info *iop;
2169         struct devpath *dpp;
2170
2171         dpp = malloc(sizeof(*dpp));
2172         memset(dpp, 0, sizeof(*dpp));
2173
2174         dpp->buts_name = strdup(bnh->buts_name);
2175         dpp->path = strdup(bnh->buts_name);
2176         dpp->fd = -1;
2177         dpp->ch = nc->ch;
2178         dpp->cl_id = bnh->cl_id;
2179         dpp->cl_connect_time = connect_time;
2180         dpp->ncpus = nc->ncpus;
2181         dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
2182         memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
2183
2184         list_add_tail(&dpp->head, &nc->ch->devpaths);
2185         nc->ch->ndevs++;
2186
2187         dpp->ios = calloc(nc->ncpus, sizeof(*iop));
2188         memset(dpp->ios, 0, ndevs * sizeof(*iop));
2189
2190         for (cpu = 0, iop = dpp->ios; cpu < nc->ncpus; cpu++, iop++) {
2191                 iop->dpp = dpp;
2192                 iop->nc = nc;
2193                 init_mmap_info(&iop->mmap_info);
2194
2195                 if (iop_open(iop, cpu))
2196                         goto err;
2197         }
2198
2199         return dpp;
2200
2201 err:
2202         /*
2203          * Need to unravel what's been done...
2204          */
2205         while (cpu >= 0)
2206                 close_iop(&dpp->ios[cpu--]);
2207         dpp_free(dpp);
2208
2209         return NULL;
2210 }
2211
2212 static struct devpath *nc_find_dpp(struct cl_conn *nc,
2213                                    struct blktrace_net_hdr *bnh)
2214 {
2215         struct list_head *p;
2216         time_t connect_time = nc->connect_time;
2217
2218         __list_for_each(p, &nc->ch->devpaths) {
2219                 struct devpath *dpp = list_entry(p, struct devpath, head);
2220
2221                 if (!strcmp(dpp->buts_name, bnh->buts_name))
2222                         return dpp;
2223
2224                 if (dpp->cl_id == bnh->cl_id)
2225                         connect_time = dpp->cl_connect_time;
2226         }
2227
2228         return nc_add_dpp(nc, bnh, connect_time);
2229 }
2230
2231 static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
2232                                  struct blktrace_net_hdr *bnh)
2233 {
2234         int ret;
2235         struct io_info *iop = &dpp->ios[bnh->cpu];
2236         struct mmap_info *mip = &iop->mmap_info;
2237
2238         if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info)) {
2239                 fprintf(stderr, "ncd(%s:%d): mmap failed\n",
2240                         nc->ch->hostname, nc->fd);
2241                 exit(1);
2242         }
2243
2244         ret = net_recv_data(nc->fd, mip->fs_buf + mip->fs_off, bnh->len);
2245         if (ret > 0) {
2246                 pdc_dr_update(dpp, bnh->cpu, ret);
2247                 mip->fs_size += ret;
2248                 mip->fs_off += ret;
2249         } else if (ret < 0)
2250                 exit(1);
2251 }
2252
2253 /*
2254  * Returns 1 if we closed a host - invalidates other polling information
2255  * that may be present.
2256  */
2257 static int net_client_data(struct cl_conn *nc)
2258 {
2259         int ret;
2260         struct devpath *dpp;
2261         struct blktrace_net_hdr bnh;
2262
2263         ret = net_get_header(nc, &bnh);
2264         if (ret == 0)
2265                 return 0;
2266
2267         if (ret < 0) {
2268                 fprintf(stderr, "ncd(%d): header read failed\n", nc->fd);
2269                 exit(1);
2270         }
2271
2272         if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
2273                 fprintf(stderr, "ncd(%d): received data is bad\n", nc->fd);
2274                 exit(1);
2275         }
2276
2277         if (!data_is_native) {
2278                 bnh.magic = be32_to_cpu(bnh.magic);
2279                 bnh.cpu = be32_to_cpu(bnh.cpu);
2280                 bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
2281                 bnh.len = be32_to_cpu(bnh.len);
2282                 bnh.cl_id = be32_to_cpu(bnh.cl_id);
2283                 bnh.buf_size = be32_to_cpu(bnh.buf_size);
2284                 bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
2285                 bnh.page_size = be32_to_cpu(bnh.page_size);
2286         }
2287
2288         if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2289                 fprintf(stderr, "ncd(%s:%d): bad data magic\n",
2290                         nc->ch->hostname, nc->fd);
2291                 exit(1);
2292         }
2293
2294         if (nc->ncpus == -1)
2295                 nc->ncpus = bnh.max_cpus;
2296
2297         /*
2298          * len == 0 means the other end is sending us a new connection/dpp
2299          * len == 1 means that the other end signalled end-of-run
2300          */
2301         dpp = nc_find_dpp(nc, &bnh);
2302         if (bnh.len == 0) {
2303                 /*
2304                  * Just adding in the dpp above is enough
2305                  */
2306                 ack_open_close(nc->fd, dpp->buts_name);
2307                 nc->ch->cl_opens++;
2308         } else if (bnh.len == 1) {
2309                 /*
2310                  * overload cpu count with dropped events
2311                  */
2312                 dpp->drops = bnh.cpu;
2313
2314                 ack_open_close(nc->fd, dpp->buts_name);
2315                 if (--nc->ch->cl_opens == 0) {
2316                         show_stats(&nc->ch->devpaths);
2317                         net_ch_remove(nc->ch, nc->ncpus);
2318                         return 1;
2319                 }
2320         } else
2321                 net_client_read_data(nc, dpp, &bnh);
2322
2323         return 0;
2324 }
2325
2326 static void handle_client_data(struct net_server_s *ns, int events)
2327 {
2328         struct cl_conn *nc;
2329         struct pollfd *pfd;
2330         struct list_head *p, *q;
2331
2332         pfd = &ns->pfds[1];
2333         list_for_each_safe(p, q, &ns->conn_list) {
2334                 if (pfd->revents & POLLIN) {
2335                         nc = list_entry(p, struct cl_conn, ns_head);
2336
2337                         if (net_client_data(nc) || --events == 0)
2338                                 break;
2339                 }
2340                 pfd++;
2341         }
2342 }
2343
2344 static void net_setup_pfds(struct net_server_s *ns)
2345 {
2346         struct pollfd *pfd;
2347         struct list_head *p;
2348
2349         ns->pfds[0].fd = ns->listen_fd;
2350         ns->pfds[0].events = POLLIN;
2351
2352         pfd = &ns->pfds[1];
2353         __list_for_each(p, &ns->conn_list) {
2354                 struct cl_conn *nc = list_entry(p, struct cl_conn, ns_head);
2355
2356                 pfd->fd = nc->fd;
2357                 pfd->events = POLLIN;
2358                 pfd++;
2359         }
2360 }
2361
2362 static int net_server_handle_connections(struct net_server_s *ns)
2363 {
2364         int events;
2365
2366         printf("server: waiting for connections...\n");
2367
2368         while (!done) {
2369                 net_setup_pfds(ns);
2370                 events = poll(ns->pfds, ns->connects + 1, -1);
2371                 if (events < 0) {
2372                         if (errno != EINTR) {
2373                                 perror("FATAL: poll error");
2374                                 return 1;
2375                         }
2376                 } else if (events > 0) {
2377                         if (ns->pfds[0].revents & POLLIN) {
2378                                 net_add_connection(ns);
2379                                 events--;
2380                         }
2381
2382                         if (events)
2383                                 handle_client_data(ns, events);
2384                 }
2385         }
2386
2387         return 0;
2388 }
2389
2390 static int net_server(void)
2391 {
2392         int fd, opt;
2393         int ret = 1;
2394         struct net_server_s net_server;
2395         struct net_server_s *ns = &net_server;
2396
2397         memset(ns, 0, sizeof(*ns));
2398         INIT_LIST_HEAD(&ns->ch_list);
2399         INIT_LIST_HEAD(&ns->conn_list);
2400         ns->pfds = malloc(sizeof(struct pollfd));
2401
2402         fd = my_socket(AF_INET, SOCK_STREAM, 0);
2403         if (fd < 0) {
2404                 perror("server: socket");
2405                 goto out;
2406         }
2407
2408         opt = 1;
2409         if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
2410                 perror("setsockopt");
2411                 goto out;
2412         }
2413
2414         memset(&ns->addr, 0, sizeof(ns->addr));
2415         ns->addr.sin_family = AF_INET;
2416         ns->addr.sin_addr.s_addr = htonl(INADDR_ANY);
2417         ns->addr.sin_port = htons(net_port);
2418
2419         if (bind(fd, (struct sockaddr *) &ns->addr, sizeof(ns->addr)) < 0) {
2420                 perror("bind");
2421                 goto out;
2422         }
2423
2424         if (listen(fd, 1) < 0) {
2425                 perror("listen");
2426                 goto out;
2427         }
2428
2429         /*
2430          * The actual server looping is done here:
2431          */
2432         ns->listen_fd = fd;
2433         ret = net_server_handle_connections(ns);
2434
2435         /*
2436          * Clean up and return...
2437          */
2438 out:
2439         free(ns->pfds);
2440         return ret;
2441 }
2442
2443 int main(int argc, char *argv[])
2444 {
2445         int ret = 0;
2446
2447         setlocale(LC_NUMERIC, "en_US");
2448         pagesize = getpagesize();
2449         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
2450         if (ncpus < 0) {
2451                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed %d/%s\n",
2452                         errno, strerror(errno));
2453                 ret = 1;
2454                 goto out;
2455         }
2456
2457         if (handle_args(argc, argv)) {
2458                 ret = 1;
2459                 goto out;
2460         }
2461
2462         signal(SIGINT, handle_sigint);
2463         signal(SIGHUP, handle_sigint);
2464         signal(SIGTERM, handle_sigint);
2465         signal(SIGALRM, handle_sigint);
2466         signal(SIGPIPE, SIG_IGN);
2467
2468         if (kill_running_trace) {
2469                 struct devpath *dpp;
2470                 struct list_head *p;
2471
2472                 __list_for_each(p, &devpaths) {
2473                         dpp = list_entry(p, struct devpath, head);
2474                         if (__stop_trace(dpp->fd)) {
2475                                 fprintf(stderr,
2476                                         "BLKTRACETEARDOWN %s failed: %d/%s\n",
2477                                         dpp->path, errno, strerror(errno));
2478                         }
2479                 }
2480         } else if (net_mode == Net_server) {
2481                 if (output_name) {
2482                         fprintf(stderr, "-o ignored in server mode\n");
2483                         output_name = NULL;
2484                 }
2485
2486                 ret = net_server();
2487         } else {
2488                 atexit(exit_tracing);
2489
2490                 if (net_mode == Net_client)
2491                         printf("blktrace: connecting to %s\n", hostname);
2492
2493                 setup_buts();
2494
2495                 if (use_tracer_devpaths()) {
2496                         if (setup_tracer_devpaths())
2497                                 goto out;
2498
2499                         if (piped_output)
2500                                 handle_list = handle_list_file;
2501                         else
2502                                 handle_list = handle_list_net;
2503                 }
2504
2505                 ntracers = start_tracers();
2506                 if (ntracers != ncpus)
2507                         stop_tracers();
2508                 else {
2509                         if (net_mode == Net_client)
2510                                 printf("blktrace: connected!\n");
2511                         if (stop_watch)
2512                                 alarm(stop_watch);
2513                 }
2514
2515                 wait_tracers();
2516                 if (ntracers == ncpus)
2517                         show_stats(&devpaths);
2518
2519                 if (net_client_use_send())
2520                         close_client_connections();
2521                 del_tracers();
2522         }
2523
2524 out:
2525         if (pfp)
2526                 fclose(pfp);
2527         rel_devpaths();
2528         return ret;
2529 }