[PATCH] blktrace: fixes for the new net client sendfile
[blktrace.git] / blktrace.c
CommitLineData
d0ca268b
JA
1/*
2 * block queue tracing application
3 *
d956a2cd
JA
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
d0ca268b
JA
20 */
21#include <pthread.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24#include <unistd.h>
25#include <locale.h>
26#include <signal.h>
27#include <fcntl.h>
28#include <string.h>
29#include <sys/ioctl.h>
b9d4294e 30#include <sys/param.h>
e3e74029 31#include <sys/statfs.h>
eb3c8108 32#include <sys/poll.h>
b7106311 33#include <sys/mman.h>
8e86c98a 34#include <sys/socket.h>
d0ca268b
JA
35#include <stdio.h>
36#include <stdlib.h>
37#include <sched.h>
d39c04ca
AB
38#include <ctype.h>
39#include <getopt.h>
da39451f 40#include <errno.h>
8e86c98a
JA
41#include <netinet/in.h>
42#include <arpa/inet.h>
43#include <netdb.h>
32f18c48 44#include <sys/sendfile.h>
d0ca268b
JA
45
46#include "blktrace.h"
21f55651 47#include "barrier.h"
d0ca268b 48
/* version string of this tool */
static char blktrace_version[] = "0.99.1";

/*
 * Default relay sub-buffer size and count. You may want to increase these
 * even more if you are logging at a high rate and see skipped/missed events.
 */
#define BUF_SIZE	(512 * 1024)
#define BUF_NR		(4)

/* stdio buffer size used for regular output files */
#define OFILE_BUF	(128 * 1024)

#define DEBUGFS_TYPE	0x64626720

/* short option string matching the l_opts[] table */
#define S_OPTS	"d:a:A:r:o:kw:Vb:n:D:lh:p:s"
/*
 * Long option table for getopt_long(); each .val is the matching short
 * option character. The table ends with an entry whose name is NULL.
 */
static struct option l_opts[] = {
	{ .name = "dev",             .has_arg = required_argument, .flag = NULL, .val = 'd' },
	{ .name = "act-mask",        .has_arg = required_argument, .flag = NULL, .val = 'a' },
	{ .name = "set-mask",        .has_arg = required_argument, .flag = NULL, .val = 'A' },
	{ .name = "relay",           .has_arg = required_argument, .flag = NULL, .val = 'r' },
	{ .name = "output",          .has_arg = required_argument, .flag = NULL, .val = 'o' },
	{ .name = "kill",            .has_arg = no_argument,       .flag = NULL, .val = 'k' },
	{ .name = "stopwatch",       .has_arg = required_argument, .flag = NULL, .val = 'w' },
	{ .name = "version",         .has_arg = no_argument,       .flag = NULL, .val = 'V' },
	{ .name = "buffer-size",     .has_arg = required_argument, .flag = NULL, .val = 'b' },
	{ .name = "num-sub-buffers", .has_arg = required_argument, .flag = NULL, .val = 'n' },
	{ .name = "output-dir",      .has_arg = required_argument, .flag = NULL, .val = 'D' },
	{ .name = "listen",          .has_arg = no_argument,       .flag = NULL, .val = 'l' },
	{ .name = "host",            .has_arg = required_argument, .flag = NULL, .val = 'h' },
	{ .name = "port",            .has_arg = required_argument, .flag = NULL, .val = 'p' },
	{ .name = "no-sendfile",     .has_arg = no_argument,       .flag = NULL, .val = 's' },
	{ .name = NULL, }
};
158
/*
 * One chunk of trace data handed from a reader thread to the writer.
 */
struct tip_subbuf {
	void *buf;		/* data storage (NULL on the sendfile path) */
	unsigned int len;	/* bytes of valid data */
	unsigned int max_len;	/* capacity of buf */
};
164
21f55651
JA
#define FIFO_SIZE	(1024)	/* should be plenty big! */
#define CL_SIZE		(128)	/* cache line, any bigger? */

/*
 * Ring of sub-buffer pointers. tail and head are each aligned to CL_SIZE so
 * that they do not share a cache line.
 */
struct tip_subbuf_fifo {
	int tail __attribute__((aligned(CL_SIZE)));
	int head __attribute__((aligned(CL_SIZE)));
	struct tip_subbuf *q[FIFO_SIZE];
};
173
d0ca268b
JA
174struct thread_information {
175 int cpu;
176 pthread_t thread;
b9d4294e
JA
177
178 int fd;
a3e4d330 179 void *fd_buf;
b9d4294e
JA
180 char fn[MAXPATHLEN + 64];
181
007c233c
JA
182 FILE *ofile;
183 char *ofile_buffer;
32f18c48 184 off_t ofile_offset;
9db17354 185 int ofile_stdout;
8e86c98a 186 int ofile_mmap;
007c233c 187
0cc7d25e
JA
188 int (*get_subbuf)(struct thread_information *, unsigned int);
189 int (*flush_subbuf)(struct thread_information *, struct tip_subbuf *);
190 int (*read_data)(struct thread_information *, void *, unsigned int);
191
d0ca268b 192 unsigned long events_processed;
b7106311 193 unsigned long long data_read;
bcbeb60f 194 unsigned long long data_queued;
e7c9f3ff 195 struct device_information *device;
9db17354
JA
196
197 int exited;
198
b7106311
JA
199 /*
200 * piped fifo buffers
201 */
21f55651 202 struct tip_subbuf_fifo fifo;
7de86b12 203 struct tip_subbuf *leftover_ts;
b7106311
JA
204
205 /*
206 * mmap controlled output files
207 */
208 unsigned long long fs_size;
209 unsigned long long fs_max_size;
210 unsigned long fs_off;
211 void *fs_buf;
212 unsigned long fs_buf_len;
ff11d54c
TZ
213
214 struct net_connection *nc;
d0ca268b
JA
215};
216
e7c9f3ff
NS
217struct device_information {
218 int fd;
219 char *path;
220 char buts_name[32];
99c1f5ab 221 volatile int trace_started;
eb3c8108 222 unsigned long drop_count;
e7c9f3ff 223 struct thread_information *threads;
ff11d54c
TZ
224
225 struct cl_host *ch;
226 u32 cl_id;
227 time_t cl_connect_time;
e7c9f3ff 228};
d0ca268b 229
e7c9f3ff 230static int ncpus;
d0ca268b 231static struct thread_information *thread_information;
e7c9f3ff
NS
232static int ndevs;
233static struct device_information *device_information;
234
235/* command line option globals */
3d06efea 236static char *debugfs_path;
d5396421 237static char *output_name;
d1d7f15f 238static char *output_dir;
5c86134e 239static int act_mask = ~0U;
bc39777c 240static int kill_running_trace;
eb3c8108
JA
241static unsigned long buf_size = BUF_SIZE;
242static unsigned long buf_nr = BUF_NR;
b7106311 243static unsigned int page_size;
d39c04ca 244
e7c9f3ff
NS
/* run-state flags; the accessor macros force a volatile read */
#define is_done()	(*(volatile int *)(&done))
static volatile int done;

#define is_trace_stopped()	(*(volatile int *)(&trace_stopped))
static volatile int trace_stopped;

#define is_stat_shown()	(*(volatile int *)(&stat_shown))
static volatile int stat_shown;

int data_is_native = -1;

static void exit_trace(int status);
257
99c1f5ab
JA
/* read / update a device's trace_started flag */
#define dip_tracing(dip)	(*(volatile int *)(&(dip)->trace_started))
#define dip_set_tracing(dip, v)	((dip)->trace_started = (v))

/* walk __e device entries starting at __di */
#define __for_each_dip(__d, __di, __e, __i)	\
	for (__i = 0, __d = __di; __i < __e; __i++, __d++)

/* walk the local device table / the device table of a connection's host */
#define for_each_dip(__d, __i)		\
	__for_each_dip(__d, device_information, ndevs, __i)
#define for_each_nc_dip(__nc, __d, __i)		\
	__for_each_dip(__d, (__nc)->ch->device_information, (__nc)->ch->ndevs, __i)

/* walk the per-cpu thread entries of device __d */
#define __for_each_tip(__d, __t, __ncpus, __j)	\
	for (__j = 0, __t = (__d)->threads; __j < __ncpus; __j++, __t++)
#define for_each_tip(__d, __t, __j)	\
	__for_each_tip(__d, __t, ncpus, __j)

/* walk the linked list of client hosts */
#define for_each_cl_host(__c)	\
	for (__c = cl_host_list; __c; __c = __c->list_next)
99c1f5ab 275
8e86c98a
JA
276/*
277 * networking stuff follows. we include a magic number so we know whether
278 * to endianness convert or not
279 */
280struct blktrace_net_hdr {
281 u32 magic; /* same as trace magic */
22cd0c02 282 char buts_name[32]; /* trace name */
8e86c98a 283 u32 cpu; /* for which cpu */
22cd0c02 284 u32 max_cpus;
8e86c98a 285 u32 len; /* length of following trace data */
ff11d54c 286 u32 cl_id; /* id for set of client per-cpu connections */
8e86c98a
JA
287};
288
/* default TCP port */
#define TRACE_NET_PORT		(8462)

/* values for net_mode */
enum {
	Net_none = 0,
	Net_server,
	Net_client,
};

/*
 * network cmd line params
 */
static char hostname[MAXHOSTNAMELEN];
static int net_port = TRACE_NET_PORT;
static int net_mode = 0;
static int net_use_sendfile = 1;
8e86c98a 304
ff11d54c
TZ
/*
 * One client host, linked into cl_host_list via list_next.
 */
struct cl_host {
	struct cl_host *list_next;
	struct in_addr cl_in_addr;		/* client address */
	struct net_connection *net_connections;
	int nconn;
	struct device_information *device_information;
	int ndevs;
	int ncpus;
	int ndevs_done;
};
315
ff11d54c
TZ
/*
 * One network connection and the client host (ch) it belongs to.
 */
struct net_connection {
	int in_fd;		/* socket that read_data_net() receives from */
	struct pollfd pfd;
	time_t connect_time;
	struct cl_host *ch;
	int ncpus;
};
323
#define NET_MAX_CL_HOSTS	(1024)

/* head of the client host list, plus host and connection counters */
static struct cl_host *cl_host_list;
static int cl_hosts;
static int net_connects;

/* client-side output sockets, indexed by cpu */
static int *net_out_fd;
8e86c98a
JA
330
331static void handle_sigint(__attribute__((__unused__)) int sig)
332{
7035d92d
JA
333 struct device_information *dip;
334 int i;
335
336 /*
337 * stop trace so we can reap currently produced data
338 */
339 for_each_dip(dip, i) {
921b05fe
JA
340 if (dip->fd == -1)
341 continue;
7035d92d
JA
342 if (ioctl(dip->fd, BLKTRACESTOP) < 0)
343 perror("BLKTRACESTOP");
344 }
345
8e86c98a
JA
346 done = 1;
347}
348
eb3c8108
JA
349static int get_dropped_count(const char *buts_name)
350{
351 int fd;
352 char tmp[MAXPATHLEN + 64];
353
354 snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
3d06efea 355 debugfs_path, buts_name);
eb3c8108
JA
356
357 fd = open(tmp, O_RDONLY);
358 if (fd < 0) {
359 /*
360 * this may be ok, if the kernel doesn't support dropped counts
361 */
362 if (errno == ENOENT)
363 return 0;
364
365 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
366 return -1;
367 }
368
369 if (read(fd, tmp, sizeof(tmp)) < 0) {
370 perror(tmp);
371 close(fd);
372 return -1;
373 }
374
375 close(fd);
376
377 return atoi(tmp);
378}
379
e7c9f3ff 380static int start_trace(struct device_information *dip)
d0ca268b
JA
381{
382 struct blk_user_trace_setup buts;
383
1f79c4a0 384 memset(&buts, 0, sizeof(buts));
129aa440
JA
385 buts.buf_size = buf_size;
386 buts.buf_nr = buf_nr;
d39c04ca 387 buts.act_mask = act_mask;
d0ca268b 388
ed71a31e
JA
389 if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
390 perror("BLKTRACESETUP");
391 return 1;
392 }
393
394 if (ioctl(dip->fd, BLKTRACESTART) < 0) {
395 perror("BLKTRACESTART");
d0ca268b
JA
396 return 1;
397 }
398
e7c9f3ff 399 memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
99c1f5ab 400 dip_set_tracing(dip, 1);
d0ca268b
JA
401 return 0;
402}
403
e7c9f3ff 404static void stop_trace(struct device_information *dip)
d0ca268b 405{
99c1f5ab
JA
406 if (dip_tracing(dip) || kill_running_trace) {
407 dip_set_tracing(dip, 0);
cf9208ea 408
7035d92d
JA
409 /*
410 * should be stopped, just don't complain if it isn't
411 */
412 ioctl(dip->fd, BLKTRACESTOP);
413
ed71a31e
JA
414 if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
415 perror("BLKTRACETEARDOWN");
cf9208ea 416
e7c9f3ff 417 close(dip->fd);
cf9208ea 418 dip->fd = -1;
707b0914 419 }
d0ca268b
JA
420}
421
e7c9f3ff
NS
422static void stop_all_traces(void)
423{
424 struct device_information *dip;
425 int i;
426
eb3c8108
JA
427 for_each_dip(dip, i) {
428 dip->drop_count = get_dropped_count(dip->buts_name);
e7c9f3ff 429 stop_trace(dip);
eb3c8108 430 }
e7c9f3ff
NS
431}
432
ff11d54c 433static void wait_for_data(struct thread_information *tip, int timeout)
eb3c8108 434{
ff11d54c 435 struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
eb3c8108 436
ff11d54c
TZ
437 while (!is_done()) {
438 if (poll(&pfd, 1, timeout) < 0) {
7934e668
JA
439 perror("poll");
440 break;
441 }
ff11d54c 442 if (pfd.revents & POLLIN)
9db17354
JA
443 break;
444 if (tip->ofile_stdout)
445 break;
ff11d54c 446 }
eb3c8108
JA
447}
448
0cc7d25e
JA
449static int read_data_file(struct thread_information *tip, void *buf,
450 unsigned int len)
d0ca268b 451{
ae9f71b3 452 int ret = 0;
bbabf03a 453
9db17354 454 do {
ff11d54c 455 wait_for_data(tip, 100);
ae9f71b3 456
9db17354
JA
457 ret = read(tip->fd, buf, len);
458 if (!ret)
459 continue;
460 else if (ret > 0)
461 return ret;
462 else {
bbabf03a 463 if (errno != EAGAIN) {
a3e4d330
JA
464 perror(tip->fn);
465 fprintf(stderr,"Thread %d failed read of %s\n",
466 tip->cpu, tip->fn);
467 break;
468 }
9db17354 469 continue;
bbabf03a 470 }
9db17354 471 } while (!is_done());
8a43bac5 472
bbabf03a 473 return ret;
8e86c98a 474
8a43bac5
JA
475}
476
0cc7d25e
JA
477static int read_data_net(struct thread_information *tip, void *buf,
478 unsigned int len)
8e86c98a 479{
ff11d54c 480 struct net_connection *nc = tip->nc;
8e86c98a
JA
481 unsigned int bytes_left = len;
482 int ret = 0;
483
484 do {
e0a1988b 485 ret = recv(nc->in_fd, buf, bytes_left, MSG_WAITALL);
8e86c98a
JA
486
487 if (!ret)
488 continue;
489 else if (ret < 0) {
490 if (errno != EAGAIN) {
491 perror(tip->fn);
492 fprintf(stderr, "server: failed read\n");
493 return 0;
494 }
495 continue;
496 } else {
497 buf += ret;
498 bytes_left -= ret;
499 }
500 } while (!is_done() && bytes_left);
501
410d7c62 502 return len - bytes_left;
8e86c98a
JA
503}
504
8e86c98a
JA
505static inline struct tip_subbuf *
506subbuf_fifo_dequeue(struct thread_information *tip)
a3e4d330 507{
21f55651
JA
508 const int head = tip->fifo.head;
509 const int next = (head + 1) & (FIFO_SIZE - 1);
510
511 if (head != tip->fifo.tail) {
512 struct tip_subbuf *ts = tip->fifo.q[head];
513
514 store_barrier();
515 tip->fifo.head = next;
516 return ts;
517 }
518
519 return NULL;
9db17354 520}
eb3c8108 521
21f55651
JA
522static inline int subbuf_fifo_queue(struct thread_information *tip,
523 struct tip_subbuf *ts)
9db17354 524{
21f55651
JA
525 const int tail = tip->fifo.tail;
526 const int next = (tail + 1) & (FIFO_SIZE - 1);
527
528 if (next != tip->fifo.head) {
529 tip->fifo.q[tail] = ts;
530 store_barrier();
531 tip->fifo.tail = next;
532 return 0;
533 }
534
535 fprintf(stderr, "fifo too small!\n");
536 return 1;
a3e4d330
JA
537}
538
b7106311
JA
539/*
540 * For file output, truncate and mmap the file appropriately
541 */
8e86c98a 542static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
b7106311
JA
543{
544 int ofd = fileno(tip->ofile);
545 int ret;
546
547 /*
548 * extend file, if we have to. use chunks of 16 subbuffers.
549 */
550 if (tip->fs_off + buf_size > tip->fs_buf_len) {
551 if (tip->fs_buf) {
5975d309 552 munlock(tip->fs_buf, tip->fs_buf_len);
b7106311
JA
553 munmap(tip->fs_buf, tip->fs_buf_len);
554 tip->fs_buf = NULL;
555 }
556
557 tip->fs_off = tip->fs_size & (page_size - 1);
558 tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
559 tip->fs_max_size += tip->fs_buf_len;
560
561 if (ftruncate(ofd, tip->fs_max_size) < 0) {
562 perror("ftruncate");
563 return -1;
564 }
565
566 tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
567 MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
568 if (tip->fs_buf == MAP_FAILED) {
569 perror("mmap");
570 return -1;
571 }
5975d309 572 mlock(tip->fs_buf, tip->fs_buf_len);
b7106311
JA
573 }
574
7934e668 575 ret = tip->read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
b7106311 576 if (ret >= 0) {
dbfbd6db 577 tip->data_read += ret;
b7106311
JA
578 tip->fs_size += ret;
579 tip->fs_off += ret;
580 return 0;
581 }
582
583 return -1;
584}
585
18eed2a7
JA
586/*
587 * Use the copy approach for pipes and network
588 */
589static int get_subbuf(struct thread_information *tip, unsigned int maxlen)
590{
591 struct tip_subbuf *ts = malloc(sizeof(*ts));
592 int ret;
593
594 ts->buf = malloc(buf_size);
595 ts->max_len = maxlen;
596
7934e668 597 ret = tip->read_data(tip, ts->buf, ts->max_len);
18eed2a7
JA
598 if (ret > 0) {
599 ts->len = ret;
dbfbd6db 600 tip->data_read += ret;
7035d92d
JA
601 if (subbuf_fifo_queue(tip, ts))
602 return -1;
18eed2a7
JA
603 }
604
605 return ret;
606}
607
9db17354 608static void close_thread(struct thread_information *tip)
a3e4d330 609{
9db17354
JA
610 if (tip->fd != -1)
611 close(tip->fd);
612 if (tip->ofile)
613 fclose(tip->ofile);
614 if (tip->ofile_buffer)
615 free(tip->ofile_buffer);
616 if (tip->fd_buf)
617 free(tip->fd_buf);
1c99bc21 618
9db17354
JA
619 tip->fd = -1;
620 tip->ofile = NULL;
621 tip->ofile_buffer = NULL;
622 tip->fd_buf = NULL;
a3e4d330
JA
623}
624
8e86c98a
JA
625static void tip_ftrunc_final(struct thread_information *tip)
626{
627 /*
628 * truncate to right size and cleanup mmap
629 */
c196b5f2 630 if (tip->ofile_mmap && tip->ofile) {
8e86c98a
JA
631 int ofd = fileno(tip->ofile);
632
633 if (tip->fs_buf)
634 munmap(tip->fs_buf, tip->fs_buf_len);
635
636 ftruncate(ofd, tip->fs_size);
637 }
638}
639
9db17354 640static void *thread_main(void *arg)
a3e4d330 641{
9db17354
JA
642 struct thread_information *tip = arg;
643 pid_t pid = getpid();
644 cpu_set_t cpu_mask;
a3e4d330 645
9db17354
JA
646 CPU_ZERO(&cpu_mask);
647 CPU_SET((tip->cpu), &cpu_mask);
a3e4d330 648
9db17354
JA
649 if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
650 perror("sched_setaffinity");
651 exit_trace(1);
652 }
a3e4d330 653
9db17354 654 snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
3d06efea 655 debugfs_path, tip->device->buts_name, tip->cpu);
9db17354
JA
656 tip->fd = open(tip->fn, O_RDONLY);
657 if (tip->fd < 0) {
658 perror(tip->fn);
659 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
660 tip->fn);
661 exit_trace(1);
a3e4d330
JA
662 }
663
b7106311 664 while (!is_done()) {
7035d92d 665 if (tip->get_subbuf(tip, buf_size) < 0)
0cc7d25e 666 break;
b7106311
JA
667 }
668
7035d92d
JA
669 /*
670 * trace is stopped, pull data until we get a short read
671 */
672 while (tip->get_subbuf(tip, buf_size) > 0)
673 ;
674
8e86c98a
JA
675 tip_ftrunc_final(tip);
676 tip->exited = 1;
677 return NULL;
678}
b7106311 679
8e86c98a
JA
/*
 * Send all @buf_len bytes of @buf on socket @fd, continuing after partial
 * sends.
 *
 * Returns 0 on success, 1 if send() failed.
 */
static int write_data_net(int fd, void *buf, unsigned int buf_len)
{
	const char *cursor = buf;
	unsigned int remaining;

	for (remaining = buf_len; remaining; ) {
		int sent = send(fd, cursor, remaining, 0);

		if (sent < 0) {
			perror("send");
			return 1;
		}

		cursor += sent;
		remaining -= sent;
	}

	return 0;
}
698
32f18c48 699static int net_send_header(struct thread_information *tip, unsigned int len)
8e86c98a
JA
700{
701 struct blktrace_net_hdr hdr;
8e86c98a
JA
702
703 hdr.magic = BLK_IO_TRACE_MAGIC;
22cd0c02 704 strcpy(hdr.buts_name, tip->device->buts_name);
8e86c98a 705 hdr.cpu = tip->cpu;
22cd0c02 706 hdr.max_cpus = ncpus;
32f18c48 707 hdr.len = len;
ff11d54c 708 hdr.cl_id = getpid();
8e86c98a 709
ff11d54c 710 return write_data_net(net_out_fd[tip->cpu], &hdr, sizeof(hdr));
32f18c48 711}
8e86c98a 712
6a752c90
JA
713/*
714 * send header with 0 length to signal end-of-run
715 */
716static void net_client_send_close(void)
717{
7934e668 718 struct device_information *dip;
6a752c90 719 struct blktrace_net_hdr hdr;
7934e668 720 int i;
6a752c90 721
7934e668 722 for_each_dip(dip, i) {
7ab2f837
JA
723 hdr.magic = BLK_IO_TRACE_MAGIC;
724 hdr.max_cpus = ncpus;
725 hdr.len = 0;
7934e668 726 strcpy(hdr.buts_name, dip->buts_name);
7ab2f837 727 hdr.cpu = get_dropped_count(dip->buts_name);
ff11d54c 728 hdr.cl_id = getpid();
7ab2f837 729
ff11d54c 730 write_data_net(net_out_fd[0], &hdr, sizeof(hdr));
7934e668
JA
731 }
732
6a752c90
JA
733}
734
32f18c48
JA
735static int flush_subbuf_net(struct thread_information *tip,
736 struct tip_subbuf *ts)
737{
738 if (net_send_header(tip, ts->len))
7934e668 739 return -1;
ff11d54c 740 if (write_data_net(net_out_fd[tip->cpu], ts->buf, ts->len))
7934e668 741 return -1;
8e86c98a 742
f0597a7e 743 free(ts->buf);
8e86c98a 744 free(ts);
7934e668 745 return 1;
8e86c98a
JA
746}
747
f6fead25
JA
748static int net_sendfile(struct thread_information *tip, struct tip_subbuf *ts)
749{
ff11d54c 750 int ret = sendfile(net_out_fd[tip->cpu], tip->fd, NULL, ts->len);
11629347
JA
751
752 if (ret < 0) {
753 perror("sendfile");
754 return 1;
755 } else if (ret < (int) ts->len) {
756 fprintf(stderr, "short sendfile send (%d of %d)\n", ret, ts->len);
757 return 1;
758 }
759
760 return 0;
761}
762
32f18c48
JA
763static int flush_subbuf_sendfile(struct thread_information *tip,
764 struct tip_subbuf *ts)
765{
7934e668 766 int ret = -1;
18eed2a7 767
f6fead25 768 if (net_send_header(tip, ts->len))
11629347 769 goto err;
f6fead25 770 if (net_sendfile(tip, ts))
11629347 771 goto err;
32f18c48 772
f6fead25 773 tip->data_read += ts->len;
e076d33b 774 tip->ofile_offset += buf_size;
7934e668 775 ret = 1;
11629347 776err:
32f18c48 777 free(ts);
11629347 778 return ret;
32f18c48
JA
779}
780
ff11d54c 781static int get_subbuf_sendfile(struct thread_information *tip,
d042efdf 782 __attribute__((__unused__)) unsigned int maxlen)
ff11d54c
TZ
783{
784 struct tip_subbuf *ts;
785 struct stat sb;
786 unsigned int ready;
ff11d54c 787
4aeec019 788 wait_for_data(tip, 250);
ff11d54c
TZ
789
790 if (fstat(tip->fd, &sb) < 0) {
791 perror("trace stat");
792 return -1;
793 }
4aeec019 794
ff11d54c
TZ
795 ready = sb.st_size - tip->data_queued;
796 if (!ready) {
797 usleep(1000);
798 return 0;
799 }
800
801 ts = malloc(sizeof(*ts));
802 ts->buf = NULL;
803 ts->max_len = 0;
804 ts->len = ready;
805 tip->data_queued += ready;
806
d042efdf
JA
807 if (flush_subbuf_sendfile(tip, ts) < 0)
808 return -1;
ff11d54c
TZ
809
810 return ready;
811}
812
8e86c98a
JA
813static int write_data(struct thread_information *tip, void *buf,
814 unsigned int buf_len)
8a43bac5 815{
7126171a 816 int ret;
8a43bac5 817
6480258a
JA
818 if (!buf_len)
819 return 0;
820
7126171a
JA
821 while (1) {
822 ret = fwrite(buf, buf_len, 1, tip->ofile);
007c233c 823 if (ret == 1)
8a43bac5
JA
824 break;
825
db6fe5bc
JA
826 if (ret < 0) {
827 perror("write");
828 return 1;
8a43bac5 829 }
d0ca268b
JA
830 }
831
9db17354 832 if (tip->ofile_stdout)
7126171a
JA
833 fflush(tip->ofile);
834
8a43bac5
JA
835 return 0;
836}
837
8e86c98a
JA
838static int flush_subbuf_file(struct thread_information *tip,
839 struct tip_subbuf *ts)
8a43bac5 840{
9db17354
JA
841 unsigned int offset = 0;
842 struct blk_io_trace *t;
843 int pdu_len, events = 0;
8a43bac5 844
9db17354 845 /*
7de86b12 846 * surplus from last run
9db17354 847 */
7de86b12
AB
848 if (tip->leftover_ts) {
849 struct tip_subbuf *prev_ts = tip->leftover_ts;
850
9e8b753c 851 if (prev_ts->len + ts->len > prev_ts->max_len) {
7de86b12
AB
852 prev_ts->max_len += ts->len;
853 prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
854 }
855
9e8b753c 856 memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
7de86b12
AB
857 prev_ts->len += ts->len;
858
859 free(ts->buf);
860 free(ts);
861
862 ts = prev_ts;
863 tip->leftover_ts = NULL;
9db17354 864 }
d0ca268b 865
9db17354
JA
866 while (offset + sizeof(*t) <= ts->len) {
867 t = ts->buf + offset;
3a9d6c13 868
9cfa6c2b
AB
869 if (verify_trace(t)) {
870 write_data(tip, ts->buf, offset);
9db17354 871 return -1;
9cfa6c2b 872 }
3a9d6c13 873
9db17354 874 pdu_len = t->pdu_len;
3a9d6c13 875
9db17354 876 if (offset + sizeof(*t) + pdu_len > ts->len)
3a9d6c13 877 break;
4b5db44a 878
9db17354
JA
879 offset += sizeof(*t) + pdu_len;
880 tip->events_processed++;
b7106311 881 tip->data_read += sizeof(*t) + pdu_len;
9db17354 882 events++;
3a9d6c13
JA
883 }
884
9cfa6c2b
AB
885 if (write_data(tip, ts->buf, offset))
886 return -1;
887
3a9d6c13 888 /*
9db17354 889 * leftover bytes, save them for next time
3a9d6c13 890 */
9db17354 891 if (offset != ts->len) {
7de86b12 892 tip->leftover_ts = ts;
9e8b753c
JA
893 ts->len -= offset;
894 memmove(ts->buf, ts->buf + offset, ts->len);
7de86b12
AB
895 } else {
896 free(ts->buf);
897 free(ts);
9db17354 898 }
4b5db44a 899
9db17354 900 return events;
4b5db44a
JA
901}
902
9db17354 903static int write_tip_events(struct thread_information *tip)
d5396421 904{
21f55651 905 struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
d5396421 906
0cc7d25e
JA
907 if (ts)
908 return tip->flush_subbuf(tip, ts);
91816d54 909
9db17354 910 return 0;
91816d54
JA
911}
912
9db17354
JA
913/*
914 * scans the tips we know and writes out the subbuffers we accumulate
915 */
916static void get_and_write_events(void)
d0ca268b 917{
9db17354
JA
918 struct device_information *dip;
919 struct thread_information *tip;
27223f19 920 int i, j, events, ret, tips_running;
d0ca268b 921
9db17354
JA
922 while (!is_done()) {
923 events = 0;
d0ca268b 924
9db17354
JA
925 for_each_dip(dip, i) {
926 for_each_tip(dip, tip, j) {
927 ret = write_tip_events(tip);
928 if (ret > 0)
929 events += ret;
930 }
931 }
d0ca268b 932
9db17354 933 if (!events)
7934e668 934 usleep(100000);
d0ca268b
JA
935 }
936
a3e4d330 937 /*
9db17354 938 * reap stored events
a3e4d330 939 */
9db17354
JA
940 do {
941 events = 0;
27223f19 942 tips_running = 0;
9db17354
JA
943 for_each_dip(dip, i) {
944 for_each_tip(dip, tip, j) {
945 ret = write_tip_events(tip);
946 if (ret > 0)
947 events += ret;
27223f19 948 tips_running += !tip->exited;
9db17354 949 }
69e65a9e 950 }
9db17354 951 usleep(10);
27223f19 952 } while (events || tips_running);
d0ca268b
JA
953}
954
b7106311
JA
955static void wait_for_threads(void)
956{
957 /*
8e86c98a
JA
958 * for piped or network output, poll and fetch data for writeout.
959 * for files, we just wait around for trace threads to exit
b7106311 960 */
8e86c98a 961 if ((output_name && !strcmp(output_name, "-")) ||
ff11d54c 962 ((net_mode == Net_client) && !net_use_sendfile))
b7106311
JA
963 get_and_write_events();
964 else {
965 struct device_information *dip;
966 struct thread_information *tip;
967 int i, j, tips_running;
968
969 do {
970 tips_running = 0;
7934e668 971 usleep(100000);
b7106311
JA
972
973 for_each_dip(dip, i)
974 for_each_tip(dip, tip, j)
975 tips_running += !tip->exited;
976 } while (tips_running);
977 }
6a752c90
JA
978
979 if (net_mode == Net_client)
980 net_client_send_close();
b7106311
JA
981}
982
97159c02
JA
983static int fill_ofname(struct device_information *dip,
984 struct thread_information *tip, char *dst,
e3bf54d8 985 char *buts_name)
8e86c98a 986{
e3bf54d8 987 struct stat sb;
8e86c98a
JA
988 int len = 0;
989
990 if (output_dir)
991 len = sprintf(dst, "%s/", output_dir);
dd870ef6
AB
992 else
993 len = sprintf(dst, "./");
8e86c98a 994
e3bf54d8 995 if (net_mode == Net_server) {
ff11d54c 996 struct net_connection *nc = tip->nc;
e0a1988b 997
ff11d54c
TZ
998 len += sprintf(dst + len, "%s-", inet_ntoa(nc->ch->cl_in_addr));
999 len += strftime(dst + len, 64, "%F-%T/", gmtime(&dip->cl_connect_time));
e3bf54d8
JA
1000 }
1001
1002 if (stat(dst, &sb) < 0) {
1003 if (errno != ENOENT) {
1004 perror("stat");
1005 return 1;
1006 }
1007 if (mkdir(dst, 0755) < 0) {
1008 perror(dst);
1009 fprintf(stderr, "Can't make output dir\n");
1010 return 1;
1011 }
1012 }
1013
8e86c98a 1014 if (output_name)
e3bf54d8 1015 sprintf(dst + len, "%s.blktrace.%d", output_name, tip->cpu);
8e86c98a 1016 else
e3bf54d8
JA
1017 sprintf(dst + len, "%s.blktrace.%d", buts_name, tip->cpu);
1018
1019 return 0;
8e86c98a
JA
1020}
1021
0cc7d25e
JA
1022static void fill_ops(struct thread_information *tip)
1023{
1024 /*
1025 * setup ops
1026 */
32f18c48 1027 if (net_mode == Net_client) {
36808255 1028 if (net_use_sendfile) {
32f18c48 1029 tip->get_subbuf = get_subbuf_sendfile;
ff11d54c 1030 tip->flush_subbuf = NULL;
32f18c48
JA
1031 } else {
1032 tip->get_subbuf = get_subbuf;
1033 tip->flush_subbuf = flush_subbuf_net;
1034 }
1035 } else {
1036 if (tip->ofile_mmap)
1037 tip->get_subbuf = mmap_subbuf;
1038 else
1039 tip->get_subbuf = get_subbuf;
0cc7d25e 1040
0cc7d25e 1041 tip->flush_subbuf = flush_subbuf_file;
32f18c48
JA
1042 }
1043
0cc7d25e
JA
1044 if (net_mode == Net_server)
1045 tip->read_data = read_data_net;
1046 else
1047 tip->read_data = read_data_file;
1048}
1049
ddf22842
JA
1050static int tip_open_output(struct device_information *dip,
1051 struct thread_information *tip)
d0ca268b 1052{
ddf22842 1053 int pipeline = output_name && !strcmp(output_name, "-");
8e86c98a 1054 int mode, vbuf_size;
e3bf54d8 1055 char op[128];
d0ca268b 1056
ddf22842
JA
1057 if (net_mode == Net_client) {
1058 tip->ofile = NULL;
1059 tip->ofile_stdout = 0;
1060 tip->ofile_mmap = 0;
0c0b75b4 1061 goto done;
ddf22842
JA
1062 } else if (pipeline) {
1063 tip->ofile = fdopen(STDOUT_FILENO, "w");
1064 tip->ofile_stdout = 1;
1065 tip->ofile_mmap = 0;
1066 mode = _IOLBF;
1067 vbuf_size = 512;
1068 } else {
97159c02 1069 if (fill_ofname(dip, tip, op, dip->buts_name))
e3bf54d8 1070 return 1;
ddf22842
JA
1071 tip->ofile = fopen(op, "w+");
1072 tip->ofile_stdout = 0;
1073 tip->ofile_mmap = 1;
1074 mode = _IOFBF;
1075 vbuf_size = OFILE_BUF;
1076 }
d5396421 1077
0c0b75b4 1078 if (tip->ofile == NULL) {
ddf22842
JA
1079 perror(op);
1080 return 1;
1081 }
d5396421 1082
0c0b75b4
JA
1083 tip->ofile_buffer = malloc(vbuf_size);
1084 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
1085 perror("setvbuf");
1086 close_thread(tip);
1087 return 1;
ddf22842
JA
1088 }
1089
0c0b75b4 1090done:
ddf22842
JA
1091 fill_ops(tip);
1092 return 0;
1093}
007c233c 1094
ddf22842
JA
1095static int start_threads(struct device_information *dip)
1096{
1097 struct thread_information *tip;
1098 int j;
1099
1100 for_each_tip(dip, tip, j) {
1101 tip->cpu = j;
1102 tip->device = dip;
1103 tip->events_processed = 0;
11eedd9b 1104 tip->fd = -1;
ddf22842
JA
1105 memset(&tip->fifo, 0, sizeof(tip->fifo));
1106 tip->leftover_ts = NULL;
1107
1108 if (tip_open_output(dip, tip))
1109 return 1;
0cc7d25e 1110
9db17354 1111 if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
e7c9f3ff 1112 perror("pthread_create");
007c233c 1113 close_thread(tip);
e7c9f3ff 1114 return 1;
d0ca268b
JA
1115 }
1116 }
1117
e7c9f3ff 1118 return 0;
d0ca268b
JA
1119}
1120
e7c9f3ff 1121static void stop_threads(struct device_information *dip)
3aabcd89 1122{
e7c9f3ff 1123 struct thread_information *tip;
91816d54 1124 unsigned long ret;
007c233c
JA
1125 int i;
1126
9db17354 1127 for_each_tip(dip, tip, i) {
91816d54 1128 (void) pthread_join(tip->thread, (void *) &ret);
9db17354
JA
1129 close_thread(tip);
1130 }
3aabcd89
JA
1131}
1132
e7c9f3ff 1133static void stop_all_threads(void)
72ca8801 1134{
e7c9f3ff 1135 struct device_information *dip;
72ca8801
NS
1136 int i;
1137
99c1f5ab 1138 for_each_dip(dip, i)
e7c9f3ff
NS
1139 stop_threads(dip);
1140}
1141
1142static void stop_all_tracing(void)
1143{
1144 struct device_information *dip;
91816d54 1145 int i;
007c233c 1146
91816d54 1147 for_each_dip(dip, i)
e7c9f3ff 1148 stop_trace(dip);
72ca8801
NS
1149}
1150
1151static void exit_trace(int status)
1152{
eb3c8108
JA
1153 if (!is_trace_stopped()) {
1154 trace_stopped = 1;
1155 stop_all_threads();
1156 stop_all_tracing();
1157 }
1158
72ca8801
NS
1159 exit(status);
1160}
1161
e7c9f3ff
NS
1162static int resize_devices(char *path)
1163{
1164 int size = (ndevs + 1) * sizeof(struct device_information);
1165
1166 device_information = realloc(device_information, size);
1167 if (!device_information) {
1168 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
1169 return 1;
1170 }
1171 device_information[ndevs].path = path;
1172 ndevs++;
1173 return 0;
1174}
1175
1176static int open_devices(void)
d0ca268b 1177{
e7c9f3ff 1178 struct device_information *dip;
d0ca268b 1179 int i;
d0ca268b 1180
99c1f5ab 1181 for_each_dip(dip, i) {
cf9208ea 1182 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
e7c9f3ff
NS
1183 if (dip->fd < 0) {
1184 perror(dip->path);
1185 return 1;
1186 }
1187 }
99c1f5ab 1188
e7c9f3ff
NS
1189 return 0;
1190}
1191
1192static int start_devices(void)
1193{
1194 struct device_information *dip;
1195 int i, j, size;
1196
1197 size = ncpus * sizeof(struct thread_information);
1198 thread_information = malloc(size * ndevs);
1199 if (!thread_information) {
1200 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
1201 return 1;
1202 }
d5396421 1203
99c1f5ab 1204 for_each_dip(dip, i) {
e7c9f3ff
NS
1205 if (start_trace(dip)) {
1206 close(dip->fd);
1207 fprintf(stderr, "Failed to start trace on %s\n",
1208 dip->path);
1209 break;
1210 }
1211 }
99c1f5ab 1212
e7c9f3ff 1213 if (i != ndevs) {
ce020676 1214 __for_each_dip(dip, device_information, i, j)
e7c9f3ff 1215 stop_trace(dip);
99c1f5ab 1216
e7c9f3ff
NS
1217 return 1;
1218 }
1219
99c1f5ab 1220 for_each_dip(dip, i) {
e7c9f3ff
NS
1221 dip->threads = thread_information + (i * ncpus);
1222 if (start_threads(dip)) {
1223 fprintf(stderr, "Failed to start worker threads\n");
1224 break;
1225 }
1226 }
99c1f5ab 1227
e7c9f3ff 1228 if (i != ndevs) {
ce020676 1229 __for_each_dip(dip, device_information, i, j)
e7c9f3ff 1230 stop_threads(dip);
99c1f5ab 1231 for_each_dip(dip, i)
e7c9f3ff 1232 stop_trace(dip);
99c1f5ab 1233
e7c9f3ff 1234 return 1;
d0ca268b
JA
1235 }
1236
e7c9f3ff 1237 return 0;
d0ca268b
JA
1238}
1239
e0a1988b 1240static void show_stats(struct device_information *dips, int ndips, int cpus)
e7c9f3ff 1241{
e7c9f3ff
NS
1242 struct device_information *dip;
1243 struct thread_information *tip;
b7106311 1244 unsigned long long events_processed, data_read;
eb3c8108 1245 unsigned long total_drops;
2f903295 1246 int i, j, no_stdout = 0;
eb3c8108
JA
1247
1248 if (is_stat_shown())
1249 return;
1250
2f903295
JA
1251 if (output_name && !strcmp(output_name, "-"))
1252 no_stdout = 1;
1253
eb3c8108 1254 stat_shown = 1;
428683db 1255
56070ea4 1256 total_drops = 0;
ce020676 1257 __for_each_dip(dip, dips, ndips, i) {
2f903295 1258 if (!no_stdout)
56070ea4 1259 printf("Device: %s\n", dip->path);
e7c9f3ff 1260 events_processed = 0;
b7106311 1261 data_read = 0;
ce020676 1262 __for_each_tip(dip, tip, cpus, j) {
2f903295 1263 if (!no_stdout)
b7106311
JA
1264 printf(" CPU%3d: %20lu events, %8llu KiB data\n",
1265 tip->cpu, tip->events_processed,
54824c20 1266 (tip->data_read + 1023) >> 10);
e7c9f3ff 1267 events_processed += tip->events_processed;
b7106311 1268 data_read += tip->data_read;
e7c9f3ff 1269 }
eb3c8108 1270 total_drops += dip->drop_count;
2f903295 1271 if (!no_stdout)
b7106311
JA
1272 printf(" Total: %20llu events (dropped %lu), %8llu KiB data\n",
1273 events_processed, dip->drop_count,
18d8437d 1274 (data_read + 1023) >> 10);
e7c9f3ff 1275 }
56070ea4
JA
1276
1277 if (total_drops)
1278 fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
e7c9f3ff 1279}
52724a0e 1280
e0a1988b 1281static struct device_information *net_get_dip(struct net_connection *nc,
ff11d54c 1282 char *buts_name, u32 cl_id)
8e86c98a 1283{
ff11d54c
TZ
1284 struct device_information *dip, *cl_dip = NULL;
1285 struct cl_host *ch = nc->ch;
8e86c98a
JA
1286 int i;
1287
ff11d54c
TZ
1288 for (i = 0; i < ch->ndevs; i++) {
1289 dip = &ch->device_information[i];
8e86c98a 1290
22cd0c02
JA
1291 if (!strcmp(dip->buts_name, buts_name))
1292 return dip;
ff11d54c
TZ
1293
1294 if (dip->cl_id == cl_id)
1295 cl_dip = dip;
8e86c98a
JA
1296 }
1297
ff11d54c
TZ
1298 ch->device_information = realloc(ch->device_information, (ch->ndevs + 1) * sizeof(*dip));
1299 dip = &ch->device_information[ch->ndevs];
921b05fe
JA
1300 memset(dip, 0, sizeof(*dip));
1301 dip->fd = -1;
ff11d54c
TZ
1302 dip->ch = ch;
1303 dip->cl_id = cl_id;
1304 if (cl_dip)
1305 dip->cl_connect_time = cl_dip->cl_connect_time;
1306 else
1307 dip->cl_connect_time = nc->connect_time;
22cd0c02 1308 strcpy(dip->buts_name, buts_name);
921b05fe 1309 dip->path = strdup(buts_name);
7ab2f837 1310 dip->trace_started = 1;
ff11d54c 1311 ch->ndevs++;
e0a1988b
JA
1312 dip->threads = malloc(nc->ncpus * sizeof(struct thread_information));
1313 memset(dip->threads, 0, nc->ncpus * sizeof(struct thread_information));
22cd0c02
JA
1314
1315 /*
1316 * open all files
1317 */
e0a1988b 1318 for (i = 0; i < nc->ncpus; i++) {
22cd0c02 1319 struct thread_information *tip = &dip->threads[i];
8e86c98a 1320
22cd0c02 1321 tip->cpu = i;
22cd0c02 1322 tip->device = dip;
1366e53a 1323 tip->fd = -1;
ff11d54c
TZ
1324 tip->nc = nc;
1325
ddf22842 1326 if (tip_open_output(dip, tip))
22cd0c02 1327 return NULL;
ff11d54c
TZ
1328
1329 tip->nc = NULL;
8e86c98a
JA
1330 }
1331
22cd0c02
JA
1332 return dip;
1333}
1334
e0a1988b
JA
1335static struct thread_information *net_get_tip(struct net_connection *nc,
1336 struct blktrace_net_hdr *bnh)
22cd0c02
JA
1337{
1338 struct device_information *dip;
ff11d54c 1339 struct thread_information *tip;
22cd0c02 1340
ff11d54c 1341 dip = net_get_dip(nc, bnh->buts_name, bnh->cl_id);
7ab2f837
JA
1342 if (!dip->trace_started) {
1343 fprintf(stderr, "Events for closed devices %s\n", dip->buts_name);
1344 return NULL;
1345 }
1346
ff11d54c
TZ
1347 tip = &dip->threads[bnh->cpu];
1348 if (!tip->nc)
1349 tip->nc = nc;
1350
1351 return tip;
8e86c98a
JA
1352}
1353
e0a1988b
JA
1354static int net_get_header(struct net_connection *nc,
1355 struct blktrace_net_hdr *bnh)
8e86c98a 1356{
e0a1988b 1357 int fl = fcntl(nc->in_fd, F_GETFL);
8e86c98a
JA
1358 int bytes_left, ret;
1359 void *p = bnh;
1360
e0a1988b 1361 fcntl(nc->in_fd, F_SETFL, fl | O_NONBLOCK);
8e86c98a
JA
1362 bytes_left = sizeof(*bnh);
1363 while (bytes_left && !is_done()) {
e0a1988b 1364 ret = recv(nc->in_fd, p, bytes_left, MSG_WAITALL);
8e86c98a
JA
1365 if (ret < 0) {
1366 if (errno != EAGAIN) {
1367 perror("recv header");
1368 return 1;
1369 }
7934e668 1370 usleep(1000);
8e86c98a
JA
1371 continue;
1372 } else if (!ret) {
7934e668 1373 usleep(1000);
8e86c98a
JA
1374 continue;
1375 } else {
1376 p += ret;
1377 bytes_left -= ret;
1378 }
1379 }
e0a1988b 1380 fcntl(nc->in_fd, F_SETFL, fl & ~O_NONBLOCK);
227f89ff 1381 return bytes_left;
8e86c98a
JA
1382}
1383
e0a1988b
JA
1384/*
1385 * finalize a net client: truncate files, show stats, cleanup, etc
1386 */
ff11d54c 1387static void device_done(struct net_connection *nc, struct device_information *dip)
e0a1988b 1388{
e0a1988b 1389 struct thread_information *tip;
ff11d54c 1390 int i;
e0a1988b 1391
ff11d54c
TZ
1392 __for_each_tip(dip, tip, nc->ncpus, i)
1393 tip_ftrunc_final(tip);
e0a1988b 1394
ff11d54c 1395 show_stats(dip, 1, nc->ncpus);
e0a1988b
JA
1396
1397 /*
1398 * cleanup for next run
1399 */
ff11d54c
TZ
1400 __for_each_tip(dip, tip, nc->ncpus, i) {
1401 if (tip->ofile)
1402 fclose(tip->ofile);
e0a1988b
JA
1403 }
1404
ff11d54c
TZ
1405 free(dip->threads);
1406 free(dip->path);
e0a1988b
JA
1407
1408 close(nc->in_fd);
1409 nc->in_fd = -1;
1410
ff11d54c
TZ
1411 stat_shown = 0;
1412}
e0a1988b 1413
ff11d54c
TZ
/*
 * Returns 1 if both IPv4 addresses are the same, 0 otherwise.
 */
static inline int in_addr_eq(struct in_addr a, struct in_addr b)
{
	if (a.s_addr != b.s_addr)
		return 0;

	return 1;
}
1418
1419static void net_add_client_host(struct cl_host *ch)
1420{
1421 ch->list_next = cl_host_list;
1422 cl_host_list = ch;
1423 cl_hosts++;
1424}
1425
1426static void net_remove_client_host(struct cl_host *ch)
1427{
1428 struct cl_host *p, *c;
1429
1430 for (p = c = cl_host_list; c; c = c->list_next) {
1431 if (c == ch) {
1432 if (p == c)
1433 cl_host_list = c->list_next;
1434 else
1435 p->list_next = c->list_next;
1436 cl_hosts--;
1437 return;
1438 }
1439 p = c;
e0a1988b 1440 }
ff11d54c 1441}
e0a1988b 1442
ff11d54c
TZ
1443static struct cl_host *net_find_client_host(struct in_addr cl_in_addr)
1444{
1445 struct cl_host *ch = cl_host_list;
1446
1447 while (ch) {
1448 if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
1449 return ch;
1450 ch = ch->list_next;
1451 }
1452
1453 return NULL;
1454}
1455
ff11d54c
TZ
1456static void net_client_host_done(struct cl_host *ch)
1457{
1458 free(ch->device_information);
1459 free(ch->net_connections);
1460 net_connects -= ch->nconn;
1461 net_remove_client_host(ch);
1462 free(ch);
e0a1988b
JA
1463}
1464
1465/*
1466 * handle incoming events from a net client
1467 */
1468static int net_client_data(struct net_connection *nc)
8e86c98a
JA
1469{
1470 struct thread_information *tip;
1471 struct blktrace_net_hdr bnh;
1472
e0a1988b 1473 if (net_get_header(nc, &bnh))
8e86c98a
JA
1474 return 1;
1475
1476 if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
1477 fprintf(stderr, "server: received data is bad\n");
1478 return 1;
1479 }
1480
1481 if (!data_is_native) {
227f89ff 1482 bnh.magic = be32_to_cpu(bnh.magic);
8e86c98a 1483 bnh.cpu = be32_to_cpu(bnh.cpu);
4aeec019 1484 bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
8e86c98a 1485 bnh.len = be32_to_cpu(bnh.len);
ff11d54c 1486 bnh.cl_id = be32_to_cpu(bnh.cl_id);
8e86c98a
JA
1487 }
1488
227f89ff
JA
1489 if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1490 fprintf(stderr, "server: bad data magic\n");
1491 return 1;
1492 }
1493
4aeec019
JA
1494 if (nc->ncpus == -1)
1495 nc->ncpus = bnh.max_cpus;
1496
6a752c90
JA
1497 /*
1498 * len == 0 means that the other end signalled end-of-run
1499 */
1500 if (!bnh.len) {
7934e668
JA
1501 /*
1502 * overload cpu count with dropped events
1503 */
1504 struct device_information *dip;
1505
ff11d54c 1506 dip = net_get_dip(nc, bnh.buts_name, bnh.cl_id);
7934e668 1507 dip->drop_count = bnh.cpu;
7ab2f837 1508 dip->trace_started = 0;
7934e668 1509
e0a1988b 1510 printf("server: end of run for %s\n", dip->buts_name);
4aeec019 1511
ff11d54c
TZ
1512 device_done(nc, dip);
1513
1514 if (++nc->ch->ndevs_done == nc->ch->ndevs)
1515 net_client_host_done(nc->ch);
4aeec019 1516
e0a1988b 1517 return 0;
6a752c90
JA
1518 }
1519
e0a1988b 1520 tip = net_get_tip(nc, &bnh);
8e86c98a
JA
1521 if (!tip)
1522 return 1;
1523
1524 if (mmap_subbuf(tip, bnh.len))
1525 return 1;
1526
1527 return 0;
1528}
1529
e0a1988b 1530static void net_add_connection(int listen_fd, struct sockaddr_in *addr)
659bcc3f 1531{
e0a1988b
JA
1532 socklen_t socklen = sizeof(*addr);
1533 struct net_connection *nc;
ff11d54c
TZ
1534 struct cl_host *ch;
1535 int in_fd;
659bcc3f 1536
ff11d54c
TZ
1537 in_fd = accept(listen_fd, (struct sockaddr *) addr, &socklen);
1538 if (in_fd < 0) {
1539 perror("accept");
e0a1988b 1540 return;
659bcc3f 1541 }
659bcc3f 1542
ff11d54c
TZ
1543 ch = net_find_client_host(addr->sin_addr);
1544 if (!ch) {
1545 if (cl_hosts == NET_MAX_CL_HOSTS) {
1546 fprintf(stderr, "server: no more clients allowed\n");
1547 return;
1548 }
1549 ch = malloc(sizeof(struct cl_host));
1550 memset(ch, 0, sizeof(*ch));
1551 ch->cl_in_addr = addr->sin_addr;
1552 net_add_client_host(ch);
659bcc3f
JA
1553 }
1554
ff11d54c
TZ
1555 ch->net_connections = realloc(ch->net_connections, (ch->nconn + 1) * sizeof(*nc));
1556 nc = &ch->net_connections[ch->nconn++];
1557 memset(nc, 0, sizeof(*nc));
1558
e0a1988b
JA
1559 printf("server: connection from %s\n", inet_ntoa(addr->sin_addr));
1560 time(&nc->connect_time);
ff11d54c
TZ
1561 nc->ch = ch;
1562 nc->in_fd = in_fd;
4aeec019 1563 nc->ncpus = -1;
e0a1988b
JA
1564 net_connects++;
1565}
1566
1567/*
1568 * event driven loop, handle new incoming connections and data from
1569 * existing connections
1570 */
1571static void net_server_handle_connections(int listen_fd,
1572 struct sockaddr_in *addr)
1573{
ff11d54c
TZ
1574 struct pollfd *pfds = NULL;
1575 struct net_connection **ncs = NULL;
1576 int max_connects = 0;
1577 int i, nconns, events;
1578 struct cl_host *ch;
1579 struct net_connection *nc;
1580
e0a1988b
JA
1581 printf("server: waiting for connections...\n");
1582
1583 while (!is_done()) {
ff11d54c
TZ
1584 if (net_connects >= max_connects) {
1585 pfds = realloc(pfds, (net_connects + 1) * sizeof(*pfds));
1586 ncs = realloc(ncs, (net_connects + 1) * sizeof(*ncs));
1587 max_connects = net_connects + 1;
1588 }
e0a1988b
JA
1589 /*
1590 * the zero entry is for incoming connections, remaining
1591 * entries for clients
1592 */
1593 pfds[0].fd = listen_fd;
1594 pfds[0].events = POLLIN;
ff11d54c
TZ
1595 nconns = 0;
1596 for_each_cl_host(ch) {
1597 for (i = 0; i < ch->nconn; i++) {
1598 nc = &ch->net_connections[i];
1599 pfds[nconns + 1].fd = nc->in_fd;
1600 pfds[nconns + 1].events = POLLIN;
1601 ncs[nconns++] = nc;
1602 }
e0a1988b
JA
1603 }
1604
ff11d54c 1605 events = poll(pfds, 1 + nconns, -1);
e0a1988b
JA
1606 if (events < 0) {
1607 if (errno == EINTR)
1608 continue;
1609
1610 perror("poll");
1611 break;
1612 } else if (!events)
1613 continue;
1614
1615 if (pfds[0].revents & POLLIN) {
1616 net_add_connection(listen_fd, addr);
1617 events--;
1618 }
1619
ff11d54c 1620 for (i = 0; events && i < nconns; i++) {
e0a1988b 1621 if (pfds[i + 1].revents & POLLIN) {
ff11d54c 1622 net_client_data(ncs[i]);
e0a1988b
JA
1623 events--;
1624 }
1625 }
1626 }
659bcc3f
JA
1627}
1628
8e86c98a
JA
1629/*
1630 * Start here when we are in server mode - just fetch data from the network
1631 * and dump to files
1632 */
1633static int net_server(void)
1634{
1635 struct sockaddr_in addr;
e0a1988b 1636 int fd, opt;
8e86c98a
JA
1637
1638 fd = socket(AF_INET, SOCK_STREAM, 0);
1639 if (fd < 0) {
1640 perror("server: socket");
1641 return 1;
1642 }
1643
1644 opt = 1;
1645 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
1646 perror("setsockopt");
1647 return 1;
1648 }
1649
1650 memset(&addr, 0, sizeof(addr));
1651 addr.sin_family = AF_INET;
1652 addr.sin_addr.s_addr = htonl(INADDR_ANY);
1653 addr.sin_port = htons(net_port);
1654
1655 if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
1656 perror("bind");
1657 return 1;
1658 }
1659
1660 if (listen(fd, 1) < 0) {
1661 perror("listen");
1662 return 1;
1663 }
1664
e0a1988b
JA
1665 net_server_handle_connections(fd, &addr);
1666 return 0;
8e86c98a
JA
1667}
1668
1669/*
1670 * Setup outgoing network connection where we will transmit data
1671 */
ff11d54c 1672static int net_setup_client_cpu(int i, struct sockaddr_in *addr)
8e86c98a 1673{
8e86c98a
JA
1674 int fd;
1675
1676 fd = socket(AF_INET, SOCK_STREAM, 0);
1677 if (fd < 0) {
1678 perror("client: socket");
1679 return 1;
1680 }
1681
ff11d54c
TZ
1682 if (connect(fd, (struct sockaddr *) addr, sizeof(*addr)) < 0) {
1683 perror("client: connect");
1684 return 1;
1685 }
1686
1687 net_out_fd[i] = fd;
1688 return 0;
1689}
1690
1691static int net_setup_client(void)
1692{
1693 struct sockaddr_in addr;
1694 int i;
1695
8e86c98a
JA
1696 memset(&addr, 0, sizeof(addr));
1697 addr.sin_family = AF_INET;
1698 addr.sin_port = htons(net_port);
1699
1700 if (inet_aton(hostname, &addr.sin_addr) != 1) {
1701 struct hostent *hent = gethostbyname(hostname);
1702 if (!hent) {
1703 perror("gethostbyname");
1704 return 1;
1705 }
1706
1707 memcpy(&addr.sin_addr, hent->h_addr, 4);
1708 strcpy(hostname, hent->h_name);
1709 }
1710
1711 printf("blktrace: connecting to %s\n", hostname);
1712
ff11d54c
TZ
1713 net_out_fd = malloc(ncpus * sizeof(*net_out_fd));
1714 for (i = 0; i < ncpus; i++) {
1715 if (net_setup_client_cpu(i, &addr))
1716 return 1;
8e86c98a
JA
1717 }
1718
1719 printf("blktrace: connected!\n");
ff11d54c 1720
8e86c98a
JA
1721 return 0;
1722}
1723
52724a0e 1724static char usage_str[] = \
3d06efea 1725 "-d <dev> [ -r debugfs path ] [ -o <output> ] [-k ] [ -w time ]\n" \
52724a0e
JA
1726 "[ -a action ] [ -A action mask ] [ -v ]\n\n" \
1727 "\t-d Use specified device. May also be given last after options\n" \
3d06efea 1728 "\t-r Path to mounted debugfs, defaults to /debug\n" \
52724a0e 1729 "\t-o File(s) to send output to\n" \
d1d7f15f 1730 "\t-D Directory to prepend to output file names\n" \
52724a0e
JA
1731 "\t-k Kill a running trace\n" \
1732 "\t-w Stop after defined time, in seconds\n" \
1733 "\t-a Only trace specified actions. See documentation\n" \
1734 "\t-A Give trace mask as a single value. See documentation\n" \
129aa440
JA
1735 "\t-b Sub buffer size in KiB\n" \
1736 "\t-n Number of sub buffers\n" \
f531b94d
JA
1737 "\t-l Run in network listen mode (blktrace server)\n" \
1738 "\t-h Run in network client mode, connecting to the given host\n" \
1739 "\t-p Network port to use (default 8462)\n" \
ff11d54c 1740 "\t-s Make the network client NOT use sendfile() to transfer data\n" \
f531b94d 1741 "\t-V Print program version info\n\n";
52724a0e 1742
ee1f4158
NS
1743static void show_usage(char *program)
1744{
52724a0e 1745 fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
ee1f4158 1746}
d0ca268b
JA
1747
1748int main(int argc, char *argv[])
1749{
3d06efea 1750 static char default_debugfs_path[] = "/debug";
e3e74029 1751 struct statfs st;
d39c04ca 1752 int i, c;
ece238a6 1753 int stop_watch = 0;
d39c04ca
AB
1754 int act_mask_tmp = 0;
1755
1756 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
1757 switch (c) {
1758 case 'a':
1759 i = find_mask_map(optarg);
1760 if (i < 0) {
ab197ca7 1761 fprintf(stderr,"Invalid action mask %s\n",
d39c04ca 1762 optarg);
7425d456 1763 return 1;
d39c04ca
AB
1764 }
1765 act_mask_tmp |= i;
1766 break;
1767
1768 case 'A':
98f8386b
AB
1769 if ((sscanf(optarg, "%x", &i) != 1) ||
1770 !valid_act_opt(i)) {
d39c04ca 1771 fprintf(stderr,
ab197ca7 1772 "Invalid set action mask %s/0x%x\n",
d39c04ca 1773 optarg, i);
7425d456 1774 return 1;
d39c04ca
AB
1775 }
1776 act_mask_tmp = i;
1777 break;
d0ca268b 1778
d39c04ca 1779 case 'd':
e7c9f3ff
NS
1780 if (resize_devices(optarg) != 0)
1781 return 1;
d39c04ca
AB
1782 break;
1783
5270dddd 1784 case 'r':
3d06efea 1785 debugfs_path = optarg;
5270dddd
JA
1786 break;
1787
d5396421 1788 case 'o':
66efebf8 1789 output_name = optarg;
d5396421 1790 break;
bc39777c
JA
1791 case 'k':
1792 kill_running_trace = 1;
1793 break;
ece238a6
NS
1794 case 'w':
1795 stop_watch = atoi(optarg);
1796 if (stop_watch <= 0) {
1797 fprintf(stderr,
1798 "Invalid stopwatch value (%d secs)\n",
1799 stop_watch);
1800 return 1;
1801 }
1802 break;
57ea8602 1803 case 'V':
52724a0e
JA
1804 printf("%s version %s\n", argv[0], blktrace_version);
1805 return 0;
129aa440 1806 case 'b':
eb3c8108 1807 buf_size = strtoul(optarg, NULL, 10);
183a0855 1808 if (buf_size <= 0 || buf_size > 16*1024) {
129aa440 1809 fprintf(stderr,
eb3c8108 1810 "Invalid buffer size (%lu)\n",buf_size);
129aa440
JA
1811 return 1;
1812 }
1813 buf_size <<= 10;
1814 break;
1815 case 'n':
eb3c8108 1816 buf_nr = strtoul(optarg, NULL, 10);
129aa440
JA
1817 if (buf_nr <= 0) {
1818 fprintf(stderr,
eb3c8108 1819 "Invalid buffer nr (%lu)\n", buf_nr);
129aa440
JA
1820 return 1;
1821 }
1822 break;
d1d7f15f
JA
1823 case 'D':
1824 output_dir = optarg;
1825 break;
8e86c98a
JA
1826 case 'h':
1827 net_mode = Net_client;
1828 strcpy(hostname, optarg);
1829 break;
1830 case 'l':
1831 net_mode = Net_server;
1832 break;
1833 case 'p':
1834 net_port = atoi(optarg);
1835 break;
32f18c48 1836 case 's':
79971f43 1837 net_use_sendfile = 0;
32f18c48 1838 break;
d39c04ca 1839 default:
ee1f4158 1840 show_usage(argv[0]);
7425d456 1841 return 1;
d39c04ca
AB
1842 }
1843 }
1844
8e86c98a
JA
1845 setlocale(LC_NUMERIC, "en_US");
1846
1847 page_size = getpagesize();
1848
1849 if (net_mode == Net_server)
1850 return net_server();
1851
22cd0c02
JA
1852 while (optind < argc) {
1853 if (resize_devices(argv[optind++]) != 0)
1854 return 1;
1855 }
1856
e7c9f3ff 1857 if (ndevs == 0) {
ee1f4158 1858 show_usage(argv[0]);
7425d456 1859 return 1;
d39c04ca
AB
1860 }
1861
d5396421 1862 if (act_mask_tmp != 0)
d39c04ca 1863 act_mask = act_mask_tmp;
d0ca268b 1864
3d06efea
JA
1865 if (!debugfs_path)
1866 debugfs_path = default_debugfs_path;
1867
1868 if (statfs(debugfs_path, &st) < 0) {
e3e74029
NS
1869 perror("statfs");
1870 fprintf(stderr,"%s does not appear to be a valid path\n",
3d06efea 1871 debugfs_path);
e3e74029 1872 return 1;
3d06efea
JA
1873 } else if (st.f_type != (long) DEBUGFS_TYPE) {
1874 fprintf(stderr,"%s does not appear to be a debug filesystem\n",
1875 debugfs_path);
7425d456 1876 return 1;
d0ca268b
JA
1877 }
1878
e7c9f3ff 1879 if (open_devices() != 0)
7425d456 1880 return 1;
bc39777c
JA
1881
1882 if (kill_running_trace) {
e7c9f3ff 1883 stop_all_traces();
7425d456 1884 return 0;
bc39777c
JA
1885 }
1886
e7c9f3ff
NS
1887 ncpus = sysconf(_SC_NPROCESSORS_ONLN);
1888 if (ncpus < 0) {
1889 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
7425d456 1890 return 1;
d0ca268b
JA
1891 }
1892
d0ca268b
JA
1893 signal(SIGINT, handle_sigint);
1894 signal(SIGHUP, handle_sigint);
1895 signal(SIGTERM, handle_sigint);
ece238a6 1896 signal(SIGALRM, handle_sigint);
d0ca268b 1897
8e86c98a
JA
1898 if (net_mode == Net_client && net_setup_client())
1899 return 1;
1900
1901 if (start_devices() != 0)
1902 return 1;
1903
e7c9f3ff 1904 atexit(stop_all_tracing);
830fd65c 1905
ece238a6
NS
1906 if (stop_watch)
1907 alarm(stop_watch);
1908
b7106311 1909 wait_for_threads();
d0ca268b 1910
eb3c8108
JA
1911 if (!is_trace_stopped()) {
1912 trace_stopped = 1;
91816d54
JA
1913 stop_all_threads();
1914 stop_all_traces();
91816d54 1915 }
d0ca268b 1916
e0a1988b 1917 show_stats(device_information, ndevs, ncpus);
eb3c8108 1918
d0ca268b
JA
1919 return 0;
1920}
1921