Commit | Line | Data |
---|---|---|
fc44ef5a | 1 | // SPDX-License-Identifier: GPL-2.0-only |
192dc405 ED |
2 | /* |
3 | * Copyright 2018 Google Inc. | |
4 | * Author: Eric Dumazet (edumazet@google.com) | |
5 | * | |
6 | * Reference program demonstrating tcp mmap() usage, | |
7 | * and SO_RCVLOWAT hints for receiver. | |
8 | * | |
9 | * Note : NIC with header split is needed to use mmap() on TCP : | |
10 | * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload. | |
11 | * | |
12 | * How to use on loopback interface : | |
13 | * | |
14 | * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header) | |
15 | * tcp_mmap -s -z & | |
16 | * tcp_mmap -H ::1 -z | |
17 | * | |
18 | * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12) | |
19 | * (4096 : page size on x86, 12: TCP TS option length) | |
20 | * tcp_mmap -s -z -M $((4096+12)) & | |
21 | * tcp_mmap -H ::1 -z -M $((4096+12)) | |
22 | * | |
23 | * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface. | |
24 | * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;) | |
25 | * | |
26 | * $ ./tcp_mmap -s & # Without mmap() | |
27 | * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done | |
28 | * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit | |
29 | * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches | |
30 | * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit | |
31 | * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches | |
32 | * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit | |
33 | * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches | |
34 | * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit | |
35 | * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches | |
36 | * $ kill %1 # kill tcp_mmap server | |
37 | * | |
38 | * $ ./tcp_mmap -s -z & # With mmap() | |
39 | * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done | |
40 | * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit | |
41 | * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches | |
42 | * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit | |
43 | * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches | |
44 | * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit | |
45 | * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches | |
46 | * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit | |
47 | * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches | |
192dc405 ED |
48 | */ |
49 | #define _GNU_SOURCE | |
50 | #include <pthread.h> | |
51 | #include <sys/types.h> | |
52 | #include <fcntl.h> | |
53 | #include <error.h> | |
54 | #include <sys/socket.h> | |
55 | #include <sys/mman.h> | |
56 | #include <sys/resource.h> | |
57 | #include <unistd.h> | |
58 | #include <string.h> | |
59 | #include <stdlib.h> | |
60 | #include <stdio.h> | |
61 | #include <errno.h> | |
62 | #include <time.h> | |
63 | #include <sys/time.h> | |
64 | #include <netinet/in.h> | |
192dc405 ED |
65 | #include <arpa/inet.h> |
66 | #include <poll.h> | |
aacb0c2e ED |
67 | #include <linux/tcp.h> |
68 | #include <assert.h> | |
192dc405 ED |
69 | |
70 | #ifndef MSG_ZEROCOPY | |
71 | #define MSG_ZEROCOPY 0x4000000 /* fallback definition when the included headers do not provide it */ | |
72 | #endif | |
73 | ||
74 | #define FILE_SZ (1UL << 35) /* number of bytes the client sends before closing (2^35) */ | |
75 | static int cfg_family = AF_INET6; /* address family, selected with -4 / -6 */ | |
76 | static socklen_t cfg_alen = sizeof(struct sockaddr_in6); /* sockaddr length matching cfg_family */ | |
77 | static int cfg_port = 8787; /* TCP port, can be set with -p <integer> option */ | |
78 | ||
79 | static int rcvbuf; /* Default: autotuning. Can be set with -r <integer> option */ | |
80 | static int sndbuf; /* Default: autotuning. Can be set with -w <integer> option */ | |
81 | static int zflg; /* zero copy option (-z): MSG_ZEROCOPY for sender, mmap() for receiver */ | |
82 | static int xflg; /* hash received data (simple xor) (-x option) */ | |
83 | static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */ | |
84 | ||
85 | static int chunk_size = 512*1024; /* bytes per send()/read() call; also the receiver mmap() length and SO_RCVLOWAT value */ | |
86 | ||
/* Running XOR checksum of every byte passed to hash_zone(). */
unsigned long htotal;

/*
 * Issue a prefetch hint for the cache line containing @x.
 * Only implemented on x86_64; compiles to nothing elsewhere.
 * The address is never dereferenced by the program itself.
 */
static inline void prefetch(const void *x)
{
#if defined(__x86_64__)
	/* __asm__ (not asm) so the file also builds in strict ISO C modes */
	__asm__ __volatile__("prefetcht0 %P0" : : "m" (*(const char *)x));
#endif
}

/*
 * XOR-fold @length bytes starting at @zone into the global htotal.
 *
 * The bulk of the data is consumed in groups of eight native words
 * (each word XORed into the accumulator); any remainder shorter than
 * eight words is XORed in one byte at a time into the low bits.
 *
 * @zone may have any alignment: words are loaded with memcpy() rather
 * than through a cast pointer, so callers can pass an offset into a
 * byte buffer without triggering misaligned or aliasing-violating
 * accesses.
 */
void hash_zone(void *zone, unsigned int length)
{
	const unsigned char *cursor = zone;
	unsigned long temp = htotal;
	unsigned long word;
	unsigned int i;

	while (length >= 8*sizeof(long)) {
		/* hint for data a few cache lines ahead of the cursor */
		prefetch(cursor + 384);
		for (i = 0; i < 8; i++) {
			memcpy(&word, cursor + i*sizeof(word), sizeof(word));
			temp ^= word;
		}
		cursor += 8*sizeof(long);
		length -= 8*sizeof(long);
	}
	while (length >= 1) {
		temp ^= *cursor++;
		length--;
	}
	htotal = temp;
}
120 | ||
121 | void *child_thread(void *arg) | |
122 | { | |
123 | unsigned long total_mmap = 0, total = 0; | |
aacb0c2e | 124 | struct tcp_zerocopy_receive zc; |
192dc405 ED |
125 | unsigned long delta_usec; |
126 | int flags = MAP_SHARED; | |
127 | struct timeval t0, t1; | |
128 | char *buffer = NULL; | |
aacb0c2e | 129 | void *addr = NULL; |
192dc405 ED |
130 | double throughput; |
131 | struct rusage ru; | |
132 | int lu, fd; | |
133 | ||
134 | fd = (int)(unsigned long)arg; | |
135 | ||
136 | gettimeofday(&t0, NULL); | |
137 | ||
138 | fcntl(fd, F_SETFL, O_NDELAY); | |
139 | buffer = malloc(chunk_size); | |
140 | if (!buffer) { | |
141 | perror("malloc"); | |
142 | goto error; | |
143 | } | |
aacb0c2e ED |
144 | if (zflg) { |
145 | addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0); | |
146 | if (addr == (void *)-1) | |
147 | zflg = 0; | |
148 | } | |
192dc405 ED |
149 | while (1) { |
150 | struct pollfd pfd = { .fd = fd, .events = POLLIN, }; | |
151 | int sub; | |
152 | ||
153 | poll(&pfd, 1, 10000); | |
154 | if (zflg) { | |
aacb0c2e ED |
155 | socklen_t zc_len = sizeof(zc); |
156 | int res; | |
157 | ||
158 | zc.address = (__u64)addr; | |
159 | zc.length = chunk_size; | |
160 | zc.recv_skip_hint = 0; | |
161 | res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, | |
162 | &zc, &zc_len); | |
163 | if (res == -1) | |
192dc405 | 164 | break; |
aacb0c2e ED |
165 | |
166 | if (zc.length) { | |
167 | assert(zc.length <= chunk_size); | |
168 | total_mmap += zc.length; | |
169 | if (xflg) | |
170 | hash_zone(addr, zc.length); | |
171 | total += zc.length; | |
192dc405 | 172 | } |
aacb0c2e ED |
173 | if (zc.recv_skip_hint) { |
174 | assert(zc.recv_skip_hint <= chunk_size); | |
175 | lu = read(fd, buffer, zc.recv_skip_hint); | |
176 | if (lu > 0) { | |
177 | if (xflg) | |
178 | hash_zone(buffer, lu); | |
179 | total += lu; | |
180 | } | |
192dc405 ED |
181 | } |
182 | continue; | |
183 | } | |
192dc405 ED |
184 | sub = 0; |
185 | while (sub < chunk_size) { | |
186 | lu = read(fd, buffer + sub, chunk_size - sub); | |
187 | if (lu == 0) | |
188 | goto end; | |
189 | if (lu < 0) | |
190 | break; | |
191 | if (xflg) | |
192 | hash_zone(buffer + sub, lu); | |
193 | total += lu; | |
194 | sub += lu; | |
195 | } | |
196 | } | |
197 | end: | |
198 | gettimeofday(&t1, NULL); | |
199 | delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; | |
200 | ||
201 | throughput = 0; | |
202 | if (delta_usec) | |
203 | throughput = total * 8.0 / (double)delta_usec / 1000.0; | |
204 | getrusage(RUSAGE_THREAD, &ru); | |
205 | if (total > 1024*1024) { | |
206 | unsigned long total_usec; | |
207 | unsigned long mb = total >> 20; | |
208 | total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec + | |
209 | 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec; | |
210 | printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n" | |
211 | " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n", | |
212 | total / (1024.0 * 1024.0), | |
213 | 100.0*total_mmap/total, | |
214 | (double)delta_usec / 1000000.0, | |
215 | throughput, | |
216 | (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0, | |
217 | (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0, | |
218 | (double)total_usec/mb, | |
219 | ru.ru_nvcsw); | |
220 | } | |
221 | error: | |
222 | free(buffer); | |
223 | close(fd); | |
aacb0c2e ED |
224 | if (zflg) |
225 | munmap(addr, chunk_size); | |
192dc405 ED |
226 | pthread_exit(0); |
227 | } | |
228 | ||
229 | static void apply_rcvsnd_buf(int fd) | |
230 | { | |
231 | if (rcvbuf && setsockopt(fd, SOL_SOCKET, | |
232 | SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) { | |
233 | perror("setsockopt SO_RCVBUF"); | |
234 | } | |
235 | ||
236 | if (sndbuf && setsockopt(fd, SOL_SOCKET, | |
237 | SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) { | |
238 | perror("setsockopt SO_SNDBUF"); | |
239 | } | |
240 | } | |
241 | ||
242 | ||
243 | static void setup_sockaddr(int domain, const char *str_addr, | |
244 | struct sockaddr_storage *sockaddr) | |
245 | { | |
246 | struct sockaddr_in6 *addr6 = (void *) sockaddr; | |
247 | struct sockaddr_in *addr4 = (void *) sockaddr; | |
248 | ||
249 | switch (domain) { | |
250 | case PF_INET: | |
251 | memset(addr4, 0, sizeof(*addr4)); | |
252 | addr4->sin_family = AF_INET; | |
253 | addr4->sin_port = htons(cfg_port); | |
254 | if (str_addr && | |
255 | inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) | |
256 | error(1, 0, "ipv4 parse error: %s", str_addr); | |
257 | break; | |
258 | case PF_INET6: | |
259 | memset(addr6, 0, sizeof(*addr6)); | |
260 | addr6->sin6_family = AF_INET6; | |
261 | addr6->sin6_port = htons(cfg_port); | |
262 | if (str_addr && | |
263 | inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) | |
264 | error(1, 0, "ipv6 parse error: %s", str_addr); | |
265 | break; | |
266 | default: | |
267 | error(1, 0, "illegal domain"); | |
268 | } | |
269 | } | |
270 | ||
271 | static void do_accept(int fdlisten) | |
272 | { | |
20021578 ED |
273 | pthread_attr_t attr; |
274 | ||
275 | pthread_attr_init(&attr); | |
276 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); | |
277 | ||
192dc405 ED |
278 | if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, |
279 | &chunk_size, sizeof(chunk_size)) == -1) { | |
280 | perror("setsockopt SO_RCVLOWAT"); | |
281 | } | |
282 | ||
283 | apply_rcvsnd_buf(fdlisten); | |
284 | ||
285 | while (1) { | |
286 | struct sockaddr_in addr; | |
287 | socklen_t addrlen = sizeof(addr); | |
288 | pthread_t th; | |
289 | int fd, res; | |
290 | ||
291 | fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen); | |
292 | if (fd == -1) { | |
293 | perror("accept"); | |
294 | continue; | |
295 | } | |
20021578 | 296 | res = pthread_create(&th, &attr, child_thread, |
192dc405 ED |
297 | (void *)(unsigned long)fd); |
298 | if (res) { | |
299 | errno = res; | |
300 | perror("pthread_create"); | |
301 | close(fd); | |
302 | } | |
303 | } | |
304 | } | |
305 | ||
306 | int main(int argc, char *argv[]) | |
307 | { | |
308 | struct sockaddr_storage listenaddr, addr; | |
309 | unsigned int max_pacing_rate = 0; | |
310 | unsigned long total = 0; | |
311 | char *host = NULL; | |
312 | int fd, c, on = 1; | |
313 | char *buffer; | |
314 | int sflg = 0; | |
315 | int mss = 0; | |
316 | ||
317 | while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) { | |
318 | switch (c) { | |
319 | case '4': | |
320 | cfg_family = PF_INET; | |
321 | cfg_alen = sizeof(struct sockaddr_in); | |
322 | break; | |
323 | case '6': | |
324 | cfg_family = PF_INET6; | |
325 | cfg_alen = sizeof(struct sockaddr_in6); | |
326 | break; | |
327 | case 'p': | |
328 | cfg_port = atoi(optarg); | |
329 | break; | |
330 | case 'H': | |
331 | host = optarg; | |
332 | break; | |
333 | case 's': /* server : listen for incoming connections */ | |
334 | sflg++; | |
335 | break; | |
336 | case 'r': | |
337 | rcvbuf = atoi(optarg); | |
338 | break; | |
339 | case 'w': | |
340 | sndbuf = atoi(optarg); | |
341 | break; | |
342 | case 'z': | |
343 | zflg = 1; | |
344 | break; | |
345 | case 'M': | |
346 | mss = atoi(optarg); | |
347 | break; | |
348 | case 'x': | |
349 | xflg = 1; | |
350 | break; | |
351 | case 'k': | |
352 | keepflag = 1; | |
353 | break; | |
354 | case 'P': | |
355 | max_pacing_rate = atoi(optarg) ; | |
356 | break; | |
357 | default: | |
358 | exit(1); | |
359 | } | |
360 | } | |
361 | if (sflg) { | |
362 | int fdlisten = socket(cfg_family, SOCK_STREAM, 0); | |
363 | ||
364 | if (fdlisten == -1) { | |
365 | perror("socket"); | |
366 | exit(1); | |
367 | } | |
368 | apply_rcvsnd_buf(fdlisten); | |
369 | setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); | |
370 | ||
371 | setup_sockaddr(cfg_family, host, &listenaddr); | |
372 | ||
373 | if (mss && | |
aacb0c2e ED |
374 | setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG, |
375 | &mss, sizeof(mss)) == -1) { | |
192dc405 ED |
376 | perror("setsockopt TCP_MAXSEG"); |
377 | exit(1); | |
378 | } | |
379 | if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) { | |
380 | perror("bind"); | |
381 | exit(1); | |
382 | } | |
383 | if (listen(fdlisten, 128) == -1) { | |
384 | perror("listen"); | |
385 | exit(1); | |
386 | } | |
387 | do_accept(fdlisten); | |
388 | } | |
389 | buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, | |
390 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | |
391 | if (buffer == (char *)-1) { | |
392 | perror("mmap"); | |
393 | exit(1); | |
394 | } | |
395 | ||
258fe208 | 396 | fd = socket(cfg_family, SOCK_STREAM, 0); |
192dc405 ED |
397 | if (fd == -1) { |
398 | perror("socket"); | |
399 | exit(1); | |
400 | } | |
401 | apply_rcvsnd_buf(fd); | |
402 | ||
403 | setup_sockaddr(cfg_family, host, &addr); | |
404 | ||
405 | if (mss && | |
aacb0c2e | 406 | setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) { |
192dc405 ED |
407 | perror("setsockopt TCP_MAXSEG"); |
408 | exit(1); | |
409 | } | |
410 | if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) { | |
411 | perror("connect"); | |
412 | exit(1); | |
413 | } | |
414 | if (max_pacing_rate && | |
415 | setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, | |
416 | &max_pacing_rate, sizeof(max_pacing_rate)) == -1) | |
417 | perror("setsockopt SO_MAX_PACING_RATE"); | |
418 | ||
419 | if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, | |
420 | &on, sizeof(on)) == -1) { | |
421 | perror("setsockopt SO_ZEROCOPY, (-z option disabled)"); | |
422 | zflg = 0; | |
423 | } | |
424 | while (total < FILE_SZ) { | |
425 | long wr = FILE_SZ - total; | |
426 | ||
427 | if (wr > chunk_size) | |
428 | wr = chunk_size; | |
429 | /* Note : we just want to fill the pipe with 0 bytes */ | |
430 | wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0); | |
431 | if (wr <= 0) | |
432 | break; | |
433 | total += wr; | |
434 | } | |
435 | close(fd); | |
436 | munmap(buffer, chunk_size); | |
437 | return 0; | |
438 | } |