Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
3d14c5d2 | 2 | #include <linux/ceph/ceph_debug.h> |
31b8006e SW |
3 | |
4 | #include <linux/crc32c.h> | |
5 | #include <linux/ctype.h> | |
6 | #include <linux/highmem.h> | |
7 | #include <linux/inet.h> | |
8 | #include <linux/kthread.h> | |
9 | #include <linux/net.h> | |
757856d2 | 10 | #include <linux/nsproxy.h> |
633ee407 | 11 | #include <linux/sched/mm.h> |
5a0e3ad6 | 12 | #include <linux/slab.h> |
31b8006e SW |
13 | #include <linux/socket.h> |
14 | #include <linux/string.h> | |
3ebc21f7 | 15 | #ifdef CONFIG_BLOCK |
68b4476b | 16 | #include <linux/bio.h> |
3ebc21f7 | 17 | #endif /* CONFIG_BLOCK */ |
ee3b56f2 | 18 | #include <linux/dns_resolver.h> |
31b8006e | 19 | #include <net/tcp.h> |
40e0b090 | 20 | #include <trace/events/sock.h> |
31b8006e | 21 | |
2b3e0c90 | 22 | #include <linux/ceph/ceph_features.h> |
3d14c5d2 YS |
23 | #include <linux/ceph/libceph.h> |
24 | #include <linux/ceph/messenger.h> | |
25 | #include <linux/ceph/decode.h> | |
26 | #include <linux/ceph/pagelist.h> | |
bc3b2d7f | 27 | #include <linux/export.h> |
31b8006e SW |
28 | |
29 | /* | |
30 | * Ceph uses the messenger to exchange ceph_msg messages with other | |
31 | * hosts in the system. The messenger provides ordered and reliable | |
32 | * delivery. We tolerate TCP disconnects by reconnecting (with | |
33 | * exponential backoff) in the case of a fault (disconnection, bad | |
34 | * crc, protocol error). Acks allow sent messages to be discarded by | |
35 | * the sender. | |
36 | */ | |
37 | ||
bc18f4b1 AE |
38 | /* |
39 | * We track the state of the socket on a given connection using | |
40 | * values defined below. The transition to a new socket state is | |
41 | * handled by a function which verifies we aren't coming from an | |
42 | * unexpected state. | |
43 | * | |
44 | * -------- | |
45 | * | NEW* | transient initial state | |
46 | * -------- | |
47 | * | con_sock_state_init() | |
48 | * v | |
49 | * ---------- | |
50 | * | CLOSED | initialized, but no socket (and no | |
51 | * ---------- TCP connection) | |
52 | * ^ \ | |
53 | * | \ con_sock_state_connecting() | |
54 | * | ---------------------- | |
55 | * | \ | |
56 | * + con_sock_state_closed() \ | |
fbb85a47 SW |
57 | * |+--------------------------- \ |
58 | * | \ \ \ | |
59 | * | ----------- \ \ | |
60 | * | | CLOSING | socket event; \ \ | |
61 | * | ----------- await close \ \ | |
62 | * | ^ \ | | |
63 | * | | \ | | |
64 | * | + con_sock_state_closing() \ | | |
65 | * | / \ | | | |
66 | * | / --------------- | | | |
67 | * | / \ v v | |
bc18f4b1 AE |
68 | * | / -------------- |
69 | * | / -----------------| CONNECTING | socket created, TCP | |
70 | * | | / -------------- connect initiated | |
71 | * | | | con_sock_state_connected() | |
72 | * | | v | |
73 | * ------------- | |
74 | * | CONNECTED | TCP connection established | |
75 | * ------------- | |
76 | * | |
77 | * State values for ceph_connection->sock_state; NEW is assumed to be 0. | |
78 | */ | |
ce2c8903 AE |
79 | |
80 | #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ | |
81 | #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ | |
82 | #define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ | |
83 | #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ | |
84 | #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ | |
85 | ||
c9ffc77a AE |
86 | static bool con_flag_valid(unsigned long con_flag) |
87 | { | |
88 | switch (con_flag) { | |
3fefd43e ID |
89 | case CEPH_CON_F_LOSSYTX: |
90 | case CEPH_CON_F_KEEPALIVE_PENDING: | |
91 | case CEPH_CON_F_WRITE_PENDING: | |
92 | case CEPH_CON_F_SOCK_CLOSED: | |
93 | case CEPH_CON_F_BACKOFF: | |
c9ffc77a AE |
94 | return true; |
95 | default: | |
96 | return false; | |
97 | } | |
98 | } | |
99 | ||
6503e0b6 | 100 | void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) |
c9ffc77a AE |
101 | { |
102 | BUG_ON(!con_flag_valid(con_flag)); | |
103 | ||
104 | clear_bit(con_flag, &con->flags); | |
105 | } | |
106 | ||
6503e0b6 | 107 | void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) |
c9ffc77a AE |
108 | { |
109 | BUG_ON(!con_flag_valid(con_flag)); | |
110 | ||
111 | set_bit(con_flag, &con->flags); | |
112 | } | |
113 | ||
6503e0b6 | 114 | bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) |
c9ffc77a AE |
115 | { |
116 | BUG_ON(!con_flag_valid(con_flag)); | |
117 | ||
118 | return test_bit(con_flag, &con->flags); | |
119 | } | |
120 | ||
6503e0b6 ID |
121 | bool ceph_con_flag_test_and_clear(struct ceph_connection *con, |
122 | unsigned long con_flag) | |
c9ffc77a AE |
123 | { |
124 | BUG_ON(!con_flag_valid(con_flag)); | |
125 | ||
126 | return test_and_clear_bit(con_flag, &con->flags); | |
127 | } | |
128 | ||
6503e0b6 ID |
129 | bool ceph_con_flag_test_and_set(struct ceph_connection *con, |
130 | unsigned long con_flag) | |
c9ffc77a AE |
131 | { |
132 | BUG_ON(!con_flag_valid(con_flag)); | |
133 | ||
134 | return test_and_set_bit(con_flag, &con->flags); | |
135 | } | |
136 | ||
e3d5d638 AE |
137 | /* Slab caches for frequently-allocated structures */ |
138 | ||
139 | static struct kmem_cache *ceph_msg_cache; | |
140 | ||
a6a5349d SW |
141 | #ifdef CONFIG_LOCKDEP |
142 | static struct lock_class_key socket_class; | |
143 | #endif | |
144 | ||
31b8006e | 145 | static void queue_con(struct ceph_connection *con); |
37ab77ac | 146 | static void cancel_con(struct ceph_connection *con); |
68931622 | 147 | static void ceph_con_workfn(struct work_struct *); |
93209264 | 148 | static void con_fault(struct ceph_connection *con); |
31b8006e | 149 | |
31b8006e | 150 | /* |
f64a9317 AE |
151 | * Nicely render a sockaddr as a string. An array of formatted |
152 | * strings is used, to approximate reentrancy. | |
31b8006e | 153 | */ |
f64a9317 AE |
154 | #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ |
155 | #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) | |
156 | #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) | |
157 | #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ | |
158 | ||
159 | static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; | |
160 | static atomic_t addr_str_seq = ATOMIC_INIT(0); | |
31b8006e | 161 | |
699921d9 | 162 | struct page *ceph_zero_page; /* used in certain error cases */ |
57666519 | 163 | |
b726ec97 | 164 | const char *ceph_pr_addr(const struct ceph_entity_addr *addr) |
31b8006e SW |
165 | { |
166 | int i; | |
167 | char *s; | |
b726ec97 JL |
168 | struct sockaddr_storage ss = addr->in_addr; /* align */ |
169 | struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; | |
170 | struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; | |
31b8006e | 171 | |
f64a9317 | 172 | i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; |
31b8006e SW |
173 | s = addr_str[i]; |
174 | ||
b726ec97 | 175 | switch (ss.ss_family) { |
31b8006e | 176 | case AF_INET: |
d3c3c0a8 JL |
177 | snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu", |
178 | le32_to_cpu(addr->type), &in4->sin_addr, | |
bd406145 | 179 | ntohs(in4->sin_port)); |
31b8006e SW |
180 | break; |
181 | ||
182 | case AF_INET6: | |
d3c3c0a8 JL |
183 | snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu", |
184 | le32_to_cpu(addr->type), &in6->sin6_addr, | |
bd406145 | 185 | ntohs(in6->sin6_port)); |
31b8006e SW |
186 | break; |
187 | ||
188 | default: | |
d3002b97 | 189 | snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", |
b726ec97 | 190 | ss.ss_family); |
31b8006e SW |
191 | } |
192 | ||
193 | return s; | |
194 | } | |
3d14c5d2 | 195 | EXPORT_SYMBOL(ceph_pr_addr); |
31b8006e | 196 | |
6503e0b6 | 197 | void ceph_encode_my_addr(struct ceph_messenger *msgr) |
63f2d211 | 198 | { |
cd1a677c ID |
199 | if (!ceph_msgr2(from_msgr(msgr))) { |
200 | memcpy(&msgr->my_enc_addr, &msgr->inst.addr, | |
201 | sizeof(msgr->my_enc_addr)); | |
202 | ceph_encode_banner_addr(&msgr->my_enc_addr); | |
203 | } | |
63f2d211 SW |
204 | } |
205 | ||
31b8006e SW |
206 | /* |
207 | * work queue for all reading and writing to/from the socket. | |
208 | */ | |
e0f43c94 | 209 | static struct workqueue_struct *ceph_msgr_wq; |
31b8006e | 210 | |
e3d5d638 AE |
211 | static int ceph_msgr_slab_init(void) |
212 | { | |
213 | BUG_ON(ceph_msg_cache); | |
5ee61e95 | 214 | ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); |
81b36be4 AE |
215 | if (!ceph_msg_cache) |
216 | return -ENOMEM; | |
217 | ||
0d9c1ab3 | 218 | return 0; |
e3d5d638 AE |
219 | } |
220 | ||
221 | static void ceph_msgr_slab_exit(void) | |
222 | { | |
223 | BUG_ON(!ceph_msg_cache); | |
224 | kmem_cache_destroy(ceph_msg_cache); | |
225 | ceph_msg_cache = NULL; | |
226 | } | |
227 | ||
15417167 | 228 | static void _ceph_msgr_exit(void) |
6173d1f0 | 229 | { |
d3002b97 | 230 | if (ceph_msgr_wq) { |
6173d1f0 | 231 | destroy_workqueue(ceph_msgr_wq); |
d3002b97 AE |
232 | ceph_msgr_wq = NULL; |
233 | } | |
6173d1f0 | 234 | |
699921d9 ID |
235 | BUG_ON(!ceph_zero_page); |
236 | put_page(ceph_zero_page); | |
237 | ceph_zero_page = NULL; | |
d920ff6f BC |
238 | |
239 | ceph_msgr_slab_exit(); | |
6173d1f0 AE |
240 | } |
241 | ||
57a35dfb | 242 | int __init ceph_msgr_init(void) |
31b8006e | 243 | { |
d920ff6f BC |
244 | if (ceph_msgr_slab_init()) |
245 | return -ENOMEM; | |
246 | ||
699921d9 ID |
247 | BUG_ON(ceph_zero_page); |
248 | ceph_zero_page = ZERO_PAGE(0); | |
249 | get_page(ceph_zero_page); | |
57666519 | 250 | |
f9865f06 ID |
251 | /* |
252 | * The number of active work items is limited by the number of | |
253 | * connections, so leave @max_active at default. | |
254 | */ | |
255 | ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); | |
6173d1f0 AE |
256 | if (ceph_msgr_wq) |
257 | return 0; | |
57666519 | 258 | |
6173d1f0 AE |
259 | pr_err("msgr_init failed to create workqueue\n"); |
260 | _ceph_msgr_exit(); | |
57666519 | 261 | |
6173d1f0 | 262 | return -ENOMEM; |
31b8006e SW |
263 | } |
264 | ||
265 | void ceph_msgr_exit(void) | |
266 | { | |
57666519 | 267 | BUG_ON(ceph_msgr_wq == NULL); |
57666519 | 268 | |
6173d1f0 | 269 | _ceph_msgr_exit(); |
31b8006e SW |
270 | } |
271 | ||
cd84db6e | 272 | void ceph_msgr_flush(void) |
a922d38f SW |
273 | { |
274 | flush_workqueue(ceph_msgr_wq); | |
275 | } | |
3d14c5d2 | 276 | EXPORT_SYMBOL(ceph_msgr_flush); |
a922d38f | 277 | |
ce2c8903 AE |
278 | /* Connection socket state transition functions */ |
279 | ||
280 | static void con_sock_state_init(struct ceph_connection *con) | |
281 | { | |
282 | int old_state; | |
283 | ||
284 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); | |
285 | if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) | |
286 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
287 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
288 | CON_SOCK_STATE_CLOSED); | |
ce2c8903 AE |
289 | } |
290 | ||
291 | static void con_sock_state_connecting(struct ceph_connection *con) | |
292 | { | |
293 | int old_state; | |
294 | ||
295 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); | |
296 | if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) | |
297 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
298 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
299 | CON_SOCK_STATE_CONNECTING); | |
ce2c8903 AE |
300 | } |
301 | ||
302 | static void con_sock_state_connected(struct ceph_connection *con) | |
303 | { | |
304 | int old_state; | |
305 | ||
306 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); | |
307 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) | |
308 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
309 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
310 | CON_SOCK_STATE_CONNECTED); | |
ce2c8903 AE |
311 | } |
312 | ||
313 | static void con_sock_state_closing(struct ceph_connection *con) | |
314 | { | |
315 | int old_state; | |
316 | ||
317 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); | |
318 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && | |
319 | old_state != CON_SOCK_STATE_CONNECTED && | |
320 | old_state != CON_SOCK_STATE_CLOSING)) | |
321 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
322 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
323 | CON_SOCK_STATE_CLOSING); | |
ce2c8903 AE |
324 | } |
325 | ||
326 | static void con_sock_state_closed(struct ceph_connection *con) | |
327 | { | |
328 | int old_state; | |
329 | ||
330 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); | |
331 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && | |
fbb85a47 | 332 | old_state != CON_SOCK_STATE_CLOSING && |
8007b8d6 SW |
333 | old_state != CON_SOCK_STATE_CONNECTING && |
334 | old_state != CON_SOCK_STATE_CLOSED)) | |
ce2c8903 | 335 | printk("%s: unexpected old state %d\n", __func__, old_state); |
8007b8d6 SW |
336 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
337 | CON_SOCK_STATE_CLOSED); | |
ce2c8903 | 338 | } |
a922d38f | 339 | |
31b8006e SW |
340 | /* |
341 | * socket callback functions | |
342 | */ | |
343 | ||
344 | /* data available on socket, or listen socket received a connect */ | |
676d2369 | 345 | static void ceph_sock_data_ready(struct sock *sk) |
31b8006e | 346 | { |
bd406145 | 347 | struct ceph_connection *con = sk->sk_user_data; |
40e0b090 PY |
348 | |
349 | trace_sk_data_ready(sk); | |
350 | ||
a2a32584 GH |
351 | if (atomic_read(&con->msgr->stopping)) { |
352 | return; | |
353 | } | |
bd406145 | 354 | |
31b8006e | 355 | if (sk->sk_state != TCP_CLOSE_WAIT) { |
30be780a | 356 | dout("%s %p state = %d, queueing work\n", __func__, |
31b8006e SW |
357 | con, con->state); |
358 | queue_con(con); | |
359 | } | |
360 | } | |
361 | ||
362 | /* socket has buffer space for writing */ | |
327800bd | 363 | static void ceph_sock_write_space(struct sock *sk) |
31b8006e | 364 | { |
d3002b97 | 365 | struct ceph_connection *con = sk->sk_user_data; |
31b8006e | 366 | |
182fac26 JS |
367 | /* only queue to workqueue if there is data we want to write, |
368 | * and there is sufficient space in the socket buffer to accept | |
327800bd | 369 | * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() |
182fac26 JS |
370 | * doesn't get called again until try_write() fills the socket |
371 | * buffer. See net/ipv4/tcp_input.c:tcp_check_space() | |
372 | * and net/core/stream.c:sk_stream_write_space(). | |
373 | */ | |
6503e0b6 | 374 | if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { |
64dc6130 | 375 | if (sk_stream_is_writeable(sk)) { |
327800bd | 376 | dout("%s %p queueing write work\n", __func__, con); |
182fac26 JS |
377 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
378 | queue_con(con); | |
379 | } | |
31b8006e | 380 | } else { |
327800bd | 381 | dout("%s %p nothing to write\n", __func__, con); |
31b8006e | 382 | } |
31b8006e SW |
383 | } |
384 | ||
385 | /* socket's state has changed */ | |
327800bd | 386 | static void ceph_sock_state_change(struct sock *sk) |
31b8006e | 387 | { |
bd406145 | 388 | struct ceph_connection *con = sk->sk_user_data; |
31b8006e | 389 | |
30be780a | 390 | dout("%s %p state = %d sk_state = %u\n", __func__, |
31b8006e SW |
391 | con, con->state, sk->sk_state); |
392 | ||
31b8006e SW |
393 | switch (sk->sk_state) { |
394 | case TCP_CLOSE: | |
327800bd | 395 | dout("%s TCP_CLOSE\n", __func__); |
df561f66 | 396 | fallthrough; |
31b8006e | 397 | case TCP_CLOSE_WAIT: |
327800bd | 398 | dout("%s TCP_CLOSE_WAIT\n", __func__); |
ce2c8903 | 399 | con_sock_state_closing(con); |
6503e0b6 | 400 | ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); |
d65c9e0b | 401 | queue_con(con); |
31b8006e SW |
402 | break; |
403 | case TCP_ESTABLISHED: | |
327800bd | 404 | dout("%s TCP_ESTABLISHED\n", __func__); |
ce2c8903 | 405 | con_sock_state_connected(con); |
31b8006e SW |
406 | queue_con(con); |
407 | break; | |
d3002b97 AE |
408 | default: /* Everything else is uninteresting */ |
409 | break; | |
31b8006e SW |
410 | } |
411 | } | |
412 | ||
413 | /* | |
414 | * set up socket callbacks | |
415 | */ | |
416 | static void set_sock_callbacks(struct socket *sock, | |
417 | struct ceph_connection *con) | |
418 | { | |
419 | struct sock *sk = sock->sk; | |
bd406145 | 420 | sk->sk_user_data = con; |
327800bd AE |
421 | sk->sk_data_ready = ceph_sock_data_ready; |
422 | sk->sk_write_space = ceph_sock_write_space; | |
423 | sk->sk_state_change = ceph_sock_state_change; | |
31b8006e SW |
424 | } |
425 | ||
426 | ||
427 | /* | |
428 | * socket helpers | |
429 | */ | |
430 | ||
431 | /* | |
432 | * initiate connection to a remote socket. | |
433 | */ | |
6503e0b6 | 434 | int ceph_tcp_connect(struct ceph_connection *con) |
31b8006e | 435 | { |
cede185b | 436 | struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ |
31b8006e | 437 | struct socket *sock; |
633ee407 | 438 | unsigned int noio_flag; |
31b8006e SW |
439 | int ret; |
440 | ||
6503e0b6 ID |
441 | dout("%s con %p peer_addr %s\n", __func__, con, |
442 | ceph_pr_addr(&con->peer_addr)); | |
31b8006e | 443 | BUG_ON(con->sock); |
633ee407 ID |
444 | |
445 | /* sock_create_kern() allocates with GFP_KERNEL */ | |
446 | noio_flag = memalloc_noio_save(); | |
cede185b | 447 | ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, |
eeb1bd5c | 448 | SOCK_STREAM, IPPROTO_TCP, &sock); |
633ee407 | 449 | memalloc_noio_restore(noio_flag); |
31b8006e | 450 | if (ret) |
41617d0c | 451 | return ret; |
6d7fdb0a | 452 | sock->sk->sk_allocation = GFP_NOFS; |
98123866 | 453 | sock->sk->sk_use_task_frag = false; |
31b8006e | 454 | |
a6a5349d SW |
455 | #ifdef CONFIG_LOCKDEP |
456 | lockdep_set_class(&sock->sk->sk_lock, &socket_class); | |
457 | #endif | |
458 | ||
31b8006e SW |
459 | set_sock_callbacks(sock, con); |
460 | ||
89a86be0 | 461 | con_sock_state_connecting(con); |
cede185b | 462 | ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), |
f91d3471 | 463 | O_NONBLOCK); |
31b8006e SW |
464 | if (ret == -EINPROGRESS) { |
465 | dout("connect %s EINPROGRESS sk_state = %u\n", | |
b726ec97 | 466 | ceph_pr_addr(&con->peer_addr), |
31b8006e | 467 | sock->sk->sk_state); |
a5bc3129 | 468 | } else if (ret < 0) { |
31b8006e | 469 | pr_err("connect %s error %d\n", |
b726ec97 | 470 | ceph_pr_addr(&con->peer_addr), ret); |
31b8006e | 471 | sock_release(sock); |
41617d0c | 472 | return ret; |
a5bc3129 | 473 | } |
89baaa57 | 474 | |
12abc5ee CH |
475 | if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) |
476 | tcp_sock_set_nodelay(sock->sk); | |
ba988f87 | 477 | |
a5bc3129 | 478 | con->sock = sock; |
41617d0c | 479 | return 0; |
31b8006e SW |
480 | } |
481 | ||
31b8006e SW |
482 | /* |
483 | * Shutdown/close the socket for the given connection. | |
484 | */ | |
6503e0b6 | 485 | int ceph_con_close_socket(struct ceph_connection *con) |
31b8006e | 486 | { |
8007b8d6 | 487 | int rc = 0; |
31b8006e | 488 | |
6503e0b6 | 489 | dout("%s con %p sock %p\n", __func__, con, con->sock); |
8007b8d6 SW |
490 | if (con->sock) { |
491 | rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); | |
492 | sock_release(con->sock); | |
493 | con->sock = NULL; | |
494 | } | |
456ea468 AE |
495 | |
496 | /* | |
4a861692 | 497 | * Forcibly clear the SOCK_CLOSED flag. It gets set |
456ea468 AE |
498 | * independent of the connection mutex, and we could have |
499 | * received a socket close event before we had the chance to | |
500 | * shut the socket down. | |
501 | */ | |
6503e0b6 | 502 | ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); |
8007b8d6 | 503 | |
ce2c8903 | 504 | con_sock_state_closed(con); |
31b8006e SW |
505 | return rc; |
506 | } | |
507 | ||
3596f4c1 ID |
508 | static void ceph_con_reset_protocol(struct ceph_connection *con) |
509 | { | |
510 | dout("%s con %p\n", __func__, con); | |
511 | ||
6503e0b6 | 512 | ceph_con_close_socket(con); |
3596f4c1 ID |
513 | if (con->in_msg) { |
514 | WARN_ON(con->in_msg->con != con); | |
515 | ceph_msg_put(con->in_msg); | |
516 | con->in_msg = NULL; | |
517 | } | |
518 | if (con->out_msg) { | |
519 | WARN_ON(con->out_msg->con != con); | |
520 | ceph_msg_put(con->out_msg); | |
521 | con->out_msg = NULL; | |
522 | } | |
038b8d1d ID |
523 | if (con->bounce_page) { |
524 | __free_page(con->bounce_page); | |
525 | con->bounce_page = NULL; | |
526 | } | |
3596f4c1 | 527 | |
cd1a677c ID |
528 | if (ceph_msgr2(from_msgr(con->msgr))) |
529 | ceph_con_v2_reset_protocol(con); | |
530 | else | |
531 | ceph_con_v1_reset_protocol(con); | |
3596f4c1 ID |
532 | } |
533 | ||
31b8006e SW |
534 | /* |
535 | * Reset a connection. Discard all incoming and outgoing messages | |
536 | * and clear *_seq state. | |
537 | */ | |
538 | static void ceph_msg_remove(struct ceph_msg *msg) | |
539 | { | |
540 | list_del_init(&msg->list_head); | |
38941f80 | 541 | |
31b8006e SW |
542 | ceph_msg_put(msg); |
543 | } | |
cd1a677c | 544 | |
31b8006e SW |
545 | static void ceph_msg_remove_list(struct list_head *head) |
546 | { | |
547 | while (!list_empty(head)) { | |
548 | struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, | |
549 | list_head); | |
550 | ceph_msg_remove(msg); | |
551 | } | |
552 | } | |
553 | ||
6503e0b6 | 554 | void ceph_con_reset_session(struct ceph_connection *con) |
31b8006e | 555 | { |
5963c3d0 | 556 | dout("%s con %p\n", __func__, con); |
3596f4c1 ID |
557 | |
558 | WARN_ON(con->in_msg); | |
559 | WARN_ON(con->out_msg); | |
31b8006e SW |
560 | ceph_msg_remove_list(&con->out_queue); |
561 | ceph_msg_remove_list(&con->out_sent); | |
31b8006e | 562 | con->out_seq = 0; |
31b8006e | 563 | con->in_seq = 0; |
0e0d5e0c | 564 | con->in_seq_acked = 0; |
a3da057b | 565 | |
cd1a677c ID |
566 | if (ceph_msgr2(from_msgr(con->msgr))) |
567 | ceph_con_v2_reset_session(con); | |
568 | else | |
569 | ceph_con_v1_reset_session(con); | |
31b8006e SW |
570 | } |
571 | ||
572 | /* | |
573 | * mark a peer down. drop any open connections. | |
574 | */ | |
575 | void ceph_con_close(struct ceph_connection *con) | |
576 | { | |
8c50c817 | 577 | mutex_lock(&con->mutex); |
b726ec97 | 578 | dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); |
6d7f62bf | 579 | con->state = CEPH_CON_S_CLOSED; |
a5988c49 | 580 | |
6503e0b6 ID |
581 | ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next |
582 | connect */ | |
583 | ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); | |
584 | ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); | |
585 | ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); | |
a5988c49 | 586 | |
3596f4c1 | 587 | ceph_con_reset_protocol(con); |
5963c3d0 | 588 | ceph_con_reset_session(con); |
37ab77ac | 589 | cancel_con(con); |
ec302645 | 590 | mutex_unlock(&con->mutex); |
31b8006e | 591 | } |
3d14c5d2 | 592 | EXPORT_SYMBOL(ceph_con_close); |
31b8006e | 593 | |
31b8006e SW |
594 | /* |
595 | * Reopen a closed connection, with a new peer address. | |
596 | */ | |
b7a9e5dd SW |
597 | void ceph_con_open(struct ceph_connection *con, |
598 | __u8 entity_type, __u64 entity_num, | |
599 | struct ceph_entity_addr *addr) | |
31b8006e | 600 | { |
5469155f | 601 | mutex_lock(&con->mutex); |
b726ec97 | 602 | dout("con_open %p %s\n", con, ceph_pr_addr(addr)); |
8dacc7da | 603 | |
6d7f62bf ID |
604 | WARN_ON(con->state != CEPH_CON_S_CLOSED); |
605 | con->state = CEPH_CON_S_PREOPEN; | |
a5988c49 | 606 | |
b7a9e5dd SW |
607 | con->peer_name.type = (__u8) entity_type; |
608 | con->peer_name.num = cpu_to_le64(entity_num); | |
609 | ||
31b8006e | 610 | memcpy(&con->peer_addr, addr, sizeof(*addr)); |
03c677e1 | 611 | con->delay = 0; /* reset backoff memory */ |
5469155f | 612 | mutex_unlock(&con->mutex); |
31b8006e SW |
613 | queue_con(con); |
614 | } | |
3d14c5d2 | 615 | EXPORT_SYMBOL(ceph_con_open); |
31b8006e | 616 | |
87b315a5 SW |
617 | /* |
618 | * return true if this connection ever successfully opened | |
619 | */ | |
620 | bool ceph_con_opened(struct ceph_connection *con) | |
621 | { | |
cd1a677c ID |
622 | if (ceph_msgr2(from_msgr(con->msgr))) |
623 | return ceph_con_v2_opened(con); | |
624 | ||
566050e1 | 625 | return ceph_con_v1_opened(con); |
87b315a5 SW |
626 | } |
627 | ||
31b8006e SW |
628 | /* |
629 | * initialize a new connection. | |
630 | */ | |
1bfd89f4 AE |
631 | void ceph_con_init(struct ceph_connection *con, void *private, |
632 | const struct ceph_connection_operations *ops, | |
b7a9e5dd | 633 | struct ceph_messenger *msgr) |
31b8006e SW |
634 | { |
635 | dout("con_init %p\n", con); | |
636 | memset(con, 0, sizeof(*con)); | |
1bfd89f4 AE |
637 | con->private = private; |
638 | con->ops = ops; | |
31b8006e | 639 | con->msgr = msgr; |
ce2c8903 AE |
640 | |
641 | con_sock_state_init(con); | |
642 | ||
ec302645 | 643 | mutex_init(&con->mutex); |
31b8006e SW |
644 | INIT_LIST_HEAD(&con->out_queue); |
645 | INIT_LIST_HEAD(&con->out_sent); | |
68931622 | 646 | INIT_DELAYED_WORK(&con->work, ceph_con_workfn); |
a5988c49 | 647 | |
6d7f62bf | 648 | con->state = CEPH_CON_S_CLOSED; |
31b8006e | 649 | } |
3d14c5d2 | 650 | EXPORT_SYMBOL(ceph_con_init); |
31b8006e | 651 | |
31b8006e SW |
652 | /* |
653 | * We maintain a global counter to order connection attempts. Get | |
654 | * a unique seq greater than @gt. | |
655 | */ | |
6503e0b6 | 656 | u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) |
31b8006e SW |
657 | { |
658 | u32 ret; | |
659 | ||
660 | spin_lock(&msgr->global_seq_lock); | |
661 | if (msgr->global_seq < gt) | |
662 | msgr->global_seq = gt; | |
663 | ret = ++msgr->global_seq; | |
664 | spin_unlock(&msgr->global_seq_lock); | |
665 | return ret; | |
666 | } | |
667 | ||
02471928 ID |
668 | /* |
669 | * Discard messages that have been acked by the server. | |
670 | */ | |
6503e0b6 | 671 | void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) |
02471928 ID |
672 | { |
673 | struct ceph_msg *msg; | |
674 | u64 seq; | |
675 | ||
676 | dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); | |
677 | while (!list_empty(&con->out_sent)) { | |
678 | msg = list_first_entry(&con->out_sent, struct ceph_msg, | |
679 | list_head); | |
680 | WARN_ON(msg->needs_out_seq); | |
681 | seq = le64_to_cpu(msg->hdr.seq); | |
682 | if (seq > ack_seq) | |
683 | break; | |
684 | ||
685 | dout("%s con %p discarding msg %p seq %llu\n", __func__, con, | |
686 | msg, seq); | |
687 | ceph_msg_remove(msg); | |
688 | } | |
689 | } | |
690 | ||
691 | /* | |
692 | * Discard messages that have been requeued in con_fault(), up to | |
693 | * reconnect_seq. This avoids gratuitously resending messages that | |
694 | * the server had received and handled prior to reconnect. | |
695 | */ | |
6503e0b6 | 696 | void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) |
02471928 ID |
697 | { |
698 | struct ceph_msg *msg; | |
699 | u64 seq; | |
700 | ||
701 | dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); | |
702 | while (!list_empty(&con->out_queue)) { | |
703 | msg = list_first_entry(&con->out_queue, struct ceph_msg, | |
704 | list_head); | |
705 | if (msg->needs_out_seq) | |
706 | break; | |
707 | seq = le64_to_cpu(msg->hdr.seq); | |
708 | if (seq > reconnect_seq) | |
709 | break; | |
710 | ||
711 | dout("%s con %p discarding msg %p seq %llu\n", __func__, con, | |
712 | msg, seq); | |
713 | ceph_msg_remove(msg); | |
714 | } | |
715 | } | |
716 | ||
df6ad1f9 | 717 | #ifdef CONFIG_BLOCK |
6aaa4511 AE |
718 | |
719 | /* | |
720 | * For a bio data item, a piece is whatever remains of the next | |
721 | * entry in the current bio iovec, or the first entry in the next | |
722 | * bio in the list. | |
723 | */ | |
8ae4f4f5 | 724 | static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, |
25aff7c5 | 725 | size_t length) |
6aaa4511 | 726 | { |
8ae4f4f5 | 727 | struct ceph_msg_data *data = cursor->data; |
5359a17d | 728 | struct ceph_bio_iter *it = &cursor->bio_iter; |
6aaa4511 | 729 | |
5359a17d ID |
730 | cursor->resid = min_t(size_t, length, data->bio_length); |
731 | *it = data->bio_pos; | |
732 | if (cursor->resid < it->iter.bi_size) | |
733 | it->iter.bi_size = cursor->resid; | |
6aaa4511 | 734 | |
5359a17d | 735 | BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); |
6aaa4511 AE |
736 | } |
737 | ||
8ae4f4f5 | 738 | static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, |
6aaa4511 AE |
739 | size_t *page_offset, |
740 | size_t *length) | |
741 | { | |
5359a17d ID |
742 | struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, |
743 | cursor->bio_iter.iter); | |
6aaa4511 | 744 | |
5359a17d ID |
745 | *page_offset = bv.bv_offset; |
746 | *length = bv.bv_len; | |
747 | return bv.bv_page; | |
6aaa4511 AE |
748 | } |
749 | ||
8ae4f4f5 AE |
750 | static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, |
751 | size_t bytes) | |
6aaa4511 | 752 | { |
5359a17d | 753 | struct ceph_bio_iter *it = &cursor->bio_iter; |
187df763 | 754 | struct page *page = bio_iter_page(it->bio, it->iter); |
6aaa4511 | 755 | |
5359a17d ID |
756 | BUG_ON(bytes > cursor->resid); |
757 | BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); | |
25aff7c5 | 758 | cursor->resid -= bytes; |
5359a17d | 759 | bio_advance_iter(it->bio, &it->iter, bytes); |
f38a5181 | 760 | |
da4ab869 | 761 | if (!cursor->resid) |
5359a17d | 762 | return false; /* no more data */ |
f38a5181 | 763 | |
187df763 ID |
764 | if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && |
765 | page == bio_iter_page(it->bio, it->iter))) | |
6aaa4511 AE |
766 | return false; /* more bytes to process in this segment */ |
767 | ||
5359a17d ID |
768 | if (!it->iter.bi_size) { |
769 | it->bio = it->bio->bi_next; | |
770 | it->iter = it->bio->bi_iter; | |
771 | if (cursor->resid < it->iter.bi_size) | |
772 | it->iter.bi_size = cursor->resid; | |
25aff7c5 | 773 | } |
6aaa4511 | 774 | |
5359a17d | 775 | BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); |
6aaa4511 AE |
776 | return true; |
777 | } | |
ea96571f | 778 | #endif /* CONFIG_BLOCK */ |
df6ad1f9 | 779 | |
b9e281c2 ID |
780 | static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, |
781 | size_t length) | |
782 | { | |
783 | struct ceph_msg_data *data = cursor->data; | |
784 | struct bio_vec *bvecs = data->bvec_pos.bvecs; | |
785 | ||
786 | cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); | |
787 | cursor->bvec_iter = data->bvec_pos.iter; | |
788 | cursor->bvec_iter.bi_size = cursor->resid; | |
789 | ||
790 | BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); | |
b9e281c2 ID |
791 | } |
792 | ||
793 | static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, | |
794 | size_t *page_offset, | |
795 | size_t *length) | |
796 | { | |
797 | struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, | |
798 | cursor->bvec_iter); | |
799 | ||
800 | *page_offset = bv.bv_offset; | |
801 | *length = bv.bv_len; | |
802 | return bv.bv_page; | |
803 | } | |
804 | ||
805 | static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, | |
806 | size_t bytes) | |
807 | { | |
808 | struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; | |
187df763 | 809 | struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter); |
b9e281c2 ID |
810 | |
811 | BUG_ON(bytes > cursor->resid); | |
812 | BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); | |
813 | cursor->resid -= bytes; | |
814 | bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); | |
815 | ||
da4ab869 | 816 | if (!cursor->resid) |
b9e281c2 | 817 | return false; /* no more data */ |
b9e281c2 | 818 | |
187df763 ID |
819 | if (!bytes || (cursor->bvec_iter.bi_bvec_done && |
820 | page == bvec_iter_page(bvecs, cursor->bvec_iter))) | |
b9e281c2 ID |
821 | return false; /* more bytes to process in this segment */ |
822 | ||
b9e281c2 | 823 | BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); |
b9e281c2 ID |
824 | return true; |
825 | } | |
826 | ||
e766d7b5 AE |
827 | /* |
828 | * For a page array, a piece comes from the first page in the array | |
829 | * that has not already been fully consumed. | |
830 | */ | |
8ae4f4f5 | 831 | static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, |
25aff7c5 | 832 | size_t length) |
e766d7b5 | 833 | { |
8ae4f4f5 | 834 | struct ceph_msg_data *data = cursor->data; |
e766d7b5 AE |
835 | int page_count; |
836 | ||
837 | BUG_ON(data->type != CEPH_MSG_DATA_PAGES); | |
838 | ||
839 | BUG_ON(!data->pages); | |
840 | BUG_ON(!data->length); | |
841 | ||
ca8b3a69 | 842 | cursor->resid = min(length, data->length); |
e766d7b5 | 843 | page_count = calc_pages_for(data->alignment, (u64)data->length); |
e766d7b5 AE |
844 | cursor->page_offset = data->alignment & ~PAGE_MASK; |
845 | cursor->page_index = 0; | |
56fc5659 AE |
846 | BUG_ON(page_count > (int)USHRT_MAX); |
847 | cursor->page_count = (unsigned short)page_count; | |
848 | BUG_ON(length > SIZE_MAX - cursor->page_offset); | |
e766d7b5 AE |
849 | } |
850 | ||
8ae4f4f5 AE |
851 | static struct page * |
852 | ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, | |
853 | size_t *page_offset, size_t *length) | |
e766d7b5 | 854 | { |
8ae4f4f5 | 855 | struct ceph_msg_data *data = cursor->data; |
e766d7b5 AE |
856 | |
857 | BUG_ON(data->type != CEPH_MSG_DATA_PAGES); | |
858 | ||
859 | BUG_ON(cursor->page_index >= cursor->page_count); | |
860 | BUG_ON(cursor->page_offset >= PAGE_SIZE); | |
e766d7b5 AE |
861 | |
862 | *page_offset = cursor->page_offset; | |
da4ab869 | 863 | *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); |
e766d7b5 AE |
864 | return data->pages[cursor->page_index]; |
865 | } | |
866 | ||
8ae4f4f5 | 867 | static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, |
e766d7b5 AE |
868 | size_t bytes) |
869 | { | |
8ae4f4f5 | 870 | BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); |
e766d7b5 AE |
871 | |
872 | BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); | |
e766d7b5 AE |
873 | |
874 | /* Advance the cursor page offset */ | |
875 | ||
876 | cursor->resid -= bytes; | |
5df521b1 AE |
877 | cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; |
878 | if (!bytes || cursor->page_offset) | |
e766d7b5 AE |
879 | return false; /* more bytes to process in the current page */ |
880 | ||
d90deda6 YZ |
881 | if (!cursor->resid) |
882 | return false; /* no more data */ | |
883 | ||
5df521b1 | 884 | /* Move on to the next page; offset is already at 0 */ |
e766d7b5 AE |
885 | |
886 | BUG_ON(cursor->page_index >= cursor->page_count); | |
e766d7b5 | 887 | cursor->page_index++; |
e766d7b5 AE |
888 | return true; |
889 | } | |
890 | ||
fe38a2b6 | 891 | /* |
dd236fcb AE |
892 | * For a pagelist, a piece is whatever remains to be consumed in the |
893 | * first page in the list, or the front of the next page. | |
fe38a2b6 | 894 | */ |
8ae4f4f5 AE |
895 | static void |
896 | ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, | |
25aff7c5 | 897 | size_t length) |
fe38a2b6 | 898 | { |
8ae4f4f5 | 899 | struct ceph_msg_data *data = cursor->data; |
fe38a2b6 AE |
900 | struct ceph_pagelist *pagelist; |
901 | struct page *page; | |
902 | ||
dd236fcb | 903 | BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); |
fe38a2b6 AE |
904 | |
905 | pagelist = data->pagelist; | |
906 | BUG_ON(!pagelist); | |
25aff7c5 AE |
907 | |
908 | if (!length) | |
fe38a2b6 AE |
909 | return; /* pagelist can be assigned but empty */ |
910 | ||
911 | BUG_ON(list_empty(&pagelist->head)); | |
912 | page = list_first_entry(&pagelist->head, struct page, lru); | |
913 | ||
ca8b3a69 | 914 | cursor->resid = min(length, pagelist->length); |
fe38a2b6 AE |
915 | cursor->page = page; |
916 | cursor->offset = 0; | |
fe38a2b6 AE |
917 | } |
918 | ||
8ae4f4f5 AE |
919 | static struct page * |
920 | ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, | |
921 | size_t *page_offset, size_t *length) | |
fe38a2b6 | 922 | { |
8ae4f4f5 | 923 | struct ceph_msg_data *data = cursor->data; |
fe38a2b6 | 924 | struct ceph_pagelist *pagelist; |
fe38a2b6 AE |
925 | |
926 | BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); | |
927 | ||
928 | pagelist = data->pagelist; | |
929 | BUG_ON(!pagelist); | |
930 | ||
931 | BUG_ON(!cursor->page); | |
25aff7c5 | 932 | BUG_ON(cursor->offset + cursor->resid != pagelist->length); |
fe38a2b6 | 933 | |
5df521b1 | 934 | /* offset of first page in pagelist is always 0 */ |
fe38a2b6 | 935 | *page_offset = cursor->offset & ~PAGE_MASK; |
da4ab869 | 936 | *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); |
8ae4f4f5 | 937 | return cursor->page; |
fe38a2b6 AE |
938 | } |
939 | ||
8ae4f4f5 | 940 | static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, |
dd236fcb | 941 | size_t bytes) |
fe38a2b6 | 942 | { |
8ae4f4f5 | 943 | struct ceph_msg_data *data = cursor->data; |
fe38a2b6 AE |
944 | struct ceph_pagelist *pagelist; |
945 | ||
946 | BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); | |
947 | ||
948 | pagelist = data->pagelist; | |
949 | BUG_ON(!pagelist); | |
25aff7c5 AE |
950 | |
951 | BUG_ON(cursor->offset + cursor->resid != pagelist->length); | |
fe38a2b6 AE |
952 | BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); |
953 | ||
954 | /* Advance the cursor offset */ | |
955 | ||
25aff7c5 | 956 | cursor->resid -= bytes; |
fe38a2b6 | 957 | cursor->offset += bytes; |
5df521b1 | 958 | /* offset of first page in pagelist is always 0 */ |
fe38a2b6 AE |
959 | if (!bytes || cursor->offset & ~PAGE_MASK) |
960 | return false; /* more bytes to process in the current page */ | |
961 | ||
d90deda6 YZ |
962 | if (!cursor->resid) |
963 | return false; /* no more data */ | |
964 | ||
fe38a2b6 AE |
965 | /* Move on to the next page */ |
966 | ||
967 | BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); | |
17ddc49b | 968 | cursor->page = list_next_entry(cursor->page, lru); |
fe38a2b6 AE |
969 | return true; |
970 | } | |
971 | ||
dee0c5f8 JL |
972 | static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor, |
973 | size_t length) | |
974 | { | |
975 | struct ceph_msg_data *data = cursor->data; | |
976 | ||
977 | cursor->iov_iter = data->iter; | |
978 | cursor->lastlen = 0; | |
979 | iov_iter_truncate(&cursor->iov_iter, length); | |
980 | cursor->resid = iov_iter_count(&cursor->iov_iter); | |
981 | } | |
982 | ||
983 | static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor, | |
984 | size_t *page_offset, size_t *length) | |
985 | { | |
986 | struct page *page; | |
987 | ssize_t len; | |
988 | ||
989 | if (cursor->lastlen) | |
990 | iov_iter_revert(&cursor->iov_iter, cursor->lastlen); | |
991 | ||
992 | len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE, | |
993 | 1, page_offset); | |
994 | BUG_ON(len < 0); | |
995 | ||
996 | cursor->lastlen = len; | |
997 | ||
998 | /* | |
999 | * FIXME: The assumption is that the pages represented by the iov_iter | |
1000 | * are pinned, with the references held by the upper-level | |
1001 | * callers, or by virtue of being under writeback. Eventually, | |
1002 | * we'll get an iov_iter_get_pages2 variant that doesn't take | |
1003 | * page refs. Until then, just put the page ref. | |
1004 | */ | |
1005 | VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page); | |
1006 | put_page(page); | |
1007 | ||
1008 | *length = min_t(size_t, len, cursor->resid); | |
1009 | return page; | |
1010 | } | |
1011 | ||
1012 | static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor, | |
1013 | size_t bytes) | |
1014 | { | |
1015 | BUG_ON(bytes > cursor->resid); | |
1016 | cursor->resid -= bytes; | |
1017 | ||
1018 | if (bytes < cursor->lastlen) { | |
1019 | cursor->lastlen -= bytes; | |
1020 | } else { | |
1021 | iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen); | |
1022 | cursor->lastlen = 0; | |
1023 | } | |
1024 | ||
1025 | return cursor->resid; | |
1026 | } | |
1027 | ||
dd236fcb AE |
1028 | /* |
1029 | * Message data is handled (sent or received) in pieces, where each | |
1030 | * piece resides on a single page. The network layer might not | |
1031 | * consume an entire piece at once. A data item's cursor keeps | |
1032 | * track of which piece is next to process and how much remains to | |
1033 | * be processed in that piece. It also tracks whether the current | |
1034 | * piece is the last one in the data item. | |
1035 | */ | |
ca8b3a69 | 1036 | static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) |
dd236fcb | 1037 | { |
ca8b3a69 | 1038 | size_t length = cursor->total_resid; |
8ae4f4f5 | 1039 | |
8ae4f4f5 | 1040 | switch (cursor->data->type) { |
dd236fcb | 1041 | case CEPH_MSG_DATA_PAGELIST: |
8ae4f4f5 | 1042 | ceph_msg_data_pagelist_cursor_init(cursor, length); |
dd236fcb | 1043 | break; |
e766d7b5 | 1044 | case CEPH_MSG_DATA_PAGES: |
8ae4f4f5 | 1045 | ceph_msg_data_pages_cursor_init(cursor, length); |
e766d7b5 | 1046 | break; |
dd236fcb AE |
1047 | #ifdef CONFIG_BLOCK |
1048 | case CEPH_MSG_DATA_BIO: | |
8ae4f4f5 | 1049 | ceph_msg_data_bio_cursor_init(cursor, length); |
6aaa4511 | 1050 | break; |
dd236fcb | 1051 | #endif /* CONFIG_BLOCK */ |
b9e281c2 ID |
1052 | case CEPH_MSG_DATA_BVECS: |
1053 | ceph_msg_data_bvecs_cursor_init(cursor, length); | |
1054 | break; | |
dee0c5f8 JL |
1055 | case CEPH_MSG_DATA_ITER: |
1056 | ceph_msg_data_iter_cursor_init(cursor, length); | |
1057 | break; | |
6aaa4511 | 1058 | case CEPH_MSG_DATA_NONE: |
dd236fcb AE |
1059 | default: |
1060 | /* BUG(); */ | |
1061 | break; | |
1062 | } | |
8ae4f4f5 | 1063 | cursor->need_crc = true; |
dd236fcb AE |
1064 | } |
1065 | ||
6503e0b6 ID |
1066 | void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, |
1067 | struct ceph_msg *msg, size_t length) | |
ca8b3a69 | 1068 | { |
ca8b3a69 AE |
1069 | BUG_ON(!length); |
1070 | BUG_ON(length > msg->data_length); | |
0d9c1ab3 | 1071 | BUG_ON(!msg->num_data_items); |
ca8b3a69 | 1072 | |
ca8b3a69 | 1073 | cursor->total_resid = length; |
0d9c1ab3 | 1074 | cursor->data = msg->data; |
ec3bc567 | 1075 | cursor->sr_resid = 0; |
ca8b3a69 AE |
1076 | |
1077 | __ceph_msg_data_cursor_init(cursor); | |
1078 | } | |
1079 | ||
dd236fcb AE |
1080 | /* |
1081 | * Return the page containing the next piece to process for a given | |
1082 | * data item, and supply the page offset and length of that piece. | |
1083 | * Indicate whether this is the last piece in this data item. | |
1084 | */ | |
6503e0b6 | 1085 | struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, |
da4ab869 | 1086 | size_t *page_offset, size_t *length) |
dd236fcb AE |
1087 | { |
1088 | struct page *page; | |
1089 | ||
8ae4f4f5 | 1090 | switch (cursor->data->type) { |
dd236fcb | 1091 | case CEPH_MSG_DATA_PAGELIST: |
8ae4f4f5 | 1092 | page = ceph_msg_data_pagelist_next(cursor, page_offset, length); |
dd236fcb | 1093 | break; |
e766d7b5 | 1094 | case CEPH_MSG_DATA_PAGES: |
8ae4f4f5 | 1095 | page = ceph_msg_data_pages_next(cursor, page_offset, length); |
e766d7b5 | 1096 | break; |
dd236fcb AE |
1097 | #ifdef CONFIG_BLOCK |
1098 | case CEPH_MSG_DATA_BIO: | |
8ae4f4f5 | 1099 | page = ceph_msg_data_bio_next(cursor, page_offset, length); |
6aaa4511 | 1100 | break; |
dd236fcb | 1101 | #endif /* CONFIG_BLOCK */ |
b9e281c2 ID |
1102 | case CEPH_MSG_DATA_BVECS: |
1103 | page = ceph_msg_data_bvecs_next(cursor, page_offset, length); | |
1104 | break; | |
dee0c5f8 JL |
1105 | case CEPH_MSG_DATA_ITER: |
1106 | page = ceph_msg_data_iter_next(cursor, page_offset, length); | |
1107 | break; | |
6aaa4511 | 1108 | case CEPH_MSG_DATA_NONE: |
dd236fcb AE |
1109 | default: |
1110 | page = NULL; | |
1111 | break; | |
1112 | } | |
5359a17d | 1113 | |
dd236fcb AE |
1114 | BUG_ON(!page); |
1115 | BUG_ON(*page_offset + *length > PAGE_SIZE); | |
1116 | BUG_ON(!*length); | |
5359a17d | 1117 | BUG_ON(*length > cursor->resid); |
dd236fcb AE |
1118 | |
1119 | return page; | |
1120 | } | |
1121 | ||
1122 | /* | |
1123 | * Returns true if the result moves the cursor on to the next piece | |
1124 | * of the data item. | |
1125 | */ | |
6503e0b6 | 1126 | void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) |
dd236fcb AE |
1127 | { |
1128 | bool new_piece; | |
1129 | ||
25aff7c5 | 1130 | BUG_ON(bytes > cursor->resid); |
8ae4f4f5 | 1131 | switch (cursor->data->type) { |
dd236fcb | 1132 | case CEPH_MSG_DATA_PAGELIST: |
8ae4f4f5 | 1133 | new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); |
dd236fcb | 1134 | break; |
e766d7b5 | 1135 | case CEPH_MSG_DATA_PAGES: |
8ae4f4f5 | 1136 | new_piece = ceph_msg_data_pages_advance(cursor, bytes); |
e766d7b5 | 1137 | break; |
dd236fcb AE |
1138 | #ifdef CONFIG_BLOCK |
1139 | case CEPH_MSG_DATA_BIO: | |
8ae4f4f5 | 1140 | new_piece = ceph_msg_data_bio_advance(cursor, bytes); |
6aaa4511 | 1141 | break; |
dd236fcb | 1142 | #endif /* CONFIG_BLOCK */ |
b9e281c2 ID |
1143 | case CEPH_MSG_DATA_BVECS: |
1144 | new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); | |
1145 | break; | |
dee0c5f8 JL |
1146 | case CEPH_MSG_DATA_ITER: |
1147 | new_piece = ceph_msg_data_iter_advance(cursor, bytes); | |
1148 | break; | |
6aaa4511 | 1149 | case CEPH_MSG_DATA_NONE: |
dd236fcb AE |
1150 | default: |
1151 | BUG(); | |
1152 | break; | |
1153 | } | |
ca8b3a69 | 1154 | cursor->total_resid -= bytes; |
dd236fcb | 1155 | |
ca8b3a69 | 1156 | if (!cursor->resid && cursor->total_resid) { |
0d9c1ab3 | 1157 | cursor->data++; |
ca8b3a69 | 1158 | __ceph_msg_data_cursor_init(cursor); |
a51b272e | 1159 | new_piece = true; |
ca8b3a69 | 1160 | } |
a51b272e | 1161 | cursor->need_crc = new_piece; |
dd236fcb AE |
1162 | } |
1163 | ||
6503e0b6 ID |
1164 | u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, |
1165 | unsigned int length) | |
35b62808 AE |
1166 | { |
1167 | char *kaddr; | |
1168 | ||
1169 | kaddr = kmap(page); | |
1170 | BUG_ON(kaddr == NULL); | |
1171 | crc = crc32c(crc, kaddr + page_offset, length); | |
1172 | kunmap(page); | |
1173 | ||
1174 | return crc; | |
1175 | } | |
31b8006e | 1176 | |
6503e0b6 | 1177 | bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) |
31b8006e | 1178 | { |
cede185b JL |
1179 | struct sockaddr_storage ss = addr->in_addr; /* align */ |
1180 | struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; | |
1181 | struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; | |
c44bd69c | 1182 | |
cede185b | 1183 | switch (ss.ss_family) { |
31b8006e | 1184 | case AF_INET: |
cede185b | 1185 | return addr4->s_addr == htonl(INADDR_ANY); |
31b8006e | 1186 | case AF_INET6: |
c44bd69c ID |
1187 | return ipv6_addr_any(addr6); |
1188 | default: | |
1189 | return true; | |
31b8006e | 1190 | } |
31b8006e | 1191 | } |
8ff2c64c | 1192 | EXPORT_SYMBOL(ceph_addr_is_blank); |
31b8006e | 1193 | |
6503e0b6 | 1194 | int ceph_addr_port(const struct ceph_entity_addr *addr) |
31b8006e | 1195 | { |
cede185b | 1196 | switch (get_unaligned(&addr->in_addr.ss_family)) { |
31b8006e | 1197 | case AF_INET: |
cede185b | 1198 | return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); |
31b8006e | 1199 | case AF_INET6: |
cede185b | 1200 | return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); |
31b8006e SW |
1201 | } |
1202 | return 0; | |
1203 | } | |
1204 | ||
6503e0b6 | 1205 | void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) |
31b8006e | 1206 | { |
cede185b | 1207 | switch (get_unaligned(&addr->in_addr.ss_family)) { |
31b8006e | 1208 | case AF_INET: |
cede185b | 1209 | put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); |
a2a79609 | 1210 | break; |
31b8006e | 1211 | case AF_INET6: |
cede185b | 1212 | put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); |
a2a79609 | 1213 | break; |
31b8006e SW |
1214 | } |
1215 | } | |
1216 | ||
ee3b56f2 NW |
1217 | /* |
1218 | * Unlike other *_pton function semantics, zero indicates success. | |
1219 | */ | |
cede185b | 1220 | static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, |
ee3b56f2 NW |
1221 | char delim, const char **ipend) |
1222 | { | |
cede185b | 1223 | memset(&addr->in_addr, 0, sizeof(addr->in_addr)); |
ee3b56f2 | 1224 | |
cede185b JL |
1225 | if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { |
1226 | put_unaligned(AF_INET, &addr->in_addr.ss_family); | |
ee3b56f2 NW |
1227 | return 0; |
1228 | } | |
1229 | ||
cede185b JL |
1230 | if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { |
1231 | put_unaligned(AF_INET6, &addr->in_addr.ss_family); | |
ee3b56f2 NW |
1232 | return 0; |
1233 | } | |
1234 | ||
1235 | return -EINVAL; | |
1236 | } | |
1237 | ||
1238 | /* | |
1239 | * Extract hostname string and resolve using kernel DNS facility. | |
1240 | */ | |
1241 | #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER | |
1242 | static int ceph_dns_resolve_name(const char *name, size_t namelen, | |
cede185b | 1243 | struct ceph_entity_addr *addr, char delim, const char **ipend) |
ee3b56f2 NW |
1244 | { |
1245 | const char *end, *delim_p; | |
1246 | char *colon_p, *ip_addr = NULL; | |
1247 | int ip_len, ret; | |
1248 | ||
1249 | /* | |
1250 | * The end of the hostname occurs immediately preceding the delimiter or | |
1251 | * the port marker (':') where the delimiter takes precedence. | |
1252 | */ | |
1253 | delim_p = memchr(name, delim, namelen); | |
1254 | colon_p = memchr(name, ':', namelen); | |
1255 | ||
1256 | if (delim_p && colon_p) | |
1257 | end = delim_p < colon_p ? delim_p : colon_p; | |
1258 | else if (!delim_p && colon_p) | |
1259 | end = colon_p; | |
1260 | else { | |
1261 | end = delim_p; | |
1262 | if (!end) /* case: hostname:/ */ | |
1263 | end = name + namelen; | |
1264 | } | |
1265 | ||
1266 | if (end <= name) | |
1267 | return -EINVAL; | |
1268 | ||
1269 | /* do dns_resolve upcall */ | |
a58946c1 DH |
1270 | ip_len = dns_query(current->nsproxy->net_ns, |
1271 | NULL, name, end - name, NULL, &ip_addr, NULL, false); | |
ee3b56f2 | 1272 | if (ip_len > 0) |
cede185b | 1273 | ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); |
ee3b56f2 NW |
1274 | else |
1275 | ret = -ESRCH; | |
1276 | ||
1277 | kfree(ip_addr); | |
1278 | ||
1279 | *ipend = end; | |
1280 | ||
1281 | pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, | |
b726ec97 | 1282 | ret, ret ? "failed" : ceph_pr_addr(addr)); |
ee3b56f2 NW |
1283 | |
1284 | return ret; | |
1285 | } | |
1286 | #else | |
1287 | static inline int ceph_dns_resolve_name(const char *name, size_t namelen, | |
cede185b | 1288 | struct ceph_entity_addr *addr, char delim, const char **ipend) |
ee3b56f2 NW |
1289 | { |
1290 | return -EINVAL; | |
1291 | } | |
1292 | #endif | |
1293 | ||
1294 | /* | |
1295 | * Parse a server name (IP or hostname). If a valid IP address is not found | |
1296 | * then try to extract a hostname to resolve using userspace DNS upcall. | |
1297 | */ | |
1298 | static int ceph_parse_server_name(const char *name, size_t namelen, | |
cede185b | 1299 | struct ceph_entity_addr *addr, char delim, const char **ipend) |
ee3b56f2 NW |
1300 | { |
1301 | int ret; | |
1302 | ||
cede185b | 1303 | ret = ceph_pton(name, namelen, addr, delim, ipend); |
ee3b56f2 | 1304 | if (ret) |
cede185b | 1305 | ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); |
ee3b56f2 NW |
1306 | |
1307 | return ret; | |
1308 | } | |
1309 | ||
31b8006e SW |
1310 | /* |
1311 | * Parse an ip[:port] list into an addr array. Use the default | |
1312 | * monitor port if a port isn't specified. | |
1313 | */ | |
1314 | int ceph_parse_ips(const char *c, const char *end, | |
1315 | struct ceph_entity_addr *addr, | |
2d7c86a8 | 1316 | int max_count, int *count, char delim) |
31b8006e | 1317 | { |
ee3b56f2 | 1318 | int i, ret = -EINVAL; |
31b8006e SW |
1319 | const char *p = c; |
1320 | ||
1321 | dout("parse_ips on '%.*s'\n", (int)(end-c), c); | |
1322 | for (i = 0; i < max_count; i++) { | |
2d7c86a8 | 1323 | char cur_delim = delim; |
31b8006e | 1324 | const char *ipend; |
31b8006e | 1325 | int port; |
39139f64 SW |
1326 | |
1327 | if (*p == '[') { | |
2d7c86a8 | 1328 | cur_delim = ']'; |
39139f64 SW |
1329 | p++; |
1330 | } | |
31b8006e | 1331 | |
2d7c86a8 VS |
1332 | ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, |
1333 | &ipend); | |
ee3b56f2 | 1334 | if (ret) |
31b8006e | 1335 | goto bad; |
ee3b56f2 NW |
1336 | ret = -EINVAL; |
1337 | ||
31b8006e SW |
1338 | p = ipend; |
1339 | ||
2d7c86a8 | 1340 | if (cur_delim == ']') { |
39139f64 SW |
1341 | if (*p != ']') { |
1342 | dout("missing matching ']'\n"); | |
1343 | goto bad; | |
1344 | } | |
1345 | p++; | |
1346 | } | |
1347 | ||
31b8006e SW |
1348 | /* port? */ |
1349 | if (p < end && *p == ':') { | |
1350 | port = 0; | |
1351 | p++; | |
1352 | while (p < end && *p >= '0' && *p <= '9') { | |
1353 | port = (port * 10) + (*p - '0'); | |
1354 | p++; | |
1355 | } | |
f48db1e9 ID |
1356 | if (port == 0) |
1357 | port = CEPH_MON_PORT; | |
1358 | else if (port > 65535) | |
31b8006e SW |
1359 | goto bad; |
1360 | } else { | |
1361 | port = CEPH_MON_PORT; | |
1362 | } | |
1363 | ||
6503e0b6 | 1364 | ceph_addr_set_port(&addr[i], port); |
cd1a677c ID |
1365 | /* |
1366 | * We want the type to be set according to ms_mode | |
1367 | * option, but options are normally parsed after mon | |
1368 | * addresses. Rather than complicating parsing, set | |
1369 | * to LEGACY and override in build_initial_monmap() | |
1370 | * for mon addresses and ceph_messenger_init() for | |
1371 | * ip option. | |
1372 | */ | |
d3c3c0a8 | 1373 | addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; |
cd1a677c | 1374 | addr[i].nonce = 0; |
31b8006e | 1375 | |
2d7c86a8 | 1376 | dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); |
31b8006e SW |
1377 | |
1378 | if (p == end) | |
1379 | break; | |
2d7c86a8 | 1380 | if (*p != delim) |
31b8006e SW |
1381 | goto bad; |
1382 | p++; | |
1383 | } | |
1384 | ||
1385 | if (p != end) | |
1386 | goto bad; | |
1387 | ||
1388 | if (count) | |
1389 | *count = i + 1; | |
1390 | return 0; | |
1391 | ||
1392 | bad: | |
ee3b56f2 | 1393 | return ret; |
31b8006e SW |
1394 | } |
1395 | ||
31b8006e SW |
1396 | /* |
1397 | * Process message. This happens in the worker thread. The callback should | |
1398 | * be careful not to do anything that waits on other incoming messages or it | |
1399 | * may deadlock. | |
1400 | */ | |
6503e0b6 | 1401 | void ceph_con_process_message(struct ceph_connection *con) |
31b8006e | 1402 | { |
583d0fef | 1403 | struct ceph_msg *msg = con->in_msg; |
31b8006e | 1404 | |
38941f80 | 1405 | BUG_ON(con->in_msg->con != con); |
31b8006e SW |
1406 | con->in_msg = NULL; |
1407 | ||
1408 | /* if first message, set peer_name */ | |
1409 | if (con->peer_name.type == 0) | |
dbad185d | 1410 | con->peer_name = msg->hdr.src; |
31b8006e | 1411 | |
31b8006e | 1412 | con->in_seq++; |
ec302645 | 1413 | mutex_unlock(&con->mutex); |
31b8006e | 1414 | |
b77f8f0e | 1415 | dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", |
31b8006e | 1416 | msg, le64_to_cpu(msg->hdr.seq), |
dbad185d | 1417 | ENTITY_NAME(msg->hdr.src), |
31b8006e SW |
1418 | le16_to_cpu(msg->hdr.type), |
1419 | ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), | |
1420 | le32_to_cpu(msg->hdr.front_len), | |
b77f8f0e | 1421 | le32_to_cpu(msg->hdr.middle_len), |
31b8006e SW |
1422 | le32_to_cpu(msg->hdr.data_len), |
1423 | con->in_front_crc, con->in_middle_crc, con->in_data_crc); | |
1424 | con->ops->dispatch(con, msg); | |
ec302645 SW |
1425 | |
1426 | mutex_lock(&con->mutex); | |
31b8006e SW |
1427 | } |
1428 | ||
31b8006e | 1429 | /* |
802c6d96 AE |
1430 | * Atomically queue work on a connection after the specified delay. |
1431 | * Bump @con reference to avoid races with connection teardown. | |
1432 | * Returns 0 if work was queued, or an error code otherwise. | |
31b8006e | 1433 | */ |
802c6d96 | 1434 | static int queue_con_delay(struct ceph_connection *con, unsigned long delay) |
31b8006e | 1435 | { |
31b8006e | 1436 | if (!con->ops->get(con)) { |
802c6d96 | 1437 | dout("%s %p ref count 0\n", __func__, con); |
802c6d96 | 1438 | return -ENOENT; |
31b8006e SW |
1439 | } |
1440 | ||
418af5b3 ID |
1441 | if (delay >= HZ) |
1442 | delay = round_jiffies_relative(delay); | |
1443 | ||
5a5036c8 | 1444 | dout("%s %p %lu\n", __func__, con, delay); |
802c6d96 AE |
1445 | if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { |
1446 | dout("%s %p - already queued\n", __func__, con); | |
31b8006e | 1447 | con->ops->put(con); |
802c6d96 | 1448 | return -EBUSY; |
31b8006e | 1449 | } |
802c6d96 | 1450 | |
802c6d96 AE |
1451 | return 0; |
1452 | } | |
1453 | ||
/* Queue work on @con with no delay; failures are deliberately ignored. */
static void queue_con(struct ceph_connection *con)
{
	(void)queue_con_delay(con, 0);
}
1458 | ||
37ab77ac ID |
1459 | static void cancel_con(struct ceph_connection *con) |
1460 | { | |
1461 | if (cancel_delayed_work(&con->work)) { | |
1462 | dout("%s %p\n", __func__, con); | |
1463 | con->ops->put(con); | |
1464 | } | |
1465 | } | |
1466 | ||
7bb21d68 AE |
1467 | static bool con_sock_closed(struct ceph_connection *con) |
1468 | { | |
6503e0b6 | 1469 | if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) |
7bb21d68 AE |
1470 | return false; |
1471 | ||
1472 | #define CASE(x) \ | |
6d7f62bf | 1473 | case CEPH_CON_S_ ## x: \ |
7bb21d68 AE |
1474 | con->error_msg = "socket closed (con state " #x ")"; \ |
1475 | break; | |
1476 | ||
1477 | switch (con->state) { | |
1478 | CASE(CLOSED); | |
1479 | CASE(PREOPEN); | |
6d7f62bf ID |
1480 | CASE(V1_BANNER); |
1481 | CASE(V1_CONNECT_MSG); | |
cd1a677c ID |
1482 | CASE(V2_BANNER_PREFIX); |
1483 | CASE(V2_BANNER_PAYLOAD); | |
1484 | CASE(V2_HELLO); | |
1485 | CASE(V2_AUTH); | |
1486 | CASE(V2_AUTH_SIGNATURE); | |
1487 | CASE(V2_SESSION_CONNECT); | |
1488 | CASE(V2_SESSION_RECONNECT); | |
7bb21d68 AE |
1489 | CASE(OPEN); |
1490 | CASE(STANDBY); | |
1491 | default: | |
7bb21d68 | 1492 | BUG(); |
7bb21d68 AE |
1493 | } |
1494 | #undef CASE | |
1495 | ||
1496 | return true; | |
1497 | } | |
1498 | ||
f20a39fd AE |
1499 | static bool con_backoff(struct ceph_connection *con) |
1500 | { | |
1501 | int ret; | |
1502 | ||
6503e0b6 | 1503 | if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) |
f20a39fd AE |
1504 | return false; |
1505 | ||
418af5b3 | 1506 | ret = queue_con_delay(con, con->delay); |
f20a39fd AE |
1507 | if (ret) { |
1508 | dout("%s: con %p FAILED to back off %lu\n", __func__, | |
1509 | con, con->delay); | |
1510 | BUG_ON(ret == -ENOENT); | |
6503e0b6 | 1511 | ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); |
f20a39fd AE |
1512 | } |
1513 | ||
1514 | return true; | |
1515 | } | |
1516 | ||
93209264 AE |
1517 | /* Finish fault handling; con->mutex must *not* be held here */ |
1518 | ||
1519 | static void con_fault_finish(struct ceph_connection *con) | |
1520 | { | |
f6330cc1 ID |
1521 | dout("%s %p\n", __func__, con); |
1522 | ||
93209264 AE |
1523 | /* |
1524 | * in case we faulted due to authentication, invalidate our | |
1525 | * current tickets so that we can get new ones. | |
1526 | */ | |
a56dd9bf ID |
1527 | if (con->v1.auth_retry) { |
1528 | dout("auth_retry %d, invalidating\n", con->v1.auth_retry); | |
f6330cc1 ID |
1529 | if (con->ops->invalidate_authorizer) |
1530 | con->ops->invalidate_authorizer(con); | |
a56dd9bf | 1531 | con->v1.auth_retry = 0; |
93209264 AE |
1532 | } |
1533 | ||
1534 | if (con->ops->fault) | |
1535 | con->ops->fault(con); | |
1536 | } | |
1537 | ||
31b8006e SW |
1538 | /* |
1539 | * Do some work on a connection. Drop a connection ref when we're done. | |
1540 | */ | |
68931622 | 1541 | static void ceph_con_workfn(struct work_struct *work) |
31b8006e SW |
1542 | { |
1543 | struct ceph_connection *con = container_of(work, struct ceph_connection, | |
1544 | work.work); | |
49659416 | 1545 | bool fault; |
31b8006e | 1546 | |
9dd4658d | 1547 | mutex_lock(&con->mutex); |
49659416 AE |
1548 | while (true) { |
1549 | int ret; | |
31b8006e | 1550 | |
49659416 AE |
1551 | if ((fault = con_sock_closed(con))) { |
1552 | dout("%s: con %p SOCK_CLOSED\n", __func__, con); | |
1553 | break; | |
1554 | } | |
1555 | if (con_backoff(con)) { | |
1556 | dout("%s: con %p BACKOFF\n", __func__, con); | |
1557 | break; | |
1558 | } | |
6d7f62bf | 1559 | if (con->state == CEPH_CON_S_STANDBY) { |
49659416 AE |
1560 | dout("%s: con %p STANDBY\n", __func__, con); |
1561 | break; | |
1562 | } | |
6d7f62bf | 1563 | if (con->state == CEPH_CON_S_CLOSED) { |
49659416 AE |
1564 | dout("%s: con %p CLOSED\n", __func__, con); |
1565 | BUG_ON(con->sock); | |
1566 | break; | |
1567 | } | |
6d7f62bf | 1568 | if (con->state == CEPH_CON_S_PREOPEN) { |
49659416 AE |
1569 | dout("%s: con %p PREOPEN\n", __func__, con); |
1570 | BUG_ON(con->sock); | |
1571 | } | |
0da5d703 | 1572 | |
cd1a677c ID |
1573 | if (ceph_msgr2(from_msgr(con->msgr))) |
1574 | ret = ceph_con_v2_try_read(con); | |
1575 | else | |
1576 | ret = ceph_con_v1_try_read(con); | |
49659416 AE |
1577 | if (ret < 0) { |
1578 | if (ret == -EAGAIN) | |
1579 | continue; | |
67c64eb7 ID |
1580 | if (!con->error_msg) |
1581 | con->error_msg = "socket error on read"; | |
49659416 AE |
1582 | fault = true; |
1583 | break; | |
1584 | } | |
1585 | ||
cd1a677c ID |
1586 | if (ceph_msgr2(from_msgr(con->msgr))) |
1587 | ret = ceph_con_v2_try_write(con); | |
1588 | else | |
1589 | ret = ceph_con_v1_try_write(con); | |
49659416 AE |
1590 | if (ret < 0) { |
1591 | if (ret == -EAGAIN) | |
1592 | continue; | |
67c64eb7 ID |
1593 | if (!con->error_msg) |
1594 | con->error_msg = "socket error on write"; | |
49659416 AE |
1595 | fault = true; |
1596 | } | |
1597 | ||
1598 | break; /* If we make it to here, we're done */ | |
3a140a0d | 1599 | } |
b6e7b6a1 AE |
1600 | if (fault) |
1601 | con_fault(con); | |
9dd4658d | 1602 | mutex_unlock(&con->mutex); |
0da5d703 | 1603 | |
b6e7b6a1 AE |
1604 | if (fault) |
1605 | con_fault_finish(con); | |
1606 | ||
1607 | con->ops->put(con); | |
31b8006e SW |
1608 | } |
1609 | ||
31b8006e SW |
1610 | /* |
1611 | * Generic error/fault handler. A retry mechanism is used with | |
1612 | * exponential backoff | |
1613 | */ | |
93209264 | 1614 | static void con_fault(struct ceph_connection *con) |
31b8006e | 1615 | { |
30be780a | 1616 | dout("fault %p state %d to peer %s\n", |
b726ec97 | 1617 | con, con->state, ceph_pr_addr(&con->peer_addr)); |
31b8006e | 1618 | |
67c64eb7 | 1619 | pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), |
b726ec97 | 1620 | ceph_pr_addr(&con->peer_addr), con->error_msg); |
67c64eb7 ID |
1621 | con->error_msg = NULL; |
1622 | ||
cd1a677c ID |
1623 | WARN_ON(con->state == CEPH_CON_S_STANDBY || |
1624 | con->state == CEPH_CON_S_CLOSED); | |
ec302645 | 1625 | |
3596f4c1 | 1626 | ceph_con_reset_protocol(con); |
5e095e8b | 1627 | |
6503e0b6 | 1628 | if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { |
8dacc7da | 1629 | dout("fault on LOSSYTX channel, marking CLOSED\n"); |
6d7f62bf | 1630 | con->state = CEPH_CON_S_CLOSED; |
93209264 | 1631 | return; |
3b5ede07 SW |
1632 | } |
1633 | ||
e80a52d1 SW |
1634 | /* Requeue anything that hasn't been acked */ |
1635 | list_splice_init(&con->out_sent, &con->out_queue); | |
9bd2e6f8 | 1636 | |
e76661d0 SW |
1637 | /* If there are no messages queued or keepalive pending, place |
1638 | * the connection in a STANDBY state */ | |
1639 | if (list_empty(&con->out_queue) && | |
6503e0b6 | 1640 | !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { |
e00de341 | 1641 | dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); |
6503e0b6 | 1642 | ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); |
6d7f62bf | 1643 | con->state = CEPH_CON_S_STANDBY; |
e80a52d1 SW |
1644 | } else { |
1645 | /* retry after a delay. */ | |
6d7f62bf | 1646 | con->state = CEPH_CON_S_PREOPEN; |
418af5b3 | 1647 | if (!con->delay) { |
e80a52d1 | 1648 | con->delay = BASE_DELAY_INTERVAL; |
418af5b3 | 1649 | } else if (con->delay < MAX_DELAY_INTERVAL) { |
e80a52d1 | 1650 | con->delay *= 2; |
418af5b3 ID |
1651 | if (con->delay > MAX_DELAY_INTERVAL) |
1652 | con->delay = MAX_DELAY_INTERVAL; | |
1653 | } | |
6503e0b6 | 1654 | ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); |
8618e30b | 1655 | queue_con(con); |
31b8006e | 1656 | } |
31b8006e SW |
1657 | } |
1658 | ||
120a75ea YZ |
1659 | void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) |
1660 | { | |
1661 | u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; | |
1662 | msgr->inst.addr.nonce = cpu_to_le32(nonce); | |
6503e0b6 | 1663 | ceph_encode_my_addr(msgr); |
120a75ea | 1664 | } |
31b8006e SW |
1665 | |
1666 | /* | |
15d9882c | 1667 | * initialize a new messenger instance |
31b8006e | 1668 | */ |
15d9882c | 1669 | void ceph_messenger_init(struct ceph_messenger *msgr, |
859bff51 | 1670 | struct ceph_entity_addr *myaddr) |
31b8006e | 1671 | { |
31b8006e SW |
1672 | spin_lock_init(&msgr->global_seq_lock); |
1673 | ||
fd1a154c ID |
1674 | if (myaddr) { |
1675 | memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, | |
1676 | sizeof(msgr->inst.addr.in_addr)); | |
6503e0b6 | 1677 | ceph_addr_set_port(&msgr->inst.addr, 0); |
fd1a154c | 1678 | } |
31b8006e | 1679 | |
cd1a677c ID |
1680 | /* |
1681 | * Since nautilus, clients are identified using type ANY. | |
1682 | * For msgr1, ceph_encode_banner_addr() munges it to NONE. | |
1683 | */ | |
1684 | msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; | |
fd1a154c ID |
1685 | |
1686 | /* generate a random non-zero nonce */ | |
1687 | do { | |
1688 | get_random_bytes(&msgr->inst.addr.nonce, | |
1689 | sizeof(msgr->inst.addr.nonce)); | |
1690 | } while (!msgr->inst.addr.nonce); | |
6503e0b6 | 1691 | ceph_encode_my_addr(msgr); |
31b8006e | 1692 | |
a2a32584 | 1693 | atomic_set(&msgr->stopping, 0); |
757856d2 | 1694 | write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); |
31b8006e | 1695 | |
15d9882c | 1696 | dout("%s %p\n", __func__, msgr); |
31b8006e SW |
1697 | } |
1698 | ||
757856d2 ID |
1699 | void ceph_messenger_fini(struct ceph_messenger *msgr) |
1700 | { | |
1701 | put_net(read_pnet(&msgr->net)); | |
1702 | } | |
757856d2 | 1703 | |
583d0fef ID |
1704 | static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) |
1705 | { | |
1706 | if (msg->con) | |
1707 | msg->con->ops->put(msg->con); | |
1708 | ||
1709 | msg->con = con ? con->ops->get(con) : NULL; | |
1710 | BUG_ON(msg->con != con); | |
1711 | } | |
1712 | ||
e00de341 SW |
1713 | static void clear_standby(struct ceph_connection *con) |
1714 | { | |
1715 | /* come back from STANDBY? */ | |
6d7f62bf | 1716 | if (con->state == CEPH_CON_S_STANDBY) { |
e00de341 | 1717 | dout("clear_standby %p and ++connect_seq\n", con); |
6d7f62bf | 1718 | con->state = CEPH_CON_S_PREOPEN; |
a56dd9bf | 1719 | con->v1.connect_seq++; |
6503e0b6 ID |
1720 | WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); |
1721 | WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); | |
e00de341 SW |
1722 | } |
1723 | } | |
1724 | ||
31b8006e SW |
1725 | /* |
1726 | * Queue up an outgoing message on the given connection. | |
771294fe ID |
1727 | * |
1728 | * Consumes a ref on @msg. | |
31b8006e SW |
1729 | */ |
1730 | void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) | |
1731 | { | |
31b8006e | 1732 | /* set src+dst */ |
dbad185d | 1733 | msg->hdr.src = con->msgr->inst.name; |
3ca02ef9 | 1734 | BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); |
e84346b7 SW |
1735 | msg->needs_out_seq = true; |
1736 | ||
ec302645 | 1737 | mutex_lock(&con->mutex); |
92ce034b | 1738 | |
6d7f62bf | 1739 | if (con->state == CEPH_CON_S_CLOSED) { |
a59b55a6 SW |
1740 | dout("con_send %p closed, dropping %p\n", con, msg); |
1741 | ceph_msg_put(msg); | |
1742 | mutex_unlock(&con->mutex); | |
1743 | return; | |
1744 | } | |
1745 | ||
583d0fef | 1746 | msg_con_set(msg, con); |
92ce034b | 1747 | |
31b8006e SW |
1748 | BUG_ON(!list_empty(&msg->list_head)); |
1749 | list_add_tail(&msg->list_head, &con->out_queue); | |
1750 | dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, | |
1751 | ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), | |
1752 | ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), | |
1753 | le32_to_cpu(msg->hdr.front_len), | |
1754 | le32_to_cpu(msg->hdr.middle_len), | |
1755 | le32_to_cpu(msg->hdr.data_len)); | |
00650931 SW |
1756 | |
1757 | clear_standby(con); | |
ec302645 | 1758 | mutex_unlock(&con->mutex); |
31b8006e SW |
1759 | |
1760 | /* if there wasn't anything waiting to send before, queue | |
1761 | * new work */ | |
6503e0b6 | 1762 | if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) |
31b8006e SW |
1763 | queue_con(con); |
1764 | } | |
3d14c5d2 | 1765 | EXPORT_SYMBOL(ceph_con_send); |
31b8006e SW |
1766 | |
1767 | /* | |
1768 | * Revoke a message that was previously queued for send | |
1769 | */ | |
6740a845 | 1770 | void ceph_msg_revoke(struct ceph_msg *msg) |
31b8006e | 1771 | { |
6740a845 AE |
1772 | struct ceph_connection *con = msg->con; |
1773 | ||
583d0fef ID |
1774 | if (!con) { |
1775 | dout("%s msg %p null con\n", __func__, msg); | |
6740a845 | 1776 | return; /* Message not in our possession */ |
583d0fef | 1777 | } |
6740a845 | 1778 | |
ec302645 | 1779 | mutex_lock(&con->mutex); |
566050e1 ID |
1780 | if (list_empty(&msg->list_head)) { |
1781 | WARN_ON(con->out_msg == msg); | |
1782 | dout("%s con %p msg %p not linked\n", __func__, con, msg); | |
1783 | mutex_unlock(&con->mutex); | |
1784 | return; | |
ed98adad | 1785 | } |
67645d76 | 1786 | |
566050e1 ID |
1787 | dout("%s con %p msg %p was linked\n", __func__, con, msg); |
1788 | msg->hdr.seq = 0; | |
1789 | ceph_msg_remove(msg); | |
1790 | ||
1791 | if (con->out_msg == msg) { | |
1792 | WARN_ON(con->state != CEPH_CON_S_OPEN); | |
1793 | dout("%s con %p msg %p was sending\n", __func__, con, msg); | |
cd1a677c ID |
1794 | if (ceph_msgr2(from_msgr(con->msgr))) |
1795 | ceph_con_v2_revoke(con); | |
1796 | else | |
1797 | ceph_con_v1_revoke(con); | |
566050e1 | 1798 | ceph_msg_put(con->out_msg); |
67645d76 | 1799 | con->out_msg = NULL; |
566050e1 ID |
1800 | } else { |
1801 | dout("%s con %p msg %p not current, out_msg %p\n", __func__, | |
1802 | con, msg, con->out_msg); | |
31b8006e | 1803 | } |
ec302645 | 1804 | mutex_unlock(&con->mutex); |
31b8006e SW |
1805 | } |
1806 | ||
350b1c32 | 1807 | /* |
0d59ab81 | 1808 | * Revoke a message that we may be reading data into |
350b1c32 | 1809 | */ |
8921d114 | 1810 | void ceph_msg_revoke_incoming(struct ceph_msg *msg) |
350b1c32 | 1811 | { |
583d0fef | 1812 | struct ceph_connection *con = msg->con; |
8921d114 | 1813 | |
583d0fef | 1814 | if (!con) { |
8921d114 | 1815 | dout("%s msg %p null con\n", __func__, msg); |
8921d114 AE |
1816 | return; /* Message not in our possession */ |
1817 | } | |
1818 | ||
350b1c32 | 1819 | mutex_lock(&con->mutex); |
8921d114 | 1820 | if (con->in_msg == msg) { |
566050e1 ID |
1821 | WARN_ON(con->state != CEPH_CON_S_OPEN); |
1822 | dout("%s con %p msg %p was recving\n", __func__, con, msg); | |
cd1a677c ID |
1823 | if (ceph_msgr2(from_msgr(con->msgr))) |
1824 | ceph_con_v2_revoke_incoming(con); | |
1825 | else | |
1826 | ceph_con_v1_revoke_incoming(con); | |
350b1c32 SW |
1827 | ceph_msg_put(con->in_msg); |
1828 | con->in_msg = NULL; | |
350b1c32 | 1829 | } else { |
566050e1 ID |
1830 | dout("%s con %p msg %p not current, in_msg %p\n", __func__, |
1831 | con, msg, con->in_msg); | |
350b1c32 SW |
1832 | } |
1833 | mutex_unlock(&con->mutex); | |
1834 | } | |
1835 | ||
31b8006e SW |
1836 | /* |
1837 | * Queue a keepalive byte to ensure the tcp connection is alive. | |
1838 | */ | |
1839 | void ceph_con_keepalive(struct ceph_connection *con) | |
1840 | { | |
e00de341 | 1841 | dout("con_keepalive %p\n", con); |
00650931 | 1842 | mutex_lock(&con->mutex); |
e00de341 | 1843 | clear_standby(con); |
6503e0b6 | 1844 | ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); |
00650931 | 1845 | mutex_unlock(&con->mutex); |
4aac9228 | 1846 | |
6503e0b6 | 1847 | if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) |
31b8006e SW |
1848 | queue_con(con); |
1849 | } | |
3d14c5d2 | 1850 | EXPORT_SYMBOL(ceph_con_keepalive); |
31b8006e | 1851 | |
8b9558aa YZ |
1852 | bool ceph_con_keepalive_expired(struct ceph_connection *con, |
1853 | unsigned long interval) | |
1854 | { | |
1855 | if (interval > 0 && | |
1856 | (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { | |
473bd2d7 AB |
1857 | struct timespec64 now; |
1858 | struct timespec64 ts; | |
1859 | ktime_get_real_ts64(&now); | |
1860 | jiffies_to_timespec64(interval, &ts); | |
1861 | ts = timespec64_add(con->last_keepalive_ack, ts); | |
1862 | return timespec64_compare(&now, &ts) >= 0; | |
8b9558aa YZ |
1863 | } |
1864 | return false; | |
1865 | } | |
1866 | ||
0d9c1ab3 | 1867 | static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) |
43794509 | 1868 | { |
0d9c1ab3 ID |
1869 | BUG_ON(msg->num_data_items >= msg->max_data_items); |
1870 | return &msg->data[msg->num_data_items++]; | |
6644ed7b AE |
1871 | } |
1872 | ||
1873 | static void ceph_msg_data_destroy(struct ceph_msg_data *data) | |
1874 | { | |
e8862740 ID |
1875 | if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { |
1876 | int num_pages = calc_pages_for(data->alignment, data->length); | |
1877 | ceph_release_page_vector(data->pages, num_pages); | |
1878 | } else if (data->type == CEPH_MSG_DATA_PAGELIST) { | |
6644ed7b | 1879 | ceph_pagelist_release(data->pagelist); |
e8862740 | 1880 | } |
43794509 AE |
1881 | } |
1882 | ||
90af3602 | 1883 | void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, |
e8862740 | 1884 | size_t length, size_t alignment, bool own_pages) |
02afca6c | 1885 | { |
6644ed7b AE |
1886 | struct ceph_msg_data *data; |
1887 | ||
07aa1558 AE |
1888 | BUG_ON(!pages); |
1889 | BUG_ON(!length); | |
6644ed7b | 1890 | |
0d9c1ab3 ID |
1891 | data = ceph_msg_data_add(msg); |
1892 | data->type = CEPH_MSG_DATA_PAGES; | |
6644ed7b AE |
1893 | data->pages = pages; |
1894 | data->length = length; | |
1895 | data->alignment = alignment & ~PAGE_MASK; | |
e8862740 | 1896 | data->own_pages = own_pages; |
02afca6c | 1897 | |
5240d9f9 | 1898 | msg->data_length += length; |
02afca6c | 1899 | } |
90af3602 | 1900 | EXPORT_SYMBOL(ceph_msg_data_add_pages); |
31b8006e | 1901 | |
90af3602 | 1902 | void ceph_msg_data_add_pagelist(struct ceph_msg *msg, |
27fa8385 AE |
1903 | struct ceph_pagelist *pagelist) |
1904 | { | |
6644ed7b AE |
1905 | struct ceph_msg_data *data; |
1906 | ||
07aa1558 AE |
1907 | BUG_ON(!pagelist); |
1908 | BUG_ON(!pagelist->length); | |
27fa8385 | 1909 | |
0d9c1ab3 ID |
1910 | data = ceph_msg_data_add(msg); |
1911 | data->type = CEPH_MSG_DATA_PAGELIST; | |
89486833 | 1912 | refcount_inc(&pagelist->refcnt); |
6644ed7b AE |
1913 | data->pagelist = pagelist; |
1914 | ||
5240d9f9 | 1915 | msg->data_length += pagelist->length; |
27fa8385 | 1916 | } |
90af3602 | 1917 | EXPORT_SYMBOL(ceph_msg_data_add_pagelist); |
27fa8385 | 1918 | |
#ifdef CONFIG_BLOCK
/*
 * Attach a bio position to @msg as a new data item covering @length
 * bytes.  The iterator is copied by value.
 */
void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
			   u32 length)
{
	struct ceph_msg_data *item = ceph_msg_data_add(msg);

	item->type = CEPH_MSG_DATA_BIO;
	item->bio_length = length;
	item->bio_pos = *bio_pos;

	msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
#endif	/* CONFIG_BLOCK */
27fa8385 | 1934 | |
b9e281c2 ID |
1935 | void ceph_msg_data_add_bvecs(struct ceph_msg *msg, |
1936 | struct ceph_bvec_iter *bvec_pos) | |
1937 | { | |
1938 | struct ceph_msg_data *data; | |
1939 | ||
0d9c1ab3 ID |
1940 | data = ceph_msg_data_add(msg); |
1941 | data->type = CEPH_MSG_DATA_BVECS; | |
b9e281c2 ID |
1942 | data->bvec_pos = *bvec_pos; |
1943 | ||
b9e281c2 ID |
1944 | msg->data_length += bvec_pos->iter.bi_size; |
1945 | } | |
1946 | EXPORT_SYMBOL(ceph_msg_data_add_bvecs); | |
1947 | ||
dee0c5f8 JL |
1948 | void ceph_msg_data_add_iter(struct ceph_msg *msg, |
1949 | struct iov_iter *iter) | |
1950 | { | |
1951 | struct ceph_msg_data *data; | |
1952 | ||
1953 | data = ceph_msg_data_add(msg); | |
1954 | data->type = CEPH_MSG_DATA_ITER; | |
1955 | data->iter = *iter; | |
1956 | ||
1957 | msg->data_length += iov_iter_count(&data->iter); | |
1958 | } | |
1959 | ||
31b8006e SW |
1960 | /* |
1961 | * construct a new message with given type, size | |
1962 | * the new msg has a ref count of 1. | |
1963 | */ | |
0d9c1ab3 ID |
1964 | struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, |
1965 | gfp_t flags, bool can_fail) | |
31b8006e SW |
1966 | { |
1967 | struct ceph_msg *m; | |
1968 | ||
e3d5d638 | 1969 | m = kmem_cache_zalloc(ceph_msg_cache, flags); |
31b8006e SW |
1970 | if (m == NULL) |
1971 | goto out; | |
31b8006e SW |
1972 | |
1973 | m->hdr.type = cpu_to_le16(type); | |
45c6ceb5 | 1974 | m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); |
31b8006e | 1975 | m->hdr.front_len = cpu_to_le32(front_len); |
ca20892d | 1976 | |
9516e45b AE |
1977 | INIT_LIST_HEAD(&m->list_head); |
1978 | kref_init(&m->kref); | |
ca20892d | 1979 | |
31b8006e SW |
1980 | /* front */ |
1981 | if (front_len) { | |
a421ef30 | 1982 | m->front.iov_base = kvmalloc(front_len, flags); |
31b8006e | 1983 | if (m->front.iov_base == NULL) { |
b61c2763 | 1984 | dout("ceph_msg_new can't allocate %d bytes\n", |
31b8006e SW |
1985 | front_len); |
1986 | goto out2; | |
1987 | } | |
1988 | } else { | |
1989 | m->front.iov_base = NULL; | |
1990 | } | |
f2be82b0 | 1991 | m->front_alloc_len = m->front.iov_len = front_len; |
31b8006e | 1992 | |
0d9c1ab3 ID |
1993 | if (max_data_items) { |
1994 | m->data = kmalloc_array(max_data_items, sizeof(*m->data), | |
1995 | flags); | |
1996 | if (!m->data) | |
1997 | goto out2; | |
1998 | ||
1999 | m->max_data_items = max_data_items; | |
2000 | } | |
2001 | ||
bb257664 | 2002 | dout("ceph_msg_new %p front %d\n", m, front_len); |
31b8006e SW |
2003 | return m; |
2004 | ||
2005 | out2: | |
2006 | ceph_msg_put(m); | |
2007 | out: | |
b61c2763 SW |
2008 | if (!can_fail) { |
2009 | pr_err("msg_new can't create type %d front %d\n", type, | |
2010 | front_len); | |
f0ed1b7c | 2011 | WARN_ON(1); |
b61c2763 SW |
2012 | } else { |
2013 | dout("msg_new can't create type %d front %d\n", type, | |
2014 | front_len); | |
2015 | } | |
a79832f2 | 2016 | return NULL; |
31b8006e | 2017 | } |
0d9c1ab3 ID |
2018 | EXPORT_SYMBOL(ceph_msg_new2); |
2019 | ||
2020 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |
2021 | bool can_fail) | |
2022 | { | |
2023 | return ceph_msg_new2(type, front_len, 0, flags, can_fail); | |
2024 | } | |
3d14c5d2 | 2025 | EXPORT_SYMBOL(ceph_msg_new); |
31b8006e | 2026 | |
31b8006e SW |
2027 | /* |
2028 | * Allocate "middle" portion of a message, if it is needed and wasn't | |
2029 | * allocated by alloc_msg. This allows us to read a small fixed-size | |
2030 | * per-type header in the front and then gracefully fail (i.e., | |
2031 | * propagate the error to the caller based on info in the front) when | |
2032 | * the middle is too large. | |
2033 | */ | |
2450418c | 2034 | static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) |
31b8006e SW |
2035 | { |
2036 | int type = le16_to_cpu(msg->hdr.type); | |
2037 | int middle_len = le32_to_cpu(msg->hdr.middle_len); | |
2038 | ||
2039 | dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, | |
2040 | ceph_msg_type_name(type), middle_len); | |
2041 | BUG_ON(!middle_len); | |
2042 | BUG_ON(msg->middle); | |
2043 | ||
b6c1d5b8 | 2044 | msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); |
31b8006e SW |
2045 | if (!msg->middle) |
2046 | return -ENOMEM; | |
2047 | return 0; | |
2048 | } | |
2049 | ||
2450418c | 2050 | /* |
1c20f2d2 AE |
2051 | * Allocate a message for receiving an incoming message on a |
2052 | * connection, and save the result in con->in_msg. Uses the | |
2053 | * connection's private alloc_msg op if available. | |
2054 | * | |
4740a623 SW |
2055 | * Returns 0 on success, or a negative error code. |
2056 | * | |
2057 | * On success, if we set *skip = 1: | |
2058 | * - the next message should be skipped and ignored. | |
2059 | * - con->in_msg == NULL | |
2060 | * or if we set *skip = 0: | |
2061 | * - con->in_msg is non-null. | |
2062 | * On error (ENOMEM, EAGAIN, ...), | |
2063 | * - con->in_msg == NULL | |
2450418c | 2064 | */ |
6503e0b6 ID |
2065 | int ceph_con_in_msg_alloc(struct ceph_connection *con, |
2066 | struct ceph_msg_header *hdr, int *skip) | |
2450418c | 2067 | { |
2450418c | 2068 | int middle_len = le32_to_cpu(hdr->middle_len); |
1d866d1c | 2069 | struct ceph_msg *msg; |
4740a623 | 2070 | int ret = 0; |
2450418c | 2071 | |
1c20f2d2 | 2072 | BUG_ON(con->in_msg != NULL); |
53ded495 | 2073 | BUG_ON(!con->ops->alloc_msg); |
2450418c | 2074 | |
53ded495 AE |
2075 | mutex_unlock(&con->mutex); |
2076 | msg = con->ops->alloc_msg(con, hdr, skip); | |
2077 | mutex_lock(&con->mutex); | |
6d7f62bf | 2078 | if (con->state != CEPH_CON_S_OPEN) { |
53ded495 | 2079 | if (msg) |
1d866d1c | 2080 | ceph_msg_put(msg); |
53ded495 AE |
2081 | return -EAGAIN; |
2082 | } | |
4137577a AE |
2083 | if (msg) { |
2084 | BUG_ON(*skip); | |
583d0fef | 2085 | msg_con_set(msg, con); |
4137577a | 2086 | con->in_msg = msg; |
4137577a AE |
2087 | } else { |
2088 | /* | |
2089 | * Null message pointer means either we should skip | |
2090 | * this message or we couldn't allocate memory. The | |
2091 | * former is not an error. | |
2092 | */ | |
2093 | if (*skip) | |
2094 | return 0; | |
4137577a | 2095 | |
67c64eb7 | 2096 | con->error_msg = "error allocating memory for incoming message"; |
53ded495 | 2097 | return -ENOMEM; |
2450418c | 2098 | } |
fc4c128e | 2099 | memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); |
2450418c | 2100 | |
1c20f2d2 AE |
2101 | if (middle_len && !con->in_msg->middle) { |
2102 | ret = ceph_alloc_middle(con, con->in_msg); | |
2450418c | 2103 | if (ret < 0) { |
1c20f2d2 AE |
2104 | ceph_msg_put(con->in_msg); |
2105 | con->in_msg = NULL; | |
2450418c YS |
2106 | } |
2107 | } | |
9d7f0f13 | 2108 | |
4740a623 | 2109 | return ret; |
2450418c YS |
2110 | } |
2111 | ||
6503e0b6 | 2112 | void ceph_con_get_out_msg(struct ceph_connection *con) |
771294fe ID |
2113 | { |
2114 | struct ceph_msg *msg; | |
2115 | ||
2116 | BUG_ON(list_empty(&con->out_queue)); | |
2117 | msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); | |
2118 | WARN_ON(msg->con != con); | |
2119 | ||
2120 | /* | |
2121 | * Put the message on "sent" list using a ref from ceph_con_send(). | |
2122 | * It is put when the message is acked or revoked. | |
2123 | */ | |
2124 | list_move_tail(&msg->list_head, &con->out_sent); | |
2125 | ||
2126 | /* | |
2127 | * Only assign outgoing seq # if we haven't sent this message | |
2128 | * yet. If it is requeued, resend with it's original seq. | |
2129 | */ | |
2130 | if (msg->needs_out_seq) { | |
2131 | msg->hdr.seq = cpu_to_le64(++con->out_seq); | |
2132 | msg->needs_out_seq = false; | |
2133 | ||
2134 | if (con->ops->reencode_message) | |
2135 | con->ops->reencode_message(msg); | |
2136 | } | |
2137 | ||
2138 | /* | |
2139 | * Get a ref for out_msg. It is put when we are done sending the | |
2140 | * message or in case of a fault. | |
2141 | */ | |
2142 | WARN_ON(con->out_msg); | |
2143 | con->out_msg = ceph_msg_get(msg); | |
2144 | } | |
31b8006e SW |
2145 | |
2146 | /* | |
2147 | * Free a generically kmalloc'd message. | |
2148 | */ | |
0215e44b | 2149 | static void ceph_msg_free(struct ceph_msg *m) |
31b8006e | 2150 | { |
0215e44b | 2151 | dout("%s %p\n", __func__, m); |
4965fc38 | 2152 | kvfree(m->front.iov_base); |
0d9c1ab3 | 2153 | kfree(m->data); |
e3d5d638 | 2154 | kmem_cache_free(ceph_msg_cache, m); |
31b8006e SW |
2155 | } |
2156 | ||
0215e44b | 2157 | static void ceph_msg_release(struct kref *kref) |
c2e552e7 SW |
2158 | { |
2159 | struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); | |
0d9c1ab3 | 2160 | int i; |
31b8006e | 2161 | |
0215e44b | 2162 | dout("%s %p\n", __func__, m); |
c2e552e7 SW |
2163 | WARN_ON(!list_empty(&m->list_head)); |
2164 | ||
583d0fef ID |
2165 | msg_con_set(m, NULL); |
2166 | ||
c2e552e7 SW |
2167 | /* drop middle, data, if any */ |
2168 | if (m->middle) { | |
2169 | ceph_buffer_put(m->middle); | |
2170 | m->middle = NULL; | |
31b8006e | 2171 | } |
5240d9f9 | 2172 | |
0d9c1ab3 ID |
2173 | for (i = 0; i < m->num_data_items; i++) |
2174 | ceph_msg_data_destroy(&m->data[i]); | |
58bb3b37 | 2175 | |
c2e552e7 SW |
2176 | if (m->pool) |
2177 | ceph_msgpool_put(m->pool, m); | |
2178 | else | |
0215e44b ID |
2179 | ceph_msg_free(m); |
2180 | } | |
2181 | ||
2182 | struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) | |
2183 | { | |
2184 | dout("%s %p (was %d)\n", __func__, msg, | |
2c935bc5 | 2185 | kref_read(&msg->kref)); |
0215e44b ID |
2186 | kref_get(&msg->kref); |
2187 | return msg; | |
2188 | } | |
2189 | EXPORT_SYMBOL(ceph_msg_get); | |
2190 | ||
2191 | void ceph_msg_put(struct ceph_msg *msg) | |
2192 | { | |
2193 | dout("%s %p (was %d)\n", __func__, msg, | |
2c935bc5 | 2194 | kref_read(&msg->kref)); |
0215e44b | 2195 | kref_put(&msg->kref, ceph_msg_release); |
31b8006e | 2196 | } |
0215e44b | 2197 | EXPORT_SYMBOL(ceph_msg_put); |
9ec7cab1 SW |
2198 | |
2199 | void ceph_msg_dump(struct ceph_msg *msg) | |
2200 | { | |
3cea4c30 ID |
2201 | pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, |
2202 | msg->front_alloc_len, msg->data_length); | |
9ec7cab1 SW |
2203 | print_hex_dump(KERN_DEBUG, "header: ", |
2204 | DUMP_PREFIX_OFFSET, 16, 1, | |
2205 | &msg->hdr, sizeof(msg->hdr), true); | |
2206 | print_hex_dump(KERN_DEBUG, " front: ", | |
2207 | DUMP_PREFIX_OFFSET, 16, 1, | |
2208 | msg->front.iov_base, msg->front.iov_len, true); | |
2209 | if (msg->middle) | |
2210 | print_hex_dump(KERN_DEBUG, "middle: ", | |
2211 | DUMP_PREFIX_OFFSET, 16, 1, | |
2212 | msg->middle->vec.iov_base, | |
2213 | msg->middle->vec.iov_len, true); | |
2214 | print_hex_dump(KERN_DEBUG, "footer: ", | |
2215 | DUMP_PREFIX_OFFSET, 16, 1, | |
2216 | &msg->footer, sizeof(msg->footer), true); | |
2217 | } | |
3d14c5d2 | 2218 | EXPORT_SYMBOL(ceph_msg_dump); |