Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
3d14c5d2 | 2 | #include <linux/ceph/ceph_debug.h> |
31b8006e SW |
3 | |
4 | #include <linux/crc32c.h> | |
5 | #include <linux/ctype.h> | |
6 | #include <linux/highmem.h> | |
7 | #include <linux/inet.h> | |
8 | #include <linux/kthread.h> | |
9 | #include <linux/net.h> | |
757856d2 | 10 | #include <linux/nsproxy.h> |
633ee407 | 11 | #include <linux/sched/mm.h> |
5a0e3ad6 | 12 | #include <linux/slab.h> |
31b8006e SW |
13 | #include <linux/socket.h> |
14 | #include <linux/string.h> | |
3ebc21f7 | 15 | #ifdef CONFIG_BLOCK |
68b4476b | 16 | #include <linux/bio.h> |
3ebc21f7 | 17 | #endif /* CONFIG_BLOCK */ |
ee3b56f2 | 18 | #include <linux/dns_resolver.h> |
31b8006e | 19 | #include <net/tcp.h> |
40e0b090 | 20 | #include <trace/events/sock.h> |
31b8006e | 21 | |
2b3e0c90 | 22 | #include <linux/ceph/ceph_features.h> |
3d14c5d2 YS |
23 | #include <linux/ceph/libceph.h> |
24 | #include <linux/ceph/messenger.h> | |
25 | #include <linux/ceph/decode.h> | |
26 | #include <linux/ceph/pagelist.h> | |
bc3b2d7f | 27 | #include <linux/export.h> |
31b8006e SW |
28 | |
29 | /* | |
30 | * Ceph uses the messenger to exchange ceph_msg messages with other | |
31 | * hosts in the system. The messenger provides ordered and reliable | |
32 | * delivery. We tolerate TCP disconnects by reconnecting (with | |
33 | * exponential backoff) in the case of a fault (disconnection, bad | |
34 | * crc, protocol error). Acks allow sent messages to be discarded by | |
35 | * the sender. | |
36 | */ | |
37 | ||
bc18f4b1 AE |
38 | /* |
39 | * We track the state of the socket on a given connection using | |
40 | * values defined below. The transition to a new socket state is | |
41 | * handled by a function which verifies we aren't coming from an | |
42 | * unexpected state. | |
43 | * | |
44 | * -------- | |
45 | * | NEW* | transient initial state | |
46 | * -------- | |
47 | * | con_sock_state_init() | |
48 | * v | |
49 | * ---------- | |
50 | * | CLOSED | initialized, but no socket (and no | |
51 | * ---------- TCP connection) | |
52 | * ^ \ | |
53 | * | \ con_sock_state_connecting() | |
54 | * | ---------------------- | |
55 | * | \ | |
56 | * + con_sock_state_closed() \ | |
fbb85a47 SW |
57 | * |+--------------------------- \ |
58 | * | \ \ \ | |
59 | * | ----------- \ \ | |
60 | * | | CLOSING | socket event; \ \ | |
61 | * | ----------- await close \ \ | |
62 | * | ^ \ | | |
63 | * | | \ | | |
64 | * | + con_sock_state_closing() \ | | |
65 | * | / \ | | | |
66 | * | / --------------- | | | |
67 | * | / \ v v | |
bc18f4b1 AE |
68 | * | / -------------- |
69 | * | / -----------------| CONNECTING | socket created, TCP | |
70 | * | | / -------------- connect initiated | |
71 | * | | | con_sock_state_connected() | |
72 | * | | v | |
73 | * ------------- | |
74 | * | CONNECTED | TCP connection established | |
75 | * ------------- | |
76 | * | |
77 | * State values for ceph_connection->sock_state; NEW is assumed to be 0. | |
78 | */ | |
ce2c8903 AE |
79 | |
/*
 * Values stored in ceph_connection->sock_state.  NEW is assumed to
 * be 0.  The trailing comment on each line lists the states that may
 * follow it.
 */
#define CON_SOCK_STATE_NEW		0	/* -> CLOSED */
#define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */
#define CON_SOCK_STATE_CONNECTING	2	/* -> CONNECTED or -> CLOSING */
#define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
#define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
85 | ||
c9ffc77a AE |
86 | static bool con_flag_valid(unsigned long con_flag) |
87 | { | |
88 | switch (con_flag) { | |
3fefd43e ID |
89 | case CEPH_CON_F_LOSSYTX: |
90 | case CEPH_CON_F_KEEPALIVE_PENDING: | |
91 | case CEPH_CON_F_WRITE_PENDING: | |
92 | case CEPH_CON_F_SOCK_CLOSED: | |
93 | case CEPH_CON_F_BACKOFF: | |
c9ffc77a AE |
94 | return true; |
95 | default: | |
96 | return false; | |
97 | } | |
98 | } | |
99 | ||
6503e0b6 | 100 | void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) |
c9ffc77a AE |
101 | { |
102 | BUG_ON(!con_flag_valid(con_flag)); | |
103 | ||
104 | clear_bit(con_flag, &con->flags); | |
105 | } | |
106 | ||
6503e0b6 | 107 | void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) |
c9ffc77a AE |
108 | { |
109 | BUG_ON(!con_flag_valid(con_flag)); | |
110 | ||
111 | set_bit(con_flag, &con->flags); | |
112 | } | |
113 | ||
6503e0b6 | 114 | bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) |
c9ffc77a AE |
115 | { |
116 | BUG_ON(!con_flag_valid(con_flag)); | |
117 | ||
118 | return test_bit(con_flag, &con->flags); | |
119 | } | |
120 | ||
6503e0b6 ID |
121 | bool ceph_con_flag_test_and_clear(struct ceph_connection *con, |
122 | unsigned long con_flag) | |
c9ffc77a AE |
123 | { |
124 | BUG_ON(!con_flag_valid(con_flag)); | |
125 | ||
126 | return test_and_clear_bit(con_flag, &con->flags); | |
127 | } | |
128 | ||
6503e0b6 ID |
129 | bool ceph_con_flag_test_and_set(struct ceph_connection *con, |
130 | unsigned long con_flag) | |
c9ffc77a AE |
131 | { |
132 | BUG_ON(!con_flag_valid(con_flag)); | |
133 | ||
134 | return test_and_set_bit(con_flag, &con->flags); | |
135 | } | |
136 | ||
/* Slab caches for frequently-allocated structures */

/* cache for struct ceph_msg; created in ceph_msgr_slab_init() */
static struct kmem_cache *ceph_msg_cache;

#ifdef CONFIG_LOCKDEP
/* lockdep class applied to sk_lock of sockets made by ceph_tcp_connect() */
static struct lock_class_key socket_class;
#endif

/* forward declarations of static helpers used before their definitions */
static void queue_con(struct ceph_connection *con);
static void cancel_con(struct ceph_connection *con);
static void ceph_con_workfn(struct work_struct *);
static void con_fault(struct ceph_connection *con);
31b8006e | 149 | |
31b8006e | 150 | /* |
f64a9317 AE |
151 | * Nicely render a sockaddr as a string. An array of formatted |
152 | * strings is used, to approximate reentrancy. | |
31b8006e | 153 | */ |
f64a9317 AE |
154 | #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ |
155 | #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) | |
156 | #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) | |
157 | #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ | |
158 | ||
159 | static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; | |
160 | static atomic_t addr_str_seq = ATOMIC_INIT(0); | |
31b8006e | 161 | |
699921d9 | 162 | struct page *ceph_zero_page; /* used in certain error cases */ |
57666519 | 163 | |
b726ec97 | 164 | const char *ceph_pr_addr(const struct ceph_entity_addr *addr) |
31b8006e SW |
165 | { |
166 | int i; | |
167 | char *s; | |
b726ec97 JL |
168 | struct sockaddr_storage ss = addr->in_addr; /* align */ |
169 | struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; | |
170 | struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; | |
31b8006e | 171 | |
f64a9317 | 172 | i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; |
31b8006e SW |
173 | s = addr_str[i]; |
174 | ||
b726ec97 | 175 | switch (ss.ss_family) { |
31b8006e | 176 | case AF_INET: |
d3c3c0a8 JL |
177 | snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu", |
178 | le32_to_cpu(addr->type), &in4->sin_addr, | |
bd406145 | 179 | ntohs(in4->sin_port)); |
31b8006e SW |
180 | break; |
181 | ||
182 | case AF_INET6: | |
d3c3c0a8 JL |
183 | snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu", |
184 | le32_to_cpu(addr->type), &in6->sin6_addr, | |
bd406145 | 185 | ntohs(in6->sin6_port)); |
31b8006e SW |
186 | break; |
187 | ||
188 | default: | |
d3002b97 | 189 | snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", |
b726ec97 | 190 | ss.ss_family); |
31b8006e SW |
191 | } |
192 | ||
193 | return s; | |
194 | } | |
3d14c5d2 | 195 | EXPORT_SYMBOL(ceph_pr_addr); |
31b8006e | 196 | |
6503e0b6 | 197 | void ceph_encode_my_addr(struct ceph_messenger *msgr) |
63f2d211 | 198 | { |
cd1a677c ID |
199 | if (!ceph_msgr2(from_msgr(msgr))) { |
200 | memcpy(&msgr->my_enc_addr, &msgr->inst.addr, | |
201 | sizeof(msgr->my_enc_addr)); | |
202 | ceph_encode_banner_addr(&msgr->my_enc_addr); | |
203 | } | |
63f2d211 SW |
204 | } |
205 | ||
/*
 * Work queue for all reading and writing to/from the socket.
 * Allocated in ceph_msgr_init(), destroyed in _ceph_msgr_exit().
 */
static struct workqueue_struct *ceph_msgr_wq;
31b8006e | 210 | |
e3d5d638 AE |
211 | static int ceph_msgr_slab_init(void) |
212 | { | |
213 | BUG_ON(ceph_msg_cache); | |
5ee61e95 | 214 | ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); |
81b36be4 AE |
215 | if (!ceph_msg_cache) |
216 | return -ENOMEM; | |
217 | ||
0d9c1ab3 | 218 | return 0; |
e3d5d638 AE |
219 | } |
220 | ||
221 | static void ceph_msgr_slab_exit(void) | |
222 | { | |
223 | BUG_ON(!ceph_msg_cache); | |
224 | kmem_cache_destroy(ceph_msg_cache); | |
225 | ceph_msg_cache = NULL; | |
226 | } | |
227 | ||
15417167 | 228 | static void _ceph_msgr_exit(void) |
6173d1f0 | 229 | { |
d3002b97 | 230 | if (ceph_msgr_wq) { |
6173d1f0 | 231 | destroy_workqueue(ceph_msgr_wq); |
d3002b97 AE |
232 | ceph_msgr_wq = NULL; |
233 | } | |
6173d1f0 | 234 | |
699921d9 ID |
235 | BUG_ON(!ceph_zero_page); |
236 | put_page(ceph_zero_page); | |
237 | ceph_zero_page = NULL; | |
d920ff6f BC |
238 | |
239 | ceph_msgr_slab_exit(); | |
6173d1f0 AE |
240 | } |
241 | ||
57a35dfb | 242 | int __init ceph_msgr_init(void) |
31b8006e | 243 | { |
d920ff6f BC |
244 | if (ceph_msgr_slab_init()) |
245 | return -ENOMEM; | |
246 | ||
699921d9 ID |
247 | BUG_ON(ceph_zero_page); |
248 | ceph_zero_page = ZERO_PAGE(0); | |
249 | get_page(ceph_zero_page); | |
57666519 | 250 | |
f9865f06 ID |
251 | /* |
252 | * The number of active work items is limited by the number of | |
253 | * connections, so leave @max_active at default. | |
254 | */ | |
255 | ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); | |
6173d1f0 AE |
256 | if (ceph_msgr_wq) |
257 | return 0; | |
57666519 | 258 | |
6173d1f0 AE |
259 | pr_err("msgr_init failed to create workqueue\n"); |
260 | _ceph_msgr_exit(); | |
57666519 | 261 | |
6173d1f0 | 262 | return -ENOMEM; |
31b8006e SW |
263 | } |
264 | ||
265 | void ceph_msgr_exit(void) | |
266 | { | |
57666519 | 267 | BUG_ON(ceph_msgr_wq == NULL); |
57666519 | 268 | |
6173d1f0 | 269 | _ceph_msgr_exit(); |
31b8006e SW |
270 | } |
271 | ||
cd84db6e | 272 | void ceph_msgr_flush(void) |
a922d38f SW |
273 | { |
274 | flush_workqueue(ceph_msgr_wq); | |
275 | } | |
3d14c5d2 | 276 | EXPORT_SYMBOL(ceph_msgr_flush); |
a922d38f | 277 | |
ce2c8903 AE |
278 | /* Connection socket state transition functions */ |
279 | ||
280 | static void con_sock_state_init(struct ceph_connection *con) | |
281 | { | |
282 | int old_state; | |
283 | ||
284 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); | |
285 | if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) | |
286 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
287 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
288 | CON_SOCK_STATE_CLOSED); | |
ce2c8903 AE |
289 | } |
290 | ||
291 | static void con_sock_state_connecting(struct ceph_connection *con) | |
292 | { | |
293 | int old_state; | |
294 | ||
295 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); | |
296 | if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) | |
297 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
298 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
299 | CON_SOCK_STATE_CONNECTING); | |
ce2c8903 AE |
300 | } |
301 | ||
302 | static void con_sock_state_connected(struct ceph_connection *con) | |
303 | { | |
304 | int old_state; | |
305 | ||
306 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); | |
307 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) | |
308 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
309 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
310 | CON_SOCK_STATE_CONNECTED); | |
ce2c8903 AE |
311 | } |
312 | ||
313 | static void con_sock_state_closing(struct ceph_connection *con) | |
314 | { | |
315 | int old_state; | |
316 | ||
317 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); | |
318 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && | |
319 | old_state != CON_SOCK_STATE_CONNECTED && | |
320 | old_state != CON_SOCK_STATE_CLOSING)) | |
321 | printk("%s: unexpected old state %d\n", __func__, old_state); | |
8007b8d6 SW |
322 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
323 | CON_SOCK_STATE_CLOSING); | |
ce2c8903 AE |
324 | } |
325 | ||
326 | static void con_sock_state_closed(struct ceph_connection *con) | |
327 | { | |
328 | int old_state; | |
329 | ||
330 | old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); | |
331 | if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && | |
fbb85a47 | 332 | old_state != CON_SOCK_STATE_CLOSING && |
8007b8d6 SW |
333 | old_state != CON_SOCK_STATE_CONNECTING && |
334 | old_state != CON_SOCK_STATE_CLOSED)) | |
ce2c8903 | 335 | printk("%s: unexpected old state %d\n", __func__, old_state); |
8007b8d6 SW |
336 | dout("%s con %p sock %d -> %d\n", __func__, con, old_state, |
337 | CON_SOCK_STATE_CLOSED); | |
ce2c8903 | 338 | } |
a922d38f | 339 | |
31b8006e SW |
340 | /* |
341 | * socket callback functions | |
342 | */ | |
343 | ||
344 | /* data available on socket, or listen socket received a connect */ | |
676d2369 | 345 | static void ceph_sock_data_ready(struct sock *sk) |
31b8006e | 346 | { |
bd406145 | 347 | struct ceph_connection *con = sk->sk_user_data; |
40e0b090 PY |
348 | |
349 | trace_sk_data_ready(sk); | |
350 | ||
a2a32584 GH |
351 | if (atomic_read(&con->msgr->stopping)) { |
352 | return; | |
353 | } | |
bd406145 | 354 | |
31b8006e | 355 | if (sk->sk_state != TCP_CLOSE_WAIT) { |
30be780a | 356 | dout("%s %p state = %d, queueing work\n", __func__, |
31b8006e SW |
357 | con, con->state); |
358 | queue_con(con); | |
359 | } | |
360 | } | |
361 | ||
362 | /* socket has buffer space for writing */ | |
327800bd | 363 | static void ceph_sock_write_space(struct sock *sk) |
31b8006e | 364 | { |
d3002b97 | 365 | struct ceph_connection *con = sk->sk_user_data; |
31b8006e | 366 | |
182fac26 JS |
367 | /* only queue to workqueue if there is data we want to write, |
368 | * and there is sufficient space in the socket buffer to accept | |
327800bd | 369 | * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() |
182fac26 JS |
370 | * doesn't get called again until try_write() fills the socket |
371 | * buffer. See net/ipv4/tcp_input.c:tcp_check_space() | |
372 | * and net/core/stream.c:sk_stream_write_space(). | |
373 | */ | |
6503e0b6 | 374 | if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { |
64dc6130 | 375 | if (sk_stream_is_writeable(sk)) { |
327800bd | 376 | dout("%s %p queueing write work\n", __func__, con); |
182fac26 JS |
377 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
378 | queue_con(con); | |
379 | } | |
31b8006e | 380 | } else { |
327800bd | 381 | dout("%s %p nothing to write\n", __func__, con); |
31b8006e | 382 | } |
31b8006e SW |
383 | } |
384 | ||
385 | /* socket's state has changed */ | |
327800bd | 386 | static void ceph_sock_state_change(struct sock *sk) |
31b8006e | 387 | { |
bd406145 | 388 | struct ceph_connection *con = sk->sk_user_data; |
31b8006e | 389 | |
30be780a | 390 | dout("%s %p state = %d sk_state = %u\n", __func__, |
31b8006e SW |
391 | con, con->state, sk->sk_state); |
392 | ||
31b8006e SW |
393 | switch (sk->sk_state) { |
394 | case TCP_CLOSE: | |
327800bd | 395 | dout("%s TCP_CLOSE\n", __func__); |
df561f66 | 396 | fallthrough; |
31b8006e | 397 | case TCP_CLOSE_WAIT: |
327800bd | 398 | dout("%s TCP_CLOSE_WAIT\n", __func__); |
ce2c8903 | 399 | con_sock_state_closing(con); |
6503e0b6 | 400 | ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); |
d65c9e0b | 401 | queue_con(con); |
31b8006e SW |
402 | break; |
403 | case TCP_ESTABLISHED: | |
327800bd | 404 | dout("%s TCP_ESTABLISHED\n", __func__); |
ce2c8903 | 405 | con_sock_state_connected(con); |
31b8006e SW |
406 | queue_con(con); |
407 | break; | |
d3002b97 AE |
408 | default: /* Everything else is uninteresting */ |
409 | break; | |
31b8006e SW |
410 | } |
411 | } | |
412 | ||
413 | /* | |
414 | * set up socket callbacks | |
415 | */ | |
416 | static void set_sock_callbacks(struct socket *sock, | |
417 | struct ceph_connection *con) | |
418 | { | |
419 | struct sock *sk = sock->sk; | |
bd406145 | 420 | sk->sk_user_data = con; |
327800bd AE |
421 | sk->sk_data_ready = ceph_sock_data_ready; |
422 | sk->sk_write_space = ceph_sock_write_space; | |
423 | sk->sk_state_change = ceph_sock_state_change; | |
31b8006e SW |
424 | } |
425 | ||
426 | ||
427 | /* | |
428 | * socket helpers | |
429 | */ | |
430 | ||
431 | /* | |
432 | * initiate connection to a remote socket. | |
433 | */ | |
6503e0b6 | 434 | int ceph_tcp_connect(struct ceph_connection *con) |
31b8006e | 435 | { |
cede185b | 436 | struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ |
31b8006e | 437 | struct socket *sock; |
633ee407 | 438 | unsigned int noio_flag; |
31b8006e SW |
439 | int ret; |
440 | ||
6503e0b6 ID |
441 | dout("%s con %p peer_addr %s\n", __func__, con, |
442 | ceph_pr_addr(&con->peer_addr)); | |
31b8006e | 443 | BUG_ON(con->sock); |
633ee407 ID |
444 | |
445 | /* sock_create_kern() allocates with GFP_KERNEL */ | |
446 | noio_flag = memalloc_noio_save(); | |
cede185b | 447 | ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, |
eeb1bd5c | 448 | SOCK_STREAM, IPPROTO_TCP, &sock); |
633ee407 | 449 | memalloc_noio_restore(noio_flag); |
31b8006e | 450 | if (ret) |
41617d0c | 451 | return ret; |
6d7fdb0a | 452 | sock->sk->sk_allocation = GFP_NOFS; |
98123866 | 453 | sock->sk->sk_use_task_frag = false; |
31b8006e | 454 | |
a6a5349d SW |
455 | #ifdef CONFIG_LOCKDEP |
456 | lockdep_set_class(&sock->sk->sk_lock, &socket_class); | |
457 | #endif | |
458 | ||
31b8006e SW |
459 | set_sock_callbacks(sock, con); |
460 | ||
89a86be0 | 461 | con_sock_state_connecting(con); |
cede185b | 462 | ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), |
f91d3471 | 463 | O_NONBLOCK); |
31b8006e SW |
464 | if (ret == -EINPROGRESS) { |
465 | dout("connect %s EINPROGRESS sk_state = %u\n", | |
b726ec97 | 466 | ceph_pr_addr(&con->peer_addr), |
31b8006e | 467 | sock->sk->sk_state); |
a5bc3129 | 468 | } else if (ret < 0) { |
31b8006e | 469 | pr_err("connect %s error %d\n", |
b726ec97 | 470 | ceph_pr_addr(&con->peer_addr), ret); |
31b8006e | 471 | sock_release(sock); |
41617d0c | 472 | return ret; |
a5bc3129 | 473 | } |
89baaa57 | 474 | |
12abc5ee CH |
475 | if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) |
476 | tcp_sock_set_nodelay(sock->sk); | |
ba988f87 | 477 | |
a5bc3129 | 478 | con->sock = sock; |
41617d0c | 479 | return 0; |
31b8006e SW |
480 | } |
481 | ||
31b8006e SW |
482 | /* |
483 | * Shutdown/close the socket for the given connection. | |
484 | */ | |
6503e0b6 | 485 | int ceph_con_close_socket(struct ceph_connection *con) |
31b8006e | 486 | { |
8007b8d6 | 487 | int rc = 0; |
31b8006e | 488 | |
6503e0b6 | 489 | dout("%s con %p sock %p\n", __func__, con, con->sock); |
8007b8d6 SW |
490 | if (con->sock) { |
491 | rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); | |
492 | sock_release(con->sock); | |
493 | con->sock = NULL; | |
494 | } | |
456ea468 AE |
495 | |
496 | /* | |
4a861692 | 497 | * Forcibly clear the SOCK_CLOSED flag. It gets set |
456ea468 AE |
498 | * independent of the connection mutex, and we could have |
499 | * received a socket close event before we had the chance to | |
500 | * shut the socket down. | |
501 | */ | |
6503e0b6 | 502 | ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); |
8007b8d6 | 503 | |
ce2c8903 | 504 | con_sock_state_closed(con); |
31b8006e SW |
505 | return rc; |
506 | } | |
507 | ||
3596f4c1 ID |
508 | static void ceph_con_reset_protocol(struct ceph_connection *con) |
509 | { | |
510 | dout("%s con %p\n", __func__, con); | |
511 | ||
6503e0b6 | 512 | ceph_con_close_socket(con); |
3596f4c1 ID |
513 | if (con->in_msg) { |
514 | WARN_ON(con->in_msg->con != con); | |
515 | ceph_msg_put(con->in_msg); | |
516 | con->in_msg = NULL; | |
517 | } | |
518 | if (con->out_msg) { | |
519 | WARN_ON(con->out_msg->con != con); | |
520 | ceph_msg_put(con->out_msg); | |
521 | con->out_msg = NULL; | |
522 | } | |
038b8d1d ID |
523 | if (con->bounce_page) { |
524 | __free_page(con->bounce_page); | |
525 | con->bounce_page = NULL; | |
526 | } | |
3596f4c1 | 527 | |
cd1a677c ID |
528 | if (ceph_msgr2(from_msgr(con->msgr))) |
529 | ceph_con_v2_reset_protocol(con); | |
530 | else | |
531 | ceph_con_v1_reset_protocol(con); | |
3596f4c1 ID |
532 | } |
533 | ||
31b8006e SW |
534 | /* |
535 | * Reset a connection. Discard all incoming and outgoing messages | |
536 | * and clear *_seq state. | |
537 | */ | |
538 | static void ceph_msg_remove(struct ceph_msg *msg) | |
539 | { | |
540 | list_del_init(&msg->list_head); | |
38941f80 | 541 | |
31b8006e SW |
542 | ceph_msg_put(msg); |
543 | } | |
cd1a677c | 544 | |
31b8006e SW |
545 | static void ceph_msg_remove_list(struct list_head *head) |
546 | { | |
547 | while (!list_empty(head)) { | |
548 | struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, | |
549 | list_head); | |
550 | ceph_msg_remove(msg); | |
551 | } | |
552 | } | |
553 | ||
6503e0b6 | 554 | void ceph_con_reset_session(struct ceph_connection *con) |
31b8006e | 555 | { |
5963c3d0 | 556 | dout("%s con %p\n", __func__, con); |
3596f4c1 ID |
557 | |
558 | WARN_ON(con->in_msg); | |
559 | WARN_ON(con->out_msg); | |
31b8006e SW |
560 | ceph_msg_remove_list(&con->out_queue); |
561 | ceph_msg_remove_list(&con->out_sent); | |
31b8006e | 562 | con->out_seq = 0; |
31b8006e | 563 | con->in_seq = 0; |
0e0d5e0c | 564 | con->in_seq_acked = 0; |
a3da057b | 565 | |
cd1a677c ID |
566 | if (ceph_msgr2(from_msgr(con->msgr))) |
567 | ceph_con_v2_reset_session(con); | |
568 | else | |
569 | ceph_con_v1_reset_session(con); | |
31b8006e SW |
570 | } |
571 | ||
572 | /* | |
573 | * mark a peer down. drop any open connections. | |
574 | */ | |
575 | void ceph_con_close(struct ceph_connection *con) | |
576 | { | |
8c50c817 | 577 | mutex_lock(&con->mutex); |
b726ec97 | 578 | dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); |
6d7f62bf | 579 | con->state = CEPH_CON_S_CLOSED; |
a5988c49 | 580 | |
6503e0b6 ID |
581 | ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next |
582 | connect */ | |
583 | ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); | |
584 | ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); | |
585 | ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); | |
a5988c49 | 586 | |
3596f4c1 | 587 | ceph_con_reset_protocol(con); |
5963c3d0 | 588 | ceph_con_reset_session(con); |
37ab77ac | 589 | cancel_con(con); |
ec302645 | 590 | mutex_unlock(&con->mutex); |
31b8006e | 591 | } |
3d14c5d2 | 592 | EXPORT_SYMBOL(ceph_con_close); |
31b8006e | 593 | |
31b8006e SW |
594 | /* |
595 | * Reopen a closed connection, with a new peer address. | |
596 | */ | |
b7a9e5dd SW |
597 | void ceph_con_open(struct ceph_connection *con, |
598 | __u8 entity_type, __u64 entity_num, | |
599 | struct ceph_entity_addr *addr) | |
31b8006e | 600 | { |
5469155f | 601 | mutex_lock(&con->mutex); |
b726ec97 | 602 | dout("con_open %p %s\n", con, ceph_pr_addr(addr)); |
8dacc7da | 603 | |
6d7f62bf ID |
604 | WARN_ON(con->state != CEPH_CON_S_CLOSED); |
605 | con->state = CEPH_CON_S_PREOPEN; | |
a5988c49 | 606 | |
b7a9e5dd SW |
607 | con->peer_name.type = (__u8) entity_type; |
608 | con->peer_name.num = cpu_to_le64(entity_num); | |
609 | ||
31b8006e | 610 | memcpy(&con->peer_addr, addr, sizeof(*addr)); |
03c677e1 | 611 | con->delay = 0; /* reset backoff memory */ |
5469155f | 612 | mutex_unlock(&con->mutex); |
31b8006e SW |
613 | queue_con(con); |
614 | } | |
3d14c5d2 | 615 | EXPORT_SYMBOL(ceph_con_open); |
31b8006e | 616 | |
87b315a5 SW |
617 | /* |
618 | * return true if this connection ever successfully opened | |
619 | */ | |
620 | bool ceph_con_opened(struct ceph_connection *con) | |
621 | { | |
cd1a677c ID |
622 | if (ceph_msgr2(from_msgr(con->msgr))) |
623 | return ceph_con_v2_opened(con); | |
624 | ||
566050e1 | 625 | return ceph_con_v1_opened(con); |
87b315a5 SW |
626 | } |
627 | ||
31b8006e SW |
628 | /* |
629 | * initialize a new connection. | |
630 | */ | |
1bfd89f4 AE |
631 | void ceph_con_init(struct ceph_connection *con, void *private, |
632 | const struct ceph_connection_operations *ops, | |
b7a9e5dd | 633 | struct ceph_messenger *msgr) |
31b8006e SW |
634 | { |
635 | dout("con_init %p\n", con); | |
636 | memset(con, 0, sizeof(*con)); | |
1bfd89f4 AE |
637 | con->private = private; |
638 | con->ops = ops; | |
31b8006e | 639 | con->msgr = msgr; |
ce2c8903 AE |
640 | |
641 | con_sock_state_init(con); | |
642 | ||
ec302645 | 643 | mutex_init(&con->mutex); |
31b8006e SW |
644 | INIT_LIST_HEAD(&con->out_queue); |
645 | INIT_LIST_HEAD(&con->out_sent); | |
68931622 | 646 | INIT_DELAYED_WORK(&con->work, ceph_con_workfn); |
a5988c49 | 647 | |
6d7f62bf | 648 | con->state = CEPH_CON_S_CLOSED; |
31b8006e | 649 | } |
3d14c5d2 | 650 | EXPORT_SYMBOL(ceph_con_init); |
31b8006e | 651 | |
31b8006e SW |
652 | /* |
653 | * We maintain a global counter to order connection attempts. Get | |
654 | * a unique seq greater than @gt. | |
655 | */ | |
6503e0b6 | 656 | u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) |
31b8006e SW |
657 | { |
658 | u32 ret; | |
659 | ||
660 | spin_lock(&msgr->global_seq_lock); | |
661 | if (msgr->global_seq < gt) | |
662 | msgr->global_seq = gt; | |
663 | ret = ++msgr->global_seq; | |
664 | spin_unlock(&msgr->global_seq_lock); | |
665 | return ret; | |
666 | } | |
667 | ||
02471928 ID |
668 | /* |
669 | * Discard messages that have been acked by the server. | |
670 | */ | |
6503e0b6 | 671 | void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) |
02471928 ID |
672 | { |
673 | struct ceph_msg *msg; | |
674 | u64 seq; | |
675 | ||
676 | dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); | |
677 | while (!list_empty(&con->out_sent)) { | |
678 | msg = list_first_entry(&con->out_sent, struct ceph_msg, | |
679 | list_head); | |
680 | WARN_ON(msg->needs_out_seq); | |
681 | seq = le64_to_cpu(msg->hdr.seq); | |
682 | if (seq > ack_seq) | |
683 | break; | |
684 | ||
685 | dout("%s con %p discarding msg %p seq %llu\n", __func__, con, | |
686 | msg, seq); | |
687 | ceph_msg_remove(msg); | |
688 | } | |
689 | } | |
690 | ||
691 | /* | |
692 | * Discard messages that have been requeued in con_fault(), up to | |
693 | * reconnect_seq. This avoids gratuitously resending messages that | |
694 | * the server had received and handled prior to reconnect. | |
695 | */ | |
6503e0b6 | 696 | void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) |
02471928 ID |
697 | { |
698 | struct ceph_msg *msg; | |
699 | u64 seq; | |
700 | ||
701 | dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); | |
702 | while (!list_empty(&con->out_queue)) { | |
703 | msg = list_first_entry(&con->out_queue, struct ceph_msg, | |
704 | list_head); | |
705 | if (msg->needs_out_seq) | |
706 | break; | |
707 | seq = le64_to_cpu(msg->hdr.seq); | |
708 | if (seq > reconnect_seq) | |
709 | break; | |
710 | ||
711 | dout("%s con %p discarding msg %p seq %llu\n", __func__, con, | |
712 | msg, seq); | |
713 | ceph_msg_remove(msg); | |
714 | } | |
715 | } | |
716 | ||
df6ad1f9 | 717 | #ifdef CONFIG_BLOCK |
6aaa4511 AE |
718 | |
719 | /* | |
720 | * For a bio data item, a piece is whatever remains of the next | |
721 | * entry in the current bio iovec, or the first entry in the next | |
722 | * bio in the list. | |
723 | */ | |
8ae4f4f5 | 724 | static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, |
25aff7c5 | 725 | size_t length) |
6aaa4511 | 726 | { |
8ae4f4f5 | 727 | struct ceph_msg_data *data = cursor->data; |
5359a17d | 728 | struct ceph_bio_iter *it = &cursor->bio_iter; |
6aaa4511 | 729 | |
5359a17d ID |
730 | cursor->resid = min_t(size_t, length, data->bio_length); |
731 | *it = data->bio_pos; | |
732 | if (cursor->resid < it->iter.bi_size) | |
733 | it->iter.bi_size = cursor->resid; | |
6aaa4511 | 734 | |
5359a17d | 735 | BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); |
6aaa4511 AE |
736 | } |
737 | ||
8ae4f4f5 | 738 | static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, |
6aaa4511 AE |
739 | size_t *page_offset, |
740 | size_t *length) | |
741 | { | |
5359a17d ID |
742 | struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, |
743 | cursor->bio_iter.iter); | |
6aaa4511 | 744 | |
5359a17d ID |
745 | *page_offset = bv.bv_offset; |
746 | *length = bv.bv_len; | |
747 | return bv.bv_page; | |
6aaa4511 AE |
748 | } |
749 | ||
8ae4f4f5 AE |
750 | static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, |
751 | size_t bytes) | |
6aaa4511 | 752 | { |
5359a17d | 753 | struct ceph_bio_iter *it = &cursor->bio_iter; |
187df763 | 754 | struct page *page = bio_iter_page(it->bio, it->iter); |
6aaa4511 | 755 | |
5359a17d ID |
756 | BUG_ON(bytes > cursor->resid); |
757 | BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); | |
25aff7c5 | 758 | cursor->resid -= bytes; |
5359a17d | 759 | bio_advance_iter(it->bio, &it->iter, bytes); |
f38a5181 | 760 | |
da4ab869 | 761 | if (!cursor->resid) |
5359a17d | 762 | return false; /* no more data */ |
f38a5181 | 763 | |
187df763 ID |
764 | if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && |
765 | page == bio_iter_page(it->bio, it->iter))) | |
6aaa4511 AE |
766 | return false; /* more bytes to process in this segment */ |
767 | ||
5359a17d ID |
768 | if (!it->iter.bi_size) { |
769 | it->bio = it->bio->bi_next; | |
770 | it->iter = it->bio->bi_iter; | |
771 | if (cursor->resid < it->iter.bi_size) | |
772 | it->iter.bi_size = cursor->resid; | |
25aff7c5 | 773 | } |
6aaa4511 | 774 | |
5359a17d | 775 | BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); |
6aaa4511 AE |
776 | return true; |
777 | } | |
ea96571f | 778 | #endif /* CONFIG_BLOCK */ |
df6ad1f9 | 779 | |
b9e281c2 ID |
780 | static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, |
781 | size_t length) | |
782 | { | |
783 | struct ceph_msg_data *data = cursor->data; | |
784 | struct bio_vec *bvecs = data->bvec_pos.bvecs; | |
785 | ||
786 | cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); | |
787 | cursor->bvec_iter = data->bvec_pos.iter; | |
788 | cursor->bvec_iter.bi_size = cursor->resid; | |
789 | ||
790 | BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); | |
b9e281c2 ID |
791 | } |
792 | ||
793 | static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, | |
794 | size_t *page_offset, | |
795 | size_t *length) | |
796 | { | |
797 | struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, | |
798 | cursor->bvec_iter); | |
799 | ||
800 | *page_offset = bv.bv_offset; | |
801 | *length = bv.bv_len; | |
802 | return bv.bv_page; | |
803 | } | |
804 | ||
805 | static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, | |
806 | size_t bytes) | |
807 | { | |
808 | struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; | |
187df763 | 809 | struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter); |
b9e281c2 ID |
810 | |
811 | BUG_ON(bytes > cursor->resid); | |
812 | BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); | |
813 | cursor->resid -= bytes; | |
814 | bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); | |
815 | ||
da4ab869 | 816 | if (!cursor->resid) |
b9e281c2 | 817 | return false; /* no more data */ |
b9e281c2 | 818 | |
187df763 ID |
819 | if (!bytes || (cursor->bvec_iter.bi_bvec_done && |
820 | page == bvec_iter_page(bvecs, cursor->bvec_iter))) | |
b9e281c2 ID |
821 | return false; /* more bytes to process in this segment */ |
822 | ||
b9e281c2 | 823 | BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); |
b9e281c2 ID |
824 | return true; |
825 | } | |
826 | ||
e766d7b5 AE |
827 | /* |
828 | * For a page array, a piece comes from the first page in the array | |
829 | * that has not already been fully consumed. | |
830 | */ | |
8ae4f4f5 | 831 | static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, |
25aff7c5 | 832 | size_t length) |
e766d7b5 | 833 | { |
8ae4f4f5 | 834 | struct ceph_msg_data *data = cursor->data; |
e766d7b5 AE |
835 | int page_count; |
836 | ||
837 | BUG_ON(data->type != CEPH_MSG_DATA_PAGES); | |
838 | ||
839 | BUG_ON(!data->pages); | |
840 | BUG_ON(!data->length); | |
841 | ||
ca8b3a69 | 842 | cursor->resid = min(length, data->length); |
e766d7b5 | 843 | page_count = calc_pages_for(data->alignment, (u64)data->length); |
e766d7b5 AE |
844 | cursor->page_offset = data->alignment & ~PAGE_MASK; |
845 | cursor->page_index = 0; | |
56fc5659 AE |
846 | BUG_ON(page_count > (int)USHRT_MAX); |
847 | cursor->page_count = (unsigned short)page_count; | |
848 | BUG_ON(length > SIZE_MAX - cursor->page_offset); | |
e766d7b5 AE |
849 | } |
850 | ||
8ae4f4f5 AE |
851 | static struct page * |
852 | ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, | |
853 | size_t *page_offset, size_t *length) | |
e766d7b5 | 854 | { |
8ae4f4f5 | 855 | struct ceph_msg_data *data = cursor->data; |
e766d7b5 AE |
856 | |
857 | BUG_ON(data->type != CEPH_MSG_DATA_PAGES); | |
858 | ||
859 | BUG_ON(cursor->page_index >= cursor->page_count); | |
860 | BUG_ON(cursor->page_offset >= PAGE_SIZE); | |
e766d7b5 AE |
861 | |
862 | *page_offset = cursor->page_offset; | |
da4ab869 | 863 | *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); |
e766d7b5 AE |
864 | return data->pages[cursor->page_index]; |
865 | } | |
866 | ||
8ae4f4f5 | 867 | static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, |
e766d7b5 AE |
868 | size_t bytes) |
869 | { | |
8ae4f4f5 | 870 | BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); |
e766d7b5 AE |
871 | |
872 | BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); | |
e766d7b5 AE |
873 | |
874 | /* Advance the cursor page offset */ | |
875 | ||
876 | cursor->resid -= bytes; | |
5df521b1 AE |
877 | cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; |
878 | if (!bytes || cursor->page_offset) | |
e766d7b5 AE |
879 | return false; /* more bytes to process in the current page */ |
880 | ||
d90deda6 YZ |
881 | if (!cursor->resid) |
882 | return false; /* no more data */ | |
883 | ||
5df521b1 | 884 | /* Move on to the next page; offset is already at 0 */ |
e766d7b5 AE |
885 | |
886 | BUG_ON(cursor->page_index >= cursor->page_count); | |
e766d7b5 | 887 | cursor->page_index++; |
e766d7b5 AE |
888 | return true; |
889 | } | |
890 | ||
fe38a2b6 | 891 | /* |
dd236fcb AE |
892 | * For a pagelist, a piece is whatever remains to be consumed in the |
893 | * first page in the list, or the front of the next page. | |
fe38a2b6 | 894 | */ |
8ae4f4f5 AE |
895 | static void |
896 | ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, | |
25aff7c5 | 897 | size_t length) |
fe38a2b6 | 898 | { |
8ae4f4f5 | 899 | struct ceph_msg_data *data = cursor->data; |
fe38a2b6 AE |
900 | struct ceph_pagelist *pagelist; |
901 | struct page *page; | |
902 | ||
dd236fcb | 903 | BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); |
fe38a2b6 AE |
904 | |
905 | pagelist = data->pagelist; | |
906 | BUG_ON(!pagelist); | |
25aff7c5 AE |
907 | |
908 | if (!length) | |
fe38a2b6 AE |
909 | return; /* pagelist can be assigned but empty */ |
910 | ||
911 | BUG_ON(list_empty(&pagelist->head)); | |
912 | page = list_first_entry(&pagelist->head, struct page, lru); | |
913 | ||
ca8b3a69 | 914 | cursor->resid = min(length, pagelist->length); |
fe38a2b6 AE |
915 | cursor->page = page; |
916 | cursor->offset = 0; | |
fe38a2b6 AE |
917 | } |
918 | ||
8ae4f4f5 AE |
919 | static struct page * |
920 | ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, | |
921 | size_t *page_offset, size_t *length) | |
fe38a2b6 | 922 | { |
8ae4f4f5 | 923 | struct ceph_msg_data *data = cursor->data; |
fe38a2b6 | 924 | struct ceph_pagelist *pagelist; |
fe38a2b6 AE |
925 | |
926 | BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); | |
927 | ||
928 | pagelist = data->pagelist; | |
929 | BUG_ON(!pagelist); | |
930 | ||
931 | BUG_ON(!cursor->page); | |
25aff7c5 | 932 | BUG_ON(cursor->offset + cursor->resid != pagelist->length); |
fe38a2b6 | 933 | |
5df521b1 | 934 | /* offset of first page in pagelist is always 0 */ |
fe38a2b6 | 935 | *page_offset = cursor->offset & ~PAGE_MASK; |
da4ab869 | 936 | *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); |
8ae4f4f5 | 937 | return cursor->page; |
fe38a2b6 AE |
938 | } |
939 | ||
8ae4f4f5 | 940 | static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, |
dd236fcb | 941 | size_t bytes) |
fe38a2b6 | 942 | { |
8ae4f4f5 | 943 | struct ceph_msg_data *data = cursor->data; |
fe38a2b6 AE |
944 | struct ceph_pagelist *pagelist; |
945 | ||
946 | BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); | |
947 | ||
948 | pagelist = data->pagelist; | |
949 | BUG_ON(!pagelist); | |
25aff7c5 AE |
950 | |
951 | BUG_ON(cursor->offset + cursor->resid != pagelist->length); | |
fe38a2b6 AE |
952 | BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); |
953 | ||
954 | /* Advance the cursor offset */ | |
955 | ||
25aff7c5 | 956 | cursor->resid -= bytes; |
fe38a2b6 | 957 | cursor->offset += bytes; |
5df521b1 | 958 | /* offset of first page in pagelist is always 0 */ |
fe38a2b6 AE |
959 | if (!bytes || cursor->offset & ~PAGE_MASK) |
960 | return false; /* more bytes to process in the current page */ | |
961 | ||
d90deda6 YZ |
962 | if (!cursor->resid) |
963 | return false; /* no more data */ | |
964 | ||
fe38a2b6 AE |
965 | /* Move on to the next page */ |
966 | ||
967 | BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); | |
17ddc49b | 968 | cursor->page = list_next_entry(cursor->page, lru); |
fe38a2b6 AE |
969 | return true; |
970 | } | |
971 | ||
dd236fcb AE |
972 | /* |
973 | * Message data is handled (sent or received) in pieces, where each | |
974 | * piece resides on a single page. The network layer might not | |
975 | * consume an entire piece at once. A data item's cursor keeps | |
976 | * track of which piece is next to process and how much remains to | |
977 | * be processed in that piece. It also tracks whether the current | |
978 | * piece is the last one in the data item. | |
979 | */ | |
ca8b3a69 | 980 | static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) |
dd236fcb | 981 | { |
ca8b3a69 | 982 | size_t length = cursor->total_resid; |
8ae4f4f5 | 983 | |
8ae4f4f5 | 984 | switch (cursor->data->type) { |
dd236fcb | 985 | case CEPH_MSG_DATA_PAGELIST: |
8ae4f4f5 | 986 | ceph_msg_data_pagelist_cursor_init(cursor, length); |
dd236fcb | 987 | break; |
e766d7b5 | 988 | case CEPH_MSG_DATA_PAGES: |
8ae4f4f5 | 989 | ceph_msg_data_pages_cursor_init(cursor, length); |
e766d7b5 | 990 | break; |
dd236fcb AE |
991 | #ifdef CONFIG_BLOCK |
992 | case CEPH_MSG_DATA_BIO: | |
8ae4f4f5 | 993 | ceph_msg_data_bio_cursor_init(cursor, length); |
6aaa4511 | 994 | break; |
dd236fcb | 995 | #endif /* CONFIG_BLOCK */ |
b9e281c2 ID |
996 | case CEPH_MSG_DATA_BVECS: |
997 | ceph_msg_data_bvecs_cursor_init(cursor, length); | |
998 | break; | |
6aaa4511 | 999 | case CEPH_MSG_DATA_NONE: |
dd236fcb AE |
1000 | default: |
1001 | /* BUG(); */ | |
1002 | break; | |
1003 | } | |
8ae4f4f5 | 1004 | cursor->need_crc = true; |
dd236fcb AE |
1005 | } |
1006 | ||
6503e0b6 ID |
1007 | void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, |
1008 | struct ceph_msg *msg, size_t length) | |
ca8b3a69 | 1009 | { |
ca8b3a69 AE |
1010 | BUG_ON(!length); |
1011 | BUG_ON(length > msg->data_length); | |
0d9c1ab3 | 1012 | BUG_ON(!msg->num_data_items); |
ca8b3a69 | 1013 | |
ca8b3a69 | 1014 | cursor->total_resid = length; |
0d9c1ab3 | 1015 | cursor->data = msg->data; |
ca8b3a69 AE |
1016 | |
1017 | __ceph_msg_data_cursor_init(cursor); | |
1018 | } | |
1019 | ||
dd236fcb AE |
1020 | /* |
1021 | * Return the page containing the next piece to process for a given | |
1022 | * data item, and supply the page offset and length of that piece. | |
1023 | * Indicate whether this is the last piece in this data item. | |
1024 | */ | |
6503e0b6 | 1025 | struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, |
da4ab869 | 1026 | size_t *page_offset, size_t *length) |
dd236fcb AE |
1027 | { |
1028 | struct page *page; | |
1029 | ||
8ae4f4f5 | 1030 | switch (cursor->data->type) { |
dd236fcb | 1031 | case CEPH_MSG_DATA_PAGELIST: |
8ae4f4f5 | 1032 | page = ceph_msg_data_pagelist_next(cursor, page_offset, length); |
dd236fcb | 1033 | break; |
e766d7b5 | 1034 | case CEPH_MSG_DATA_PAGES: |
8ae4f4f5 | 1035 | page = ceph_msg_data_pages_next(cursor, page_offset, length); |
e766d7b5 | 1036 | break; |
dd236fcb AE |
1037 | #ifdef CONFIG_BLOCK |
1038 | case CEPH_MSG_DATA_BIO: | |
8ae4f4f5 | 1039 | page = ceph_msg_data_bio_next(cursor, page_offset, length); |
6aaa4511 | 1040 | break; |
dd236fcb | 1041 | #endif /* CONFIG_BLOCK */ |
b9e281c2 ID |
1042 | case CEPH_MSG_DATA_BVECS: |
1043 | page = ceph_msg_data_bvecs_next(cursor, page_offset, length); | |
1044 | break; | |
6aaa4511 | 1045 | case CEPH_MSG_DATA_NONE: |
dd236fcb AE |
1046 | default: |
1047 | page = NULL; | |
1048 | break; | |
1049 | } | |
5359a17d | 1050 | |
dd236fcb AE |
1051 | BUG_ON(!page); |
1052 | BUG_ON(*page_offset + *length > PAGE_SIZE); | |
1053 | BUG_ON(!*length); | |
5359a17d | 1054 | BUG_ON(*length > cursor->resid); |
dd236fcb AE |
1055 | |
1056 | return page; | |
1057 | } | |
1058 | ||
1059 | /* | |
1060 | * Returns true if the result moves the cursor on to the next piece | |
1061 | * of the data item. | |
1062 | */ | |
6503e0b6 | 1063 | void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) |
dd236fcb AE |
1064 | { |
1065 | bool new_piece; | |
1066 | ||
25aff7c5 | 1067 | BUG_ON(bytes > cursor->resid); |
8ae4f4f5 | 1068 | switch (cursor->data->type) { |
dd236fcb | 1069 | case CEPH_MSG_DATA_PAGELIST: |
8ae4f4f5 | 1070 | new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); |
dd236fcb | 1071 | break; |
e766d7b5 | 1072 | case CEPH_MSG_DATA_PAGES: |
8ae4f4f5 | 1073 | new_piece = ceph_msg_data_pages_advance(cursor, bytes); |
e766d7b5 | 1074 | break; |
dd236fcb AE |
1075 | #ifdef CONFIG_BLOCK |
1076 | case CEPH_MSG_DATA_BIO: | |
8ae4f4f5 | 1077 | new_piece = ceph_msg_data_bio_advance(cursor, bytes); |
6aaa4511 | 1078 | break; |
dd236fcb | 1079 | #endif /* CONFIG_BLOCK */ |
b9e281c2 ID |
1080 | case CEPH_MSG_DATA_BVECS: |
1081 | new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); | |
1082 | break; | |
6aaa4511 | 1083 | case CEPH_MSG_DATA_NONE: |
dd236fcb AE |
1084 | default: |
1085 | BUG(); | |
1086 | break; | |
1087 | } | |
ca8b3a69 | 1088 | cursor->total_resid -= bytes; |
dd236fcb | 1089 | |
ca8b3a69 | 1090 | if (!cursor->resid && cursor->total_resid) { |
0d9c1ab3 | 1091 | cursor->data++; |
ca8b3a69 | 1092 | __ceph_msg_data_cursor_init(cursor); |
a51b272e | 1093 | new_piece = true; |
ca8b3a69 | 1094 | } |
a51b272e | 1095 | cursor->need_crc = new_piece; |
dd236fcb AE |
1096 | } |
1097 | ||
6503e0b6 ID |
1098 | u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, |
1099 | unsigned int length) | |
35b62808 AE |
1100 | { |
1101 | char *kaddr; | |
1102 | ||
1103 | kaddr = kmap(page); | |
1104 | BUG_ON(kaddr == NULL); | |
1105 | crc = crc32c(crc, kaddr + page_offset, length); | |
1106 | kunmap(page); | |
1107 | ||
1108 | return crc; | |
1109 | } | |
31b8006e | 1110 | |
6503e0b6 | 1111 | bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) |
31b8006e | 1112 | { |
cede185b JL |
1113 | struct sockaddr_storage ss = addr->in_addr; /* align */ |
1114 | struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; | |
1115 | struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; | |
c44bd69c | 1116 | |
cede185b | 1117 | switch (ss.ss_family) { |
31b8006e | 1118 | case AF_INET: |
cede185b | 1119 | return addr4->s_addr == htonl(INADDR_ANY); |
31b8006e | 1120 | case AF_INET6: |
c44bd69c ID |
1121 | return ipv6_addr_any(addr6); |
1122 | default: | |
1123 | return true; | |
31b8006e | 1124 | } |
31b8006e SW |
1125 | } |
1126 | ||
6503e0b6 | 1127 | int ceph_addr_port(const struct ceph_entity_addr *addr) |
31b8006e | 1128 | { |
cede185b | 1129 | switch (get_unaligned(&addr->in_addr.ss_family)) { |
31b8006e | 1130 | case AF_INET: |
cede185b | 1131 | return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); |
31b8006e | 1132 | case AF_INET6: |
cede185b | 1133 | return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); |
31b8006e SW |
1134 | } |
1135 | return 0; | |
1136 | } | |
1137 | ||
6503e0b6 | 1138 | void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) |
31b8006e | 1139 | { |
cede185b | 1140 | switch (get_unaligned(&addr->in_addr.ss_family)) { |
31b8006e | 1141 | case AF_INET: |
cede185b | 1142 | put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); |
a2a79609 | 1143 | break; |
31b8006e | 1144 | case AF_INET6: |
cede185b | 1145 | put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); |
a2a79609 | 1146 | break; |
31b8006e SW |
1147 | } |
1148 | } | |
1149 | ||
ee3b56f2 NW |
1150 | /* |
1151 | * Unlike other *_pton function semantics, zero indicates success. | |
1152 | */ | |
cede185b | 1153 | static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, |
ee3b56f2 NW |
1154 | char delim, const char **ipend) |
1155 | { | |
cede185b | 1156 | memset(&addr->in_addr, 0, sizeof(addr->in_addr)); |
ee3b56f2 | 1157 | |
cede185b JL |
1158 | if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { |
1159 | put_unaligned(AF_INET, &addr->in_addr.ss_family); | |
ee3b56f2 NW |
1160 | return 0; |
1161 | } | |
1162 | ||
cede185b JL |
1163 | if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { |
1164 | put_unaligned(AF_INET6, &addr->in_addr.ss_family); | |
ee3b56f2 NW |
1165 | return 0; |
1166 | } | |
1167 | ||
1168 | return -EINVAL; | |
1169 | } | |
1170 | ||
1171 | /* | |
1172 | * Extract hostname string and resolve using kernel DNS facility. | |
1173 | */ | |
1174 | #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER | |
1175 | static int ceph_dns_resolve_name(const char *name, size_t namelen, | |
cede185b | 1176 | struct ceph_entity_addr *addr, char delim, const char **ipend) |
ee3b56f2 NW |
1177 | { |
1178 | const char *end, *delim_p; | |
1179 | char *colon_p, *ip_addr = NULL; | |
1180 | int ip_len, ret; | |
1181 | ||
1182 | /* | |
1183 | * The end of the hostname occurs immediately preceding the delimiter or | |
1184 | * the port marker (':') where the delimiter takes precedence. | |
1185 | */ | |
1186 | delim_p = memchr(name, delim, namelen); | |
1187 | colon_p = memchr(name, ':', namelen); | |
1188 | ||
1189 | if (delim_p && colon_p) | |
1190 | end = delim_p < colon_p ? delim_p : colon_p; | |
1191 | else if (!delim_p && colon_p) | |
1192 | end = colon_p; | |
1193 | else { | |
1194 | end = delim_p; | |
1195 | if (!end) /* case: hostname:/ */ | |
1196 | end = name + namelen; | |
1197 | } | |
1198 | ||
1199 | if (end <= name) | |
1200 | return -EINVAL; | |
1201 | ||
1202 | /* do dns_resolve upcall */ | |
a58946c1 DH |
1203 | ip_len = dns_query(current->nsproxy->net_ns, |
1204 | NULL, name, end - name, NULL, &ip_addr, NULL, false); | |
ee3b56f2 | 1205 | if (ip_len > 0) |
cede185b | 1206 | ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); |
ee3b56f2 NW |
1207 | else |
1208 | ret = -ESRCH; | |
1209 | ||
1210 | kfree(ip_addr); | |
1211 | ||
1212 | *ipend = end; | |
1213 | ||
1214 | pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, | |
b726ec97 | 1215 | ret, ret ? "failed" : ceph_pr_addr(addr)); |
ee3b56f2 NW |
1216 | |
1217 | return ret; | |
1218 | } | |
1219 | #else | |
1220 | static inline int ceph_dns_resolve_name(const char *name, size_t namelen, | |
cede185b | 1221 | struct ceph_entity_addr *addr, char delim, const char **ipend) |
ee3b56f2 NW |
1222 | { |
1223 | return -EINVAL; | |
1224 | } | |
1225 | #endif | |
1226 | ||
1227 | /* | |
1228 | * Parse a server name (IP or hostname). If a valid IP address is not found | |
1229 | * then try to extract a hostname to resolve using userspace DNS upcall. | |
1230 | */ | |
1231 | static int ceph_parse_server_name(const char *name, size_t namelen, | |
cede185b | 1232 | struct ceph_entity_addr *addr, char delim, const char **ipend) |
ee3b56f2 NW |
1233 | { |
1234 | int ret; | |
1235 | ||
cede185b | 1236 | ret = ceph_pton(name, namelen, addr, delim, ipend); |
ee3b56f2 | 1237 | if (ret) |
cede185b | 1238 | ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); |
ee3b56f2 NW |
1239 | |
1240 | return ret; | |
1241 | } | |
1242 | ||
31b8006e SW |
1243 | /* |
1244 | * Parse an ip[:port] list into an addr array. Use the default | |
1245 | * monitor port if a port isn't specified. | |
1246 | */ | |
1247 | int ceph_parse_ips(const char *c, const char *end, | |
1248 | struct ceph_entity_addr *addr, | |
2d7c86a8 | 1249 | int max_count, int *count, char delim) |
31b8006e | 1250 | { |
ee3b56f2 | 1251 | int i, ret = -EINVAL; |
31b8006e SW |
1252 | const char *p = c; |
1253 | ||
1254 | dout("parse_ips on '%.*s'\n", (int)(end-c), c); | |
1255 | for (i = 0; i < max_count; i++) { | |
2d7c86a8 | 1256 | char cur_delim = delim; |
31b8006e | 1257 | const char *ipend; |
31b8006e | 1258 | int port; |
39139f64 SW |
1259 | |
1260 | if (*p == '[') { | |
2d7c86a8 | 1261 | cur_delim = ']'; |
39139f64 SW |
1262 | p++; |
1263 | } | |
31b8006e | 1264 | |
2d7c86a8 VS |
1265 | ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, |
1266 | &ipend); | |
ee3b56f2 | 1267 | if (ret) |
31b8006e | 1268 | goto bad; |
ee3b56f2 NW |
1269 | ret = -EINVAL; |
1270 | ||
31b8006e SW |
1271 | p = ipend; |
1272 | ||
2d7c86a8 | 1273 | if (cur_delim == ']') { |
39139f64 SW |
1274 | if (*p != ']') { |
1275 | dout("missing matching ']'\n"); | |
1276 | goto bad; | |
1277 | } | |
1278 | p++; | |
1279 | } | |
1280 | ||
31b8006e SW |
1281 | /* port? */ |
1282 | if (p < end && *p == ':') { | |
1283 | port = 0; | |
1284 | p++; | |
1285 | while (p < end && *p >= '0' && *p <= '9') { | |
1286 | port = (port * 10) + (*p - '0'); | |
1287 | p++; | |
1288 | } | |
f48db1e9 ID |
1289 | if (port == 0) |
1290 | port = CEPH_MON_PORT; | |
1291 | else if (port > 65535) | |
31b8006e SW |
1292 | goto bad; |
1293 | } else { | |
1294 | port = CEPH_MON_PORT; | |
1295 | } | |
1296 | ||
6503e0b6 | 1297 | ceph_addr_set_port(&addr[i], port); |
cd1a677c ID |
1298 | /* |
1299 | * We want the type to be set according to ms_mode | |
1300 | * option, but options are normally parsed after mon | |
1301 | * addresses. Rather than complicating parsing, set | |
1302 | * to LEGACY and override in build_initial_monmap() | |
1303 | * for mon addresses and ceph_messenger_init() for | |
1304 | * ip option. | |
1305 | */ | |
d3c3c0a8 | 1306 | addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; |
cd1a677c | 1307 | addr[i].nonce = 0; |
31b8006e | 1308 | |
2d7c86a8 | 1309 | dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); |
31b8006e SW |
1310 | |
1311 | if (p == end) | |
1312 | break; | |
2d7c86a8 | 1313 | if (*p != delim) |
31b8006e SW |
1314 | goto bad; |
1315 | p++; | |
1316 | } | |
1317 | ||
1318 | if (p != end) | |
1319 | goto bad; | |
1320 | ||
1321 | if (count) | |
1322 | *count = i + 1; | |
1323 | return 0; | |
1324 | ||
1325 | bad: | |
ee3b56f2 | 1326 | return ret; |
31b8006e SW |
1327 | } |
1328 | ||
31b8006e SW |
1329 | /* |
1330 | * Process message. This happens in the worker thread. The callback should | |
1331 | * be careful not to do anything that waits on other incoming messages or it | |
1332 | * may deadlock. | |
1333 | */ | |
6503e0b6 | 1334 | void ceph_con_process_message(struct ceph_connection *con) |
31b8006e | 1335 | { |
583d0fef | 1336 | struct ceph_msg *msg = con->in_msg; |
31b8006e | 1337 | |
38941f80 | 1338 | BUG_ON(con->in_msg->con != con); |
31b8006e SW |
1339 | con->in_msg = NULL; |
1340 | ||
1341 | /* if first message, set peer_name */ | |
1342 | if (con->peer_name.type == 0) | |
dbad185d | 1343 | con->peer_name = msg->hdr.src; |
31b8006e | 1344 | |
31b8006e | 1345 | con->in_seq++; |
ec302645 | 1346 | mutex_unlock(&con->mutex); |
31b8006e | 1347 | |
b77f8f0e | 1348 | dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", |
31b8006e | 1349 | msg, le64_to_cpu(msg->hdr.seq), |
dbad185d | 1350 | ENTITY_NAME(msg->hdr.src), |
31b8006e SW |
1351 | le16_to_cpu(msg->hdr.type), |
1352 | ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), | |
1353 | le32_to_cpu(msg->hdr.front_len), | |
b77f8f0e | 1354 | le32_to_cpu(msg->hdr.middle_len), |
31b8006e SW |
1355 | le32_to_cpu(msg->hdr.data_len), |
1356 | con->in_front_crc, con->in_middle_crc, con->in_data_crc); | |
1357 | con->ops->dispatch(con, msg); | |
ec302645 SW |
1358 | |
1359 | mutex_lock(&con->mutex); | |
31b8006e SW |
1360 | } |
1361 | ||
31b8006e | 1362 | /* |
802c6d96 AE |
1363 | * Atomically queue work on a connection after the specified delay. |
1364 | * Bump @con reference to avoid races with connection teardown. | |
1365 | * Returns 0 if work was queued, or an error code otherwise. | |
31b8006e | 1366 | */ |
802c6d96 | 1367 | static int queue_con_delay(struct ceph_connection *con, unsigned long delay) |
31b8006e | 1368 | { |
31b8006e | 1369 | if (!con->ops->get(con)) { |
802c6d96 | 1370 | dout("%s %p ref count 0\n", __func__, con); |
802c6d96 | 1371 | return -ENOENT; |
31b8006e SW |
1372 | } |
1373 | ||
418af5b3 ID |
1374 | if (delay >= HZ) |
1375 | delay = round_jiffies_relative(delay); | |
1376 | ||
5a5036c8 | 1377 | dout("%s %p %lu\n", __func__, con, delay); |
802c6d96 AE |
1378 | if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { |
1379 | dout("%s %p - already queued\n", __func__, con); | |
31b8006e | 1380 | con->ops->put(con); |
802c6d96 | 1381 | return -EBUSY; |
31b8006e | 1382 | } |
802c6d96 | 1383 | |
802c6d96 AE |
1384 | return 0; |
1385 | } | |
1386 | ||
/* Queue connection work with no delay; the result is deliberately ignored. */
static void queue_con(struct ceph_connection *con)
{
	(void)queue_con_delay(con, 0);
}
1391 | ||
37ab77ac ID |
1392 | static void cancel_con(struct ceph_connection *con) |
1393 | { | |
1394 | if (cancel_delayed_work(&con->work)) { | |
1395 | dout("%s %p\n", __func__, con); | |
1396 | con->ops->put(con); | |
1397 | } | |
1398 | } | |
1399 | ||
7bb21d68 AE |
1400 | static bool con_sock_closed(struct ceph_connection *con) |
1401 | { | |
6503e0b6 | 1402 | if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) |
7bb21d68 AE |
1403 | return false; |
1404 | ||
1405 | #define CASE(x) \ | |
6d7f62bf | 1406 | case CEPH_CON_S_ ## x: \ |
7bb21d68 AE |
1407 | con->error_msg = "socket closed (con state " #x ")"; \ |
1408 | break; | |
1409 | ||
1410 | switch (con->state) { | |
1411 | CASE(CLOSED); | |
1412 | CASE(PREOPEN); | |
6d7f62bf ID |
1413 | CASE(V1_BANNER); |
1414 | CASE(V1_CONNECT_MSG); | |
cd1a677c ID |
1415 | CASE(V2_BANNER_PREFIX); |
1416 | CASE(V2_BANNER_PAYLOAD); | |
1417 | CASE(V2_HELLO); | |
1418 | CASE(V2_AUTH); | |
1419 | CASE(V2_AUTH_SIGNATURE); | |
1420 | CASE(V2_SESSION_CONNECT); | |
1421 | CASE(V2_SESSION_RECONNECT); | |
7bb21d68 AE |
1422 | CASE(OPEN); |
1423 | CASE(STANDBY); | |
1424 | default: | |
7bb21d68 | 1425 | BUG(); |
7bb21d68 AE |
1426 | } |
1427 | #undef CASE | |
1428 | ||
1429 | return true; | |
1430 | } | |
1431 | ||
f20a39fd AE |
1432 | static bool con_backoff(struct ceph_connection *con) |
1433 | { | |
1434 | int ret; | |
1435 | ||
6503e0b6 | 1436 | if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) |
f20a39fd AE |
1437 | return false; |
1438 | ||
418af5b3 | 1439 | ret = queue_con_delay(con, con->delay); |
f20a39fd AE |
1440 | if (ret) { |
1441 | dout("%s: con %p FAILED to back off %lu\n", __func__, | |
1442 | con, con->delay); | |
1443 | BUG_ON(ret == -ENOENT); | |
6503e0b6 | 1444 | ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); |
f20a39fd AE |
1445 | } |
1446 | ||
1447 | return true; | |
1448 | } | |
1449 | ||
93209264 AE |
1450 | /* Finish fault handling; con->mutex must *not* be held here */ |
1451 | ||
1452 | static void con_fault_finish(struct ceph_connection *con) | |
1453 | { | |
f6330cc1 ID |
1454 | dout("%s %p\n", __func__, con); |
1455 | ||
93209264 AE |
1456 | /* |
1457 | * in case we faulted due to authentication, invalidate our | |
1458 | * current tickets so that we can get new ones. | |
1459 | */ | |
a56dd9bf ID |
1460 | if (con->v1.auth_retry) { |
1461 | dout("auth_retry %d, invalidating\n", con->v1.auth_retry); | |
f6330cc1 ID |
1462 | if (con->ops->invalidate_authorizer) |
1463 | con->ops->invalidate_authorizer(con); | |
a56dd9bf | 1464 | con->v1.auth_retry = 0; |
93209264 AE |
1465 | } |
1466 | ||
1467 | if (con->ops->fault) | |
1468 | con->ops->fault(con); | |
1469 | } | |
1470 | ||
31b8006e SW |
1471 | /* |
1472 | * Do some work on a connection. Drop a connection ref when we're done. | |
1473 | */ | |
68931622 | 1474 | static void ceph_con_workfn(struct work_struct *work) |
31b8006e SW |
1475 | { |
1476 | struct ceph_connection *con = container_of(work, struct ceph_connection, | |
1477 | work.work); | |
49659416 | 1478 | bool fault; |
31b8006e | 1479 | |
9dd4658d | 1480 | mutex_lock(&con->mutex); |
49659416 AE |
1481 | while (true) { |
1482 | int ret; | |
31b8006e | 1483 | |
49659416 AE |
1484 | if ((fault = con_sock_closed(con))) { |
1485 | dout("%s: con %p SOCK_CLOSED\n", __func__, con); | |
1486 | break; | |
1487 | } | |
1488 | if (con_backoff(con)) { | |
1489 | dout("%s: con %p BACKOFF\n", __func__, con); | |
1490 | break; | |
1491 | } | |
6d7f62bf | 1492 | if (con->state == CEPH_CON_S_STANDBY) { |
49659416 AE |
1493 | dout("%s: con %p STANDBY\n", __func__, con); |
1494 | break; | |
1495 | } | |
6d7f62bf | 1496 | if (con->state == CEPH_CON_S_CLOSED) { |
49659416 AE |
1497 | dout("%s: con %p CLOSED\n", __func__, con); |
1498 | BUG_ON(con->sock); | |
1499 | break; | |
1500 | } | |
6d7f62bf | 1501 | if (con->state == CEPH_CON_S_PREOPEN) { |
49659416 AE |
1502 | dout("%s: con %p PREOPEN\n", __func__, con); |
1503 | BUG_ON(con->sock); | |
1504 | } | |
0da5d703 | 1505 | |
cd1a677c ID |
1506 | if (ceph_msgr2(from_msgr(con->msgr))) |
1507 | ret = ceph_con_v2_try_read(con); | |
1508 | else | |
1509 | ret = ceph_con_v1_try_read(con); | |
49659416 AE |
1510 | if (ret < 0) { |
1511 | if (ret == -EAGAIN) | |
1512 | continue; | |
67c64eb7 ID |
1513 | if (!con->error_msg) |
1514 | con->error_msg = "socket error on read"; | |
49659416 AE |
1515 | fault = true; |
1516 | break; | |
1517 | } | |
1518 | ||
cd1a677c ID |
1519 | if (ceph_msgr2(from_msgr(con->msgr))) |
1520 | ret = ceph_con_v2_try_write(con); | |
1521 | else | |
1522 | ret = ceph_con_v1_try_write(con); | |
49659416 AE |
1523 | if (ret < 0) { |
1524 | if (ret == -EAGAIN) | |
1525 | continue; | |
67c64eb7 ID |
1526 | if (!con->error_msg) |
1527 | con->error_msg = "socket error on write"; | |
49659416 AE |
1528 | fault = true; |
1529 | } | |
1530 | ||
1531 | break; /* If we make it to here, we're done */ | |
3a140a0d | 1532 | } |
b6e7b6a1 AE |
1533 | if (fault) |
1534 | con_fault(con); | |
9dd4658d | 1535 | mutex_unlock(&con->mutex); |
0da5d703 | 1536 | |
b6e7b6a1 AE |
1537 | if (fault) |
1538 | con_fault_finish(con); | |
1539 | ||
1540 | con->ops->put(con); | |
31b8006e SW |
1541 | } |
1542 | ||
31b8006e SW |
1543 | /* |
1544 | * Generic error/fault handler. A retry mechanism is used with | |
1545 | * exponential backoff | |
1546 | */ | |
93209264 | 1547 | static void con_fault(struct ceph_connection *con) |
31b8006e | 1548 | { |
30be780a | 1549 | dout("fault %p state %d to peer %s\n", |
b726ec97 | 1550 | con, con->state, ceph_pr_addr(&con->peer_addr)); |
31b8006e | 1551 | |
67c64eb7 | 1552 | pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), |
b726ec97 | 1553 | ceph_pr_addr(&con->peer_addr), con->error_msg); |
67c64eb7 ID |
1554 | con->error_msg = NULL; |
1555 | ||
cd1a677c ID |
1556 | WARN_ON(con->state == CEPH_CON_S_STANDBY || |
1557 | con->state == CEPH_CON_S_CLOSED); | |
ec302645 | 1558 | |
3596f4c1 | 1559 | ceph_con_reset_protocol(con); |
5e095e8b | 1560 | |
6503e0b6 | 1561 | if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { |
8dacc7da | 1562 | dout("fault on LOSSYTX channel, marking CLOSED\n"); |
6d7f62bf | 1563 | con->state = CEPH_CON_S_CLOSED; |
93209264 | 1564 | return; |
3b5ede07 SW |
1565 | } |
1566 | ||
e80a52d1 SW |
1567 | /* Requeue anything that hasn't been acked */ |
1568 | list_splice_init(&con->out_sent, &con->out_queue); | |
9bd2e6f8 | 1569 | |
e76661d0 SW |
1570 | /* If there are no messages queued or keepalive pending, place |
1571 | * the connection in a STANDBY state */ | |
1572 | if (list_empty(&con->out_queue) && | |
6503e0b6 | 1573 | !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { |
e00de341 | 1574 | dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); |
6503e0b6 | 1575 | ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); |
6d7f62bf | 1576 | con->state = CEPH_CON_S_STANDBY; |
e80a52d1 SW |
1577 | } else { |
1578 | /* retry after a delay. */ | |
6d7f62bf | 1579 | con->state = CEPH_CON_S_PREOPEN; |
418af5b3 | 1580 | if (!con->delay) { |
e80a52d1 | 1581 | con->delay = BASE_DELAY_INTERVAL; |
418af5b3 | 1582 | } else if (con->delay < MAX_DELAY_INTERVAL) { |
e80a52d1 | 1583 | con->delay *= 2; |
418af5b3 ID |
1584 | if (con->delay > MAX_DELAY_INTERVAL) |
1585 | con->delay = MAX_DELAY_INTERVAL; | |
1586 | } | |
6503e0b6 | 1587 | ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); |
8618e30b | 1588 | queue_con(con); |
31b8006e | 1589 | } |
31b8006e SW |
1590 | } |
1591 | ||
120a75ea YZ |
1592 | void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) |
1593 | { | |
1594 | u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; | |
1595 | msgr->inst.addr.nonce = cpu_to_le32(nonce); | |
6503e0b6 | 1596 | ceph_encode_my_addr(msgr); |
120a75ea | 1597 | } |
31b8006e SW |
1598 | |
1599 | /* | |
15d9882c | 1600 | * initialize a new messenger instance |
31b8006e | 1601 | */ |
15d9882c | 1602 | void ceph_messenger_init(struct ceph_messenger *msgr, |
859bff51 | 1603 | struct ceph_entity_addr *myaddr) |
31b8006e | 1604 | { |
31b8006e SW |
1605 | spin_lock_init(&msgr->global_seq_lock); |
1606 | ||
fd1a154c ID |
1607 | if (myaddr) { |
1608 | memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, | |
1609 | sizeof(msgr->inst.addr.in_addr)); | |
6503e0b6 | 1610 | ceph_addr_set_port(&msgr->inst.addr, 0); |
fd1a154c | 1611 | } |
31b8006e | 1612 | |
cd1a677c ID |
1613 | /* |
1614 | * Since nautilus, clients are identified using type ANY. | |
1615 | * For msgr1, ceph_encode_banner_addr() munges it to NONE. | |
1616 | */ | |
1617 | msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; | |
fd1a154c ID |
1618 | |
1619 | /* generate a random non-zero nonce */ | |
1620 | do { | |
1621 | get_random_bytes(&msgr->inst.addr.nonce, | |
1622 | sizeof(msgr->inst.addr.nonce)); | |
1623 | } while (!msgr->inst.addr.nonce); | |
6503e0b6 | 1624 | ceph_encode_my_addr(msgr); |
31b8006e | 1625 | |
a2a32584 | 1626 | atomic_set(&msgr->stopping, 0); |
757856d2 | 1627 | write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); |
31b8006e | 1628 | |
15d9882c | 1629 | dout("%s %p\n", __func__, msgr); |
31b8006e SW |
1630 | } |
1631 | ||
757856d2 ID |
1632 | void ceph_messenger_fini(struct ceph_messenger *msgr) |
1633 | { | |
1634 | put_net(read_pnet(&msgr->net)); | |
1635 | } | |
757856d2 | 1636 | |
583d0fef ID |
1637 | static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) |
1638 | { | |
1639 | if (msg->con) | |
1640 | msg->con->ops->put(msg->con); | |
1641 | ||
1642 | msg->con = con ? con->ops->get(con) : NULL; | |
1643 | BUG_ON(msg->con != con); | |
1644 | } | |
1645 | ||
e00de341 SW |
1646 | static void clear_standby(struct ceph_connection *con) |
1647 | { | |
1648 | /* come back from STANDBY? */ | |
6d7f62bf | 1649 | if (con->state == CEPH_CON_S_STANDBY) { |
e00de341 | 1650 | dout("clear_standby %p and ++connect_seq\n", con); |
6d7f62bf | 1651 | con->state = CEPH_CON_S_PREOPEN; |
a56dd9bf | 1652 | con->v1.connect_seq++; |
6503e0b6 ID |
1653 | WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); |
1654 | WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); | |
e00de341 SW |
1655 | } |
1656 | } | |
1657 | ||
31b8006e SW |
1658 | /* |
1659 | * Queue up an outgoing message on the given connection. | |
771294fe ID |
1660 | * |
1661 | * Consumes a ref on @msg. | |
31b8006e SW |
1662 | */ |
1663 | void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) | |
1664 | { | |
31b8006e | 1665 | /* set src+dst */ |
dbad185d | 1666 | msg->hdr.src = con->msgr->inst.name; |
3ca02ef9 | 1667 | BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); |
e84346b7 SW |
1668 | msg->needs_out_seq = true; |
1669 | ||
ec302645 | 1670 | mutex_lock(&con->mutex); |
92ce034b | 1671 | |
6d7f62bf | 1672 | if (con->state == CEPH_CON_S_CLOSED) { |
a59b55a6 SW |
1673 | dout("con_send %p closed, dropping %p\n", con, msg); |
1674 | ceph_msg_put(msg); | |
1675 | mutex_unlock(&con->mutex); | |
1676 | return; | |
1677 | } | |
1678 | ||
583d0fef | 1679 | msg_con_set(msg, con); |
92ce034b | 1680 | |
31b8006e SW |
1681 | BUG_ON(!list_empty(&msg->list_head)); |
1682 | list_add_tail(&msg->list_head, &con->out_queue); | |
1683 | dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, | |
1684 | ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), | |
1685 | ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), | |
1686 | le32_to_cpu(msg->hdr.front_len), | |
1687 | le32_to_cpu(msg->hdr.middle_len), | |
1688 | le32_to_cpu(msg->hdr.data_len)); | |
00650931 SW |
1689 | |
1690 | clear_standby(con); | |
ec302645 | 1691 | mutex_unlock(&con->mutex); |
31b8006e SW |
1692 | |
1693 | /* if there wasn't anything waiting to send before, queue | |
1694 | * new work */ | |
6503e0b6 | 1695 | if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) |
31b8006e SW |
1696 | queue_con(con); |
1697 | } | |
3d14c5d2 | 1698 | EXPORT_SYMBOL(ceph_con_send); |
31b8006e SW |
1699 | |
1700 | /* | |
1701 | * Revoke a message that was previously queued for send | |
1702 | */ | |
6740a845 | 1703 | void ceph_msg_revoke(struct ceph_msg *msg) |
31b8006e | 1704 | { |
6740a845 AE |
1705 | struct ceph_connection *con = msg->con; |
1706 | ||
583d0fef ID |
1707 | if (!con) { |
1708 | dout("%s msg %p null con\n", __func__, msg); | |
6740a845 | 1709 | return; /* Message not in our possession */ |
583d0fef | 1710 | } |
6740a845 | 1711 | |
ec302645 | 1712 | mutex_lock(&con->mutex); |
566050e1 ID |
1713 | if (list_empty(&msg->list_head)) { |
1714 | WARN_ON(con->out_msg == msg); | |
1715 | dout("%s con %p msg %p not linked\n", __func__, con, msg); | |
1716 | mutex_unlock(&con->mutex); | |
1717 | return; | |
ed98adad | 1718 | } |
67645d76 | 1719 | |
566050e1 ID |
1720 | dout("%s con %p msg %p was linked\n", __func__, con, msg); |
1721 | msg->hdr.seq = 0; | |
1722 | ceph_msg_remove(msg); | |
1723 | ||
1724 | if (con->out_msg == msg) { | |
1725 | WARN_ON(con->state != CEPH_CON_S_OPEN); | |
1726 | dout("%s con %p msg %p was sending\n", __func__, con, msg); | |
cd1a677c ID |
1727 | if (ceph_msgr2(from_msgr(con->msgr))) |
1728 | ceph_con_v2_revoke(con); | |
1729 | else | |
1730 | ceph_con_v1_revoke(con); | |
566050e1 | 1731 | ceph_msg_put(con->out_msg); |
67645d76 | 1732 | con->out_msg = NULL; |
566050e1 ID |
1733 | } else { |
1734 | dout("%s con %p msg %p not current, out_msg %p\n", __func__, | |
1735 | con, msg, con->out_msg); | |
31b8006e | 1736 | } |
ec302645 | 1737 | mutex_unlock(&con->mutex); |
31b8006e SW |
1738 | } |
1739 | ||
350b1c32 | 1740 | /* |
0d59ab81 | 1741 | * Revoke a message that we may be reading data into |
350b1c32 | 1742 | */ |
8921d114 | 1743 | void ceph_msg_revoke_incoming(struct ceph_msg *msg) |
350b1c32 | 1744 | { |
583d0fef | 1745 | struct ceph_connection *con = msg->con; |
8921d114 | 1746 | |
583d0fef | 1747 | if (!con) { |
8921d114 | 1748 | dout("%s msg %p null con\n", __func__, msg); |
8921d114 AE |
1749 | return; /* Message not in our possession */ |
1750 | } | |
1751 | ||
350b1c32 | 1752 | mutex_lock(&con->mutex); |
8921d114 | 1753 | if (con->in_msg == msg) { |
566050e1 ID |
1754 | WARN_ON(con->state != CEPH_CON_S_OPEN); |
1755 | dout("%s con %p msg %p was recving\n", __func__, con, msg); | |
cd1a677c ID |
1756 | if (ceph_msgr2(from_msgr(con->msgr))) |
1757 | ceph_con_v2_revoke_incoming(con); | |
1758 | else | |
1759 | ceph_con_v1_revoke_incoming(con); | |
350b1c32 SW |
1760 | ceph_msg_put(con->in_msg); |
1761 | con->in_msg = NULL; | |
350b1c32 | 1762 | } else { |
566050e1 ID |
1763 | dout("%s con %p msg %p not current, in_msg %p\n", __func__, |
1764 | con, msg, con->in_msg); | |
350b1c32 SW |
1765 | } |
1766 | mutex_unlock(&con->mutex); | |
1767 | } | |
1768 | ||
31b8006e SW |
1769 | /* |
1770 | * Queue a keepalive byte to ensure the tcp connection is alive. | |
1771 | */ | |
1772 | void ceph_con_keepalive(struct ceph_connection *con) | |
1773 | { | |
e00de341 | 1774 | dout("con_keepalive %p\n", con); |
00650931 | 1775 | mutex_lock(&con->mutex); |
e00de341 | 1776 | clear_standby(con); |
6503e0b6 | 1777 | ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); |
00650931 | 1778 | mutex_unlock(&con->mutex); |
4aac9228 | 1779 | |
6503e0b6 | 1780 | if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) |
31b8006e SW |
1781 | queue_con(con); |
1782 | } | |
3d14c5d2 | 1783 | EXPORT_SYMBOL(ceph_con_keepalive); |
31b8006e | 1784 | |
8b9558aa YZ |
1785 | bool ceph_con_keepalive_expired(struct ceph_connection *con, |
1786 | unsigned long interval) | |
1787 | { | |
1788 | if (interval > 0 && | |
1789 | (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { | |
473bd2d7 AB |
1790 | struct timespec64 now; |
1791 | struct timespec64 ts; | |
1792 | ktime_get_real_ts64(&now); | |
1793 | jiffies_to_timespec64(interval, &ts); | |
1794 | ts = timespec64_add(con->last_keepalive_ack, ts); | |
1795 | return timespec64_compare(&now, &ts) >= 0; | |
8b9558aa YZ |
1796 | } |
1797 | return false; | |
1798 | } | |
1799 | ||
0d9c1ab3 | 1800 | static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) |
43794509 | 1801 | { |
0d9c1ab3 ID |
1802 | BUG_ON(msg->num_data_items >= msg->max_data_items); |
1803 | return &msg->data[msg->num_data_items++]; | |
6644ed7b AE |
1804 | } |
1805 | ||
1806 | static void ceph_msg_data_destroy(struct ceph_msg_data *data) | |
1807 | { | |
e8862740 ID |
1808 | if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { |
1809 | int num_pages = calc_pages_for(data->alignment, data->length); | |
1810 | ceph_release_page_vector(data->pages, num_pages); | |
1811 | } else if (data->type == CEPH_MSG_DATA_PAGELIST) { | |
6644ed7b | 1812 | ceph_pagelist_release(data->pagelist); |
e8862740 | 1813 | } |
43794509 AE |
1814 | } |
1815 | ||
90af3602 | 1816 | void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, |
e8862740 | 1817 | size_t length, size_t alignment, bool own_pages) |
02afca6c | 1818 | { |
6644ed7b AE |
1819 | struct ceph_msg_data *data; |
1820 | ||
07aa1558 AE |
1821 | BUG_ON(!pages); |
1822 | BUG_ON(!length); | |
6644ed7b | 1823 | |
0d9c1ab3 ID |
1824 | data = ceph_msg_data_add(msg); |
1825 | data->type = CEPH_MSG_DATA_PAGES; | |
6644ed7b AE |
1826 | data->pages = pages; |
1827 | data->length = length; | |
1828 | data->alignment = alignment & ~PAGE_MASK; | |
e8862740 | 1829 | data->own_pages = own_pages; |
02afca6c | 1830 | |
5240d9f9 | 1831 | msg->data_length += length; |
02afca6c | 1832 | } |
90af3602 | 1833 | EXPORT_SYMBOL(ceph_msg_data_add_pages); |
31b8006e | 1834 | |
90af3602 | 1835 | void ceph_msg_data_add_pagelist(struct ceph_msg *msg, |
27fa8385 AE |
1836 | struct ceph_pagelist *pagelist) |
1837 | { | |
6644ed7b AE |
1838 | struct ceph_msg_data *data; |
1839 | ||
07aa1558 AE |
1840 | BUG_ON(!pagelist); |
1841 | BUG_ON(!pagelist->length); | |
27fa8385 | 1842 | |
0d9c1ab3 ID |
1843 | data = ceph_msg_data_add(msg); |
1844 | data->type = CEPH_MSG_DATA_PAGELIST; | |
89486833 | 1845 | refcount_inc(&pagelist->refcnt); |
6644ed7b AE |
1846 | data->pagelist = pagelist; |
1847 | ||
5240d9f9 | 1848 | msg->data_length += pagelist->length; |
27fa8385 | 1849 | } |
90af3602 | 1850 | EXPORT_SYMBOL(ceph_msg_data_add_pagelist); |
27fa8385 | 1851 | |
#ifdef CONFIG_BLOCK
/*
 * Attach @length bytes described by the bio iterator @bio_pos to @msg
 * as its next data item.  The iterator is copied by value.
 */
void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
			   u32 length)
{
	struct ceph_msg_data *item = ceph_msg_data_add(msg);

	item->type = CEPH_MSG_DATA_BIO;
	item->bio_length = length;
	item->bio_pos = *bio_pos;

	msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
#endif	/* CONFIG_BLOCK */
27fa8385 | 1867 | |
b9e281c2 ID |
1868 | void ceph_msg_data_add_bvecs(struct ceph_msg *msg, |
1869 | struct ceph_bvec_iter *bvec_pos) | |
1870 | { | |
1871 | struct ceph_msg_data *data; | |
1872 | ||
0d9c1ab3 ID |
1873 | data = ceph_msg_data_add(msg); |
1874 | data->type = CEPH_MSG_DATA_BVECS; | |
b9e281c2 ID |
1875 | data->bvec_pos = *bvec_pos; |
1876 | ||
b9e281c2 ID |
1877 | msg->data_length += bvec_pos->iter.bi_size; |
1878 | } | |
1879 | EXPORT_SYMBOL(ceph_msg_data_add_bvecs); | |
1880 | ||
31b8006e SW |
1881 | /* |
1882 | * construct a new message with given type, size | |
1883 | * the new msg has a ref count of 1. | |
1884 | */ | |
0d9c1ab3 ID |
1885 | struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, |
1886 | gfp_t flags, bool can_fail) | |
31b8006e SW |
1887 | { |
1888 | struct ceph_msg *m; | |
1889 | ||
e3d5d638 | 1890 | m = kmem_cache_zalloc(ceph_msg_cache, flags); |
31b8006e SW |
1891 | if (m == NULL) |
1892 | goto out; | |
31b8006e SW |
1893 | |
1894 | m->hdr.type = cpu_to_le16(type); | |
45c6ceb5 | 1895 | m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); |
31b8006e | 1896 | m->hdr.front_len = cpu_to_le32(front_len); |
ca20892d | 1897 | |
9516e45b AE |
1898 | INIT_LIST_HEAD(&m->list_head); |
1899 | kref_init(&m->kref); | |
ca20892d | 1900 | |
31b8006e SW |
1901 | /* front */ |
1902 | if (front_len) { | |
a421ef30 | 1903 | m->front.iov_base = kvmalloc(front_len, flags); |
31b8006e | 1904 | if (m->front.iov_base == NULL) { |
b61c2763 | 1905 | dout("ceph_msg_new can't allocate %d bytes\n", |
31b8006e SW |
1906 | front_len); |
1907 | goto out2; | |
1908 | } | |
1909 | } else { | |
1910 | m->front.iov_base = NULL; | |
1911 | } | |
f2be82b0 | 1912 | m->front_alloc_len = m->front.iov_len = front_len; |
31b8006e | 1913 | |
0d9c1ab3 ID |
1914 | if (max_data_items) { |
1915 | m->data = kmalloc_array(max_data_items, sizeof(*m->data), | |
1916 | flags); | |
1917 | if (!m->data) | |
1918 | goto out2; | |
1919 | ||
1920 | m->max_data_items = max_data_items; | |
1921 | } | |
1922 | ||
bb257664 | 1923 | dout("ceph_msg_new %p front %d\n", m, front_len); |
31b8006e SW |
1924 | return m; |
1925 | ||
1926 | out2: | |
1927 | ceph_msg_put(m); | |
1928 | out: | |
b61c2763 SW |
1929 | if (!can_fail) { |
1930 | pr_err("msg_new can't create type %d front %d\n", type, | |
1931 | front_len); | |
f0ed1b7c | 1932 | WARN_ON(1); |
b61c2763 SW |
1933 | } else { |
1934 | dout("msg_new can't create type %d front %d\n", type, | |
1935 | front_len); | |
1936 | } | |
a79832f2 | 1937 | return NULL; |
31b8006e | 1938 | } |
0d9c1ab3 ID |
1939 | EXPORT_SYMBOL(ceph_msg_new2); |
1940 | ||
1941 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |
1942 | bool can_fail) | |
1943 | { | |
1944 | return ceph_msg_new2(type, front_len, 0, flags, can_fail); | |
1945 | } | |
3d14c5d2 | 1946 | EXPORT_SYMBOL(ceph_msg_new); |
31b8006e | 1947 | |
31b8006e SW |
1948 | /* |
1949 | * Allocate "middle" portion of a message, if it is needed and wasn't | |
1950 | * allocated by alloc_msg. This allows us to read a small fixed-size | |
1951 | * per-type header in the front and then gracefully fail (i.e., | |
1952 | * propagate the error to the caller based on info in the front) when | |
1953 | * the middle is too large. | |
1954 | */ | |
2450418c | 1955 | static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) |
31b8006e SW |
1956 | { |
1957 | int type = le16_to_cpu(msg->hdr.type); | |
1958 | int middle_len = le32_to_cpu(msg->hdr.middle_len); | |
1959 | ||
1960 | dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, | |
1961 | ceph_msg_type_name(type), middle_len); | |
1962 | BUG_ON(!middle_len); | |
1963 | BUG_ON(msg->middle); | |
1964 | ||
b6c1d5b8 | 1965 | msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); |
31b8006e SW |
1966 | if (!msg->middle) |
1967 | return -ENOMEM; | |
1968 | return 0; | |
1969 | } | |
1970 | ||
2450418c | 1971 | /* |
1c20f2d2 AE |
1972 | * Allocate a message for receiving an incoming message on a |
1973 | * connection, and save the result in con->in_msg. Uses the | |
1974 | * connection's private alloc_msg op if available. | |
1975 | * | |
4740a623 SW |
1976 | * Returns 0 on success, or a negative error code. |
1977 | * | |
1978 | * On success, if we set *skip = 1: | |
1979 | * - the next message should be skipped and ignored. | |
1980 | * - con->in_msg == NULL | |
1981 | * or if we set *skip = 0: | |
1982 | * - con->in_msg is non-null. | |
1983 | * On error (ENOMEM, EAGAIN, ...), | |
1984 | * - con->in_msg == NULL | |
2450418c | 1985 | */ |
6503e0b6 ID |
1986 | int ceph_con_in_msg_alloc(struct ceph_connection *con, |
1987 | struct ceph_msg_header *hdr, int *skip) | |
2450418c | 1988 | { |
2450418c | 1989 | int middle_len = le32_to_cpu(hdr->middle_len); |
1d866d1c | 1990 | struct ceph_msg *msg; |
4740a623 | 1991 | int ret = 0; |
2450418c | 1992 | |
1c20f2d2 | 1993 | BUG_ON(con->in_msg != NULL); |
53ded495 | 1994 | BUG_ON(!con->ops->alloc_msg); |
2450418c | 1995 | |
53ded495 AE |
1996 | mutex_unlock(&con->mutex); |
1997 | msg = con->ops->alloc_msg(con, hdr, skip); | |
1998 | mutex_lock(&con->mutex); | |
6d7f62bf | 1999 | if (con->state != CEPH_CON_S_OPEN) { |
53ded495 | 2000 | if (msg) |
1d866d1c | 2001 | ceph_msg_put(msg); |
53ded495 AE |
2002 | return -EAGAIN; |
2003 | } | |
4137577a AE |
2004 | if (msg) { |
2005 | BUG_ON(*skip); | |
583d0fef | 2006 | msg_con_set(msg, con); |
4137577a | 2007 | con->in_msg = msg; |
4137577a AE |
2008 | } else { |
2009 | /* | |
2010 | * Null message pointer means either we should skip | |
2011 | * this message or we couldn't allocate memory. The | |
2012 | * former is not an error. | |
2013 | */ | |
2014 | if (*skip) | |
2015 | return 0; | |
4137577a | 2016 | |
67c64eb7 | 2017 | con->error_msg = "error allocating memory for incoming message"; |
53ded495 | 2018 | return -ENOMEM; |
2450418c | 2019 | } |
fc4c128e | 2020 | memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); |
2450418c | 2021 | |
1c20f2d2 AE |
2022 | if (middle_len && !con->in_msg->middle) { |
2023 | ret = ceph_alloc_middle(con, con->in_msg); | |
2450418c | 2024 | if (ret < 0) { |
1c20f2d2 AE |
2025 | ceph_msg_put(con->in_msg); |
2026 | con->in_msg = NULL; | |
2450418c YS |
2027 | } |
2028 | } | |
9d7f0f13 | 2029 | |
4740a623 | 2030 | return ret; |
2450418c YS |
2031 | } |
2032 | ||
6503e0b6 | 2033 | void ceph_con_get_out_msg(struct ceph_connection *con) |
771294fe ID |
2034 | { |
2035 | struct ceph_msg *msg; | |
2036 | ||
2037 | BUG_ON(list_empty(&con->out_queue)); | |
2038 | msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); | |
2039 | WARN_ON(msg->con != con); | |
2040 | ||
2041 | /* | |
2042 | * Put the message on "sent" list using a ref from ceph_con_send(). | |
2043 | * It is put when the message is acked or revoked. | |
2044 | */ | |
2045 | list_move_tail(&msg->list_head, &con->out_sent); | |
2046 | ||
2047 | /* | |
2048 | * Only assign outgoing seq # if we haven't sent this message | |
2049 | * yet. If it is requeued, resend with it's original seq. | |
2050 | */ | |
2051 | if (msg->needs_out_seq) { | |
2052 | msg->hdr.seq = cpu_to_le64(++con->out_seq); | |
2053 | msg->needs_out_seq = false; | |
2054 | ||
2055 | if (con->ops->reencode_message) | |
2056 | con->ops->reencode_message(msg); | |
2057 | } | |
2058 | ||
2059 | /* | |
2060 | * Get a ref for out_msg. It is put when we are done sending the | |
2061 | * message or in case of a fault. | |
2062 | */ | |
2063 | WARN_ON(con->out_msg); | |
2064 | con->out_msg = ceph_msg_get(msg); | |
2065 | } | |
31b8006e SW |
2066 | |
2067 | /* | |
2068 | * Free a generically kmalloc'd message. | |
2069 | */ | |
0215e44b | 2070 | static void ceph_msg_free(struct ceph_msg *m) |
31b8006e | 2071 | { |
0215e44b | 2072 | dout("%s %p\n", __func__, m); |
4965fc38 | 2073 | kvfree(m->front.iov_base); |
0d9c1ab3 | 2074 | kfree(m->data); |
e3d5d638 | 2075 | kmem_cache_free(ceph_msg_cache, m); |
31b8006e SW |
2076 | } |
2077 | ||
0215e44b | 2078 | static void ceph_msg_release(struct kref *kref) |
c2e552e7 SW |
2079 | { |
2080 | struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); | |
0d9c1ab3 | 2081 | int i; |
31b8006e | 2082 | |
0215e44b | 2083 | dout("%s %p\n", __func__, m); |
c2e552e7 SW |
2084 | WARN_ON(!list_empty(&m->list_head)); |
2085 | ||
583d0fef ID |
2086 | msg_con_set(m, NULL); |
2087 | ||
c2e552e7 SW |
2088 | /* drop middle, data, if any */ |
2089 | if (m->middle) { | |
2090 | ceph_buffer_put(m->middle); | |
2091 | m->middle = NULL; | |
31b8006e | 2092 | } |
5240d9f9 | 2093 | |
0d9c1ab3 ID |
2094 | for (i = 0; i < m->num_data_items; i++) |
2095 | ceph_msg_data_destroy(&m->data[i]); | |
58bb3b37 | 2096 | |
c2e552e7 SW |
2097 | if (m->pool) |
2098 | ceph_msgpool_put(m->pool, m); | |
2099 | else | |
0215e44b ID |
2100 | ceph_msg_free(m); |
2101 | } | |
2102 | ||
2103 | struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) | |
2104 | { | |
2105 | dout("%s %p (was %d)\n", __func__, msg, | |
2c935bc5 | 2106 | kref_read(&msg->kref)); |
0215e44b ID |
2107 | kref_get(&msg->kref); |
2108 | return msg; | |
2109 | } | |
2110 | EXPORT_SYMBOL(ceph_msg_get); | |
2111 | ||
2112 | void ceph_msg_put(struct ceph_msg *msg) | |
2113 | { | |
2114 | dout("%s %p (was %d)\n", __func__, msg, | |
2c935bc5 | 2115 | kref_read(&msg->kref)); |
0215e44b | 2116 | kref_put(&msg->kref, ceph_msg_release); |
31b8006e | 2117 | } |
0215e44b | 2118 | EXPORT_SYMBOL(ceph_msg_put); |
9ec7cab1 SW |
2119 | |
2120 | void ceph_msg_dump(struct ceph_msg *msg) | |
2121 | { | |
3cea4c30 ID |
2122 | pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, |
2123 | msg->front_alloc_len, msg->data_length); | |
9ec7cab1 SW |
2124 | print_hex_dump(KERN_DEBUG, "header: ", |
2125 | DUMP_PREFIX_OFFSET, 16, 1, | |
2126 | &msg->hdr, sizeof(msg->hdr), true); | |
2127 | print_hex_dump(KERN_DEBUG, " front: ", | |
2128 | DUMP_PREFIX_OFFSET, 16, 1, | |
2129 | msg->front.iov_base, msg->front.iov_len, true); | |
2130 | if (msg->middle) | |
2131 | print_hex_dump(KERN_DEBUG, "middle: ", | |
2132 | DUMP_PREFIX_OFFSET, 16, 1, | |
2133 | msg->middle->vec.iov_base, | |
2134 | msg->middle->vec.iov_len, true); | |
2135 | print_hex_dump(KERN_DEBUG, "footer: ", | |
2136 | DUMP_PREFIX_OFFSET, 16, 1, | |
2137 | &msg->footer, sizeof(msg->footer), true); | |
2138 | } | |
3d14c5d2 | 2139 | EXPORT_SYMBOL(ceph_msg_dump); |