Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
952310cc UB |
2 | /* |
3 | * Shared Memory Communications over RDMA (SMC-R) and RoCE | |
4 | * | |
5 | * Manage RMBE | |
6 | * copy new RMBE data into user space | |
7 | * | |
8 | * Copyright IBM Corp. 2016 | |
9 | * | |
10 | * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> | |
11 | */ | |
12 | ||
13 | #include <linux/net.h> | |
14 | #include <linux/rcupdate.h> | |
c3edc401 IM |
15 | #include <linux/sched/signal.h> |
16 | ||
952310cc UB |
17 | #include <net/sock.h> |
18 | ||
19 | #include "smc.h" | |
20 | #include "smc_core.h" | |
21 | #include "smc_cdc.h" | |
22 | #include "smc_tx.h" /* smc_tx_consumer_update() */ | |
23 | #include "smc_rx.h" | |
24 | ||
b51fa1b1 | 25 | /* callback implementation to wakeup consumers blocked with smc_rx_wait(). |
952310cc UB |
26 | * indirectly called by smc_cdc_msg_recv_action(). |
27 | */ | |
b51fa1b1 | 28 | static void smc_rx_wake_up(struct sock *sk) |
952310cc UB |
29 | { |
30 | struct socket_wq *wq; | |
31 | ||
32 | /* derived from sock_def_readable() */ | |
33 | /* called already in smc_listen_work() */ | |
34 | rcu_read_lock(); | |
35 | wq = rcu_dereference(sk->sk_wq); | |
36 | if (skwq_has_sleeper(wq)) | |
a9a08845 LT |
37 | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | |
38 | EPOLLRDNORM | EPOLLRDBAND); | |
90e9517e | 39 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); |
952310cc UB |
40 | if ((sk->sk_shutdown == SHUTDOWN_MASK) || |
41 | (sk->sk_state == SMC_CLOSED)) | |
42 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); | |
952310cc UB |
43 | rcu_read_unlock(); |
44 | } | |
45 | ||
9014db20 SR |
46 | /* Update consumer cursor |
47 | * @conn connection to update | |
48 | * @cons consumer cursor | |
49 | * @len number of Bytes consumed | |
de8474eb SR |
50 | * Returns: |
51 | * 1 if we should end our receive, 0 otherwise | |
9014db20 | 52 | */ |
de8474eb SR |
53 | static int smc_rx_update_consumer(struct smc_sock *smc, |
54 | union smc_host_cursor cons, size_t len) | |
9014db20 | 55 | { |
de8474eb SR |
56 | struct smc_connection *conn = &smc->conn; |
57 | struct sock *sk = &smc->sk; | |
58 | bool force = false; | |
59 | int diff, rc = 0; | |
60 | ||
69cb7dc0 | 61 | smc_curs_add(conn->rmb_desc->len, &cons, len); |
de8474eb SR |
62 | |
63 | /* did we process urgent data? */ | |
64 | if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) { | |
65 | diff = smc_curs_comp(conn->rmb_desc->len, &cons, | |
66 | &conn->urg_curs); | |
67 | if (sock_flag(sk, SOCK_URGINLINE)) { | |
68 | if (diff == 0) { | |
69 | force = true; | |
70 | rc = 1; | |
71 | conn->urg_state = SMC_URG_READ; | |
72 | } | |
73 | } else { | |
74 | if (diff == 1) { | |
75 | /* skip urgent byte */ | |
76 | force = true; | |
77 | smc_curs_add(conn->rmb_desc->len, &cons, 1); | |
78 | conn->urg_rx_skip_pend = false; | |
79 | } else if (diff < -1) | |
80 | /* we read past urgent byte */ | |
81 | conn->urg_state = SMC_URG_READ; | |
82 | } | |
83 | } | |
84 | ||
bac6de7b | 85 | smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn); |
de8474eb | 86 | |
9014db20 SR |
87 | /* send consumer cursor update if required */ |
88 | /* similar to advertising new TCP rcv_wnd if required */ | |
de8474eb SR |
89 | smc_tx_consumer_update(conn, force); |
90 | ||
91 | return rc; | |
92 | } | |
93 | ||
94 | static void smc_rx_update_cons(struct smc_sock *smc, size_t len) | |
95 | { | |
96 | struct smc_connection *conn = &smc->conn; | |
97 | union smc_host_cursor cons; | |
98 | ||
bac6de7b | 99 | smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); |
de8474eb | 100 | smc_rx_update_consumer(smc, cons, len); |
9014db20 SR |
101 | } |
102 | ||
103 | struct smc_spd_priv { | |
104 | struct smc_sock *smc; | |
105 | size_t len; | |
106 | }; | |
107 | ||
108 | static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, | |
109 | struct pipe_buffer *buf) | |
110 | { | |
111 | struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private; | |
112 | struct smc_sock *smc = priv->smc; | |
113 | struct smc_connection *conn; | |
9014db20 SR |
114 | struct sock *sk = &smc->sk; |
115 | ||
116 | if (sk->sk_state == SMC_CLOSED || | |
117 | sk->sk_state == SMC_PEERFINCLOSEWAIT || | |
118 | sk->sk_state == SMC_APPFINCLOSEWAIT) | |
119 | goto out; | |
120 | conn = &smc->conn; | |
121 | lock_sock(sk); | |
de8474eb | 122 | smc_rx_update_cons(smc, priv->len); |
9014db20 SR |
123 | release_sock(sk); |
124 | if (atomic_sub_and_test(priv->len, &conn->splice_pending)) | |
125 | smc_rx_wake_up(sk); | |
126 | out: | |
127 | kfree(priv); | |
128 | put_page(buf->page); | |
129 | sock_put(sk); | |
130 | } | |
131 | ||
/* ->steal op of smc_pipe_ops: unconditionally returns 1
 * (presumably telling the pipe code that the page cannot be stolen).
 */
static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	const int no_steal = 1;

	return no_steal;
}
137 | ||
138 | static const struct pipe_buf_operations smc_pipe_ops = { | |
9014db20 SR |
139 | .confirm = generic_pipe_buf_confirm, |
140 | .release = smc_rx_pipe_buf_release, | |
141 | .steal = smc_rx_pipe_buf_nosteal, | |
142 | .get = generic_pipe_buf_get | |
143 | }; | |
144 | ||
145 | static void smc_rx_spd_release(struct splice_pipe_desc *spd, | |
146 | unsigned int i) | |
147 | { | |
148 | put_page(spd->pages[i]); | |
149 | } | |
150 | ||
151 | static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, | |
152 | struct smc_sock *smc) | |
153 | { | |
154 | struct splice_pipe_desc spd; | |
155 | struct partial_page partial; | |
156 | struct smc_spd_priv *priv; | |
9014db20 SR |
157 | int bytes; |
158 | ||
9014db20 SR |
159 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); |
160 | if (!priv) | |
161 | return -ENOMEM; | |
162 | priv->len = len; | |
163 | priv->smc = smc; | |
164 | partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; | |
165 | partial.len = len; | |
166 | partial.private = (unsigned long)priv; | |
167 | ||
168 | spd.nr_pages_max = 1; | |
169 | spd.nr_pages = 1; | |
48bf5231 | 170 | spd.pages = &smc->conn.rmb_desc->pages; |
9014db20 SR |
171 | spd.partial = &partial; |
172 | spd.ops = &smc_pipe_ops; | |
173 | spd.spd_release = smc_rx_spd_release; | |
174 | ||
175 | bytes = splice_to_pipe(pipe, &spd); | |
176 | if (bytes > 0) { | |
177 | sock_hold(&smc->sk); | |
178 | get_page(smc->conn.rmb_desc->pages); | |
179 | atomic_add(bytes, &smc->conn.splice_pending); | |
180 | } | |
181 | ||
182 | return bytes; | |
183 | } | |
184 | ||
185 | static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) | |
186 | { | |
187 | return atomic_read(&conn->bytes_to_rcv) && | |
188 | !atomic_read(&conn->splice_pending); | |
189 | } | |
190 | ||
952310cc UB |
191 | /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted |
192 | * @smc smc socket | |
193 | * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout | |
b51fa1b1 | 194 | * @fcrit add'l criterion to evaluate as function pointer |
952310cc UB |
195 | * Returns: |
196 | * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. | |
197 | * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). | |
198 | */ | |
b51fa1b1 SR |
199 | int smc_rx_wait(struct smc_sock *smc, long *timeo, |
200 | int (*fcrit)(struct smc_connection *conn)) | |
952310cc UB |
201 | { |
202 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | |
203 | struct smc_connection *conn = &smc->conn; | |
b2900980 UB |
204 | struct smc_cdc_conn_state_flags *cflags = |
205 | &conn->local_tx_ctrl.conn_state_flags; | |
952310cc UB |
206 | struct sock *sk = &smc->sk; |
207 | int rc; | |
208 | ||
b51fa1b1 | 209 | if (fcrit(conn)) |
952310cc UB |
210 | return 1; |
211 | sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); | |
212 | add_wait_queue(sk_sleep(sk), &wait); | |
213 | rc = sk_wait_event(sk, timeo, | |
214 | sk->sk_err || | |
b2900980 | 215 | cflags->peer_conn_abort || |
952310cc | 216 | sk->sk_shutdown & RCV_SHUTDOWN || |
b2900980 | 217 | conn->killed || |
882dcfe5 | 218 | fcrit(conn), |
952310cc UB |
219 | &wait); |
220 | remove_wait_queue(sk_sleep(sk), &wait); | |
221 | sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); | |
222 | return rc; | |
223 | } | |
224 | ||
de8474eb SR |
225 | static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, |
226 | int flags) | |
227 | { | |
228 | struct smc_connection *conn = &smc->conn; | |
229 | union smc_host_cursor cons; | |
230 | struct sock *sk = &smc->sk; | |
231 | int rc = 0; | |
232 | ||
233 | if (sock_flag(sk, SOCK_URGINLINE) || | |
234 | !(conn->urg_state == SMC_URG_VALID) || | |
235 | conn->urg_state == SMC_URG_READ) | |
236 | return -EINVAL; | |
237 | ||
238 | if (conn->urg_state == SMC_URG_VALID) { | |
239 | if (!(flags & MSG_PEEK)) | |
240 | smc->conn.urg_state = SMC_URG_READ; | |
241 | msg->msg_flags |= MSG_OOB; | |
242 | if (len > 0) { | |
243 | if (!(flags & MSG_TRUNC)) | |
244 | rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); | |
245 | len = 1; | |
bac6de7b | 246 | smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); |
de8474eb SR |
247 | if (smc_curs_diff(conn->rmb_desc->len, &cons, |
248 | &conn->urg_curs) > 1) | |
249 | conn->urg_rx_skip_pend = true; | |
250 | /* Urgent Byte was already accounted for, but trigger | |
251 | * skipping the urgent byte in non-inline case | |
252 | */ | |
253 | if (!(flags & MSG_PEEK)) | |
254 | smc_rx_update_consumer(smc, cons, 0); | |
255 | } else { | |
256 | msg->msg_flags |= MSG_TRUNC; | |
257 | } | |
258 | ||
259 | return rc ? -EFAULT : len; | |
260 | } | |
261 | ||
262 | if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN) | |
263 | return 0; | |
264 | ||
265 | return -EAGAIN; | |
266 | } | |
267 | ||
107529e3 KG |
268 | static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) |
269 | { | |
270 | struct smc_connection *conn = &smc->conn; | |
271 | ||
272 | if (smc_rx_data_available(conn)) | |
273 | return true; | |
274 | else if (conn->urg_state == SMC_URG_VALID) | |
275 | /* we received a single urgent Byte - skip */ | |
276 | smc_rx_update_cons(smc, 0); | |
277 | return false; | |
278 | } | |
279 | ||
9014db20 SR |
280 | /* smc_rx_recvmsg - receive data from RMBE |
281 | * @msg: copy data to receive buffer | |
282 | * @pipe: copy data to pipe if set - indicates splice() call | |
283 | * | |
284 | * rcvbuf consumer: main API called by socket layer. | |
285 | * Called under sk lock. | |
952310cc | 286 | */ |
9014db20 SR |
287 | int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, |
288 | struct pipe_inode_info *pipe, size_t len, int flags) | |
952310cc UB |
289 | { |
290 | size_t copylen, read_done = 0, read_remaining = len; | |
291 | size_t chunk_len, chunk_off, chunk_len_sum; | |
292 | struct smc_connection *conn = &smc->conn; | |
9014db20 | 293 | int (*func)(struct smc_connection *conn); |
952310cc UB |
294 | union smc_host_cursor cons; |
295 | int readable, chunk; | |
296 | char *rcvbuf_base; | |
297 | struct sock *sk; | |
9014db20 | 298 | int splbytes; |
952310cc UB |
299 | long timeo; |
300 | int target; /* Read at least these many bytes */ | |
301 | int rc; | |
302 | ||
303 | if (unlikely(flags & MSG_ERRQUEUE)) | |
304 | return -EINVAL; /* future work for sk.sk_family == AF_SMC */ | |
952310cc UB |
305 | |
306 | sk = &smc->sk; | |
307 | if (sk->sk_state == SMC_LISTEN) | |
308 | return -ENOTCONN; | |
de8474eb SR |
309 | if (flags & MSG_OOB) |
310 | return smc_rx_recv_urg(smc, msg, len, flags); | |
952310cc UB |
311 | timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); |
312 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); | |
313 | ||
952310cc | 314 | /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ |
be244f28 | 315 | rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; |
952310cc UB |
316 | |
317 | do { /* while (read_remaining) */ | |
9014db20 | 318 | if (read_done >= target || (pipe && read_done)) |
952310cc UB |
319 | break; |
320 | ||
b2900980 UB |
321 | if (conn->killed) |
322 | break; | |
323 | ||
107529e3 | 324 | if (smc_rx_recvmsg_data_available(smc)) |
952310cc UB |
325 | goto copy; |
326 | ||
b2900980 | 327 | if (sk->sk_shutdown & RCV_SHUTDOWN) { |
107529e3 KG |
328 | /* smc_cdc_msg_recv_action() could have run after |
329 | * above smc_rx_recvmsg_data_available() | |
330 | */ | |
331 | if (smc_rx_recvmsg_data_available(smc)) | |
332 | goto copy; | |
c8b8ec8e | 333 | break; |
107529e3 | 334 | } |
c8b8ec8e | 335 | |
952310cc UB |
336 | if (read_done) { |
337 | if (sk->sk_err || | |
338 | sk->sk_state == SMC_CLOSED || | |
952310cc | 339 | !timeo || |
c8b8ec8e | 340 | signal_pending(current)) |
952310cc UB |
341 | break; |
342 | } else { | |
952310cc UB |
343 | if (sk->sk_err) { |
344 | read_done = sock_error(sk); | |
345 | break; | |
346 | } | |
952310cc UB |
347 | if (sk->sk_state == SMC_CLOSED) { |
348 | if (!sock_flag(sk, SOCK_DONE)) { | |
349 | /* This occurs when user tries to read | |
350 | * from never connected socket. | |
351 | */ | |
352 | read_done = -ENOTCONN; | |
353 | break; | |
354 | } | |
355 | break; | |
356 | } | |
357 | if (signal_pending(current)) { | |
358 | read_done = sock_intr_errno(timeo); | |
359 | break; | |
360 | } | |
846e344e HW |
361 | if (!timeo) |
362 | return -EAGAIN; | |
952310cc UB |
363 | } |
364 | ||
b51fa1b1 SR |
365 | if (!smc_rx_data_available(conn)) { |
366 | smc_rx_wait(smc, &timeo, smc_rx_data_available); | |
952310cc UB |
367 | continue; |
368 | } | |
369 | ||
370 | copy: | |
371 | /* initialize variables for 1st iteration of subsequent loop */ | |
b51fa1b1 | 372 | /* could be just 1 byte, even after waiting on data above */ |
952310cc | 373 | readable = atomic_read(&conn->bytes_to_rcv); |
9014db20 SR |
374 | splbytes = atomic_read(&conn->splice_pending); |
375 | if (!readable || (msg && splbytes)) { | |
376 | if (splbytes) | |
377 | func = smc_rx_data_available_and_no_splice_pend; | |
378 | else | |
379 | func = smc_rx_data_available; | |
380 | smc_rx_wait(smc, &timeo, func); | |
381 | continue; | |
382 | } | |
383 | ||
bac6de7b | 384 | smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); |
9014db20 SR |
385 | /* subsequent splice() calls pick up where previous left */ |
386 | if (splbytes) | |
69cb7dc0 | 387 | smc_curs_add(conn->rmb_desc->len, &cons, splbytes); |
de8474eb SR |
388 | if (conn->urg_state == SMC_URG_VALID && |
389 | sock_flag(&smc->sk, SOCK_URGINLINE) && | |
390 | readable > 1) | |
391 | readable--; /* always stop at urgent Byte */ | |
392 | /* not more than what user space asked for */ | |
393 | copylen = min_t(size_t, read_remaining, readable); | |
952310cc UB |
394 | /* determine chunks where to read from rcvbuf */ |
395 | /* either unwrapped case, or 1st chunk of wrapped case */ | |
69cb7dc0 HW |
396 | chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - |
397 | cons.count); | |
952310cc UB |
398 | chunk_len_sum = chunk_len; |
399 | chunk_off = cons.count; | |
10428dd8 | 400 | smc_rmb_sync_sg_for_cpu(conn); |
952310cc UB |
401 | for (chunk = 0; chunk < 2; chunk++) { |
402 | if (!(flags & MSG_TRUNC)) { | |
9014db20 SR |
403 | if (msg) { |
404 | rc = memcpy_to_msg(msg, rcvbuf_base + | |
405 | chunk_off, | |
406 | chunk_len); | |
407 | } else { | |
408 | rc = smc_rx_splice(pipe, rcvbuf_base + | |
409 | chunk_off, chunk_len, | |
410 | smc); | |
411 | } | |
412 | if (rc < 0) { | |
952310cc UB |
413 | if (!read_done) |
414 | read_done = -EFAULT; | |
10428dd8 | 415 | smc_rmb_sync_sg_for_device(conn); |
952310cc UB |
416 | goto out; |
417 | } | |
418 | } | |
419 | read_remaining -= chunk_len; | |
420 | read_done += chunk_len; | |
421 | ||
422 | if (chunk_len_sum == copylen) | |
423 | break; /* either on 1st or 2nd iteration */ | |
424 | /* prepare next (== 2nd) iteration */ | |
425 | chunk_len = copylen - chunk_len; /* remainder */ | |
426 | chunk_len_sum += chunk_len; | |
427 | chunk_off = 0; /* modulo offset in recv ring buffer */ | |
428 | } | |
10428dd8 | 429 | smc_rmb_sync_sg_for_device(conn); |
952310cc UB |
430 | |
431 | /* update cursors */ | |
432 | if (!(flags & MSG_PEEK)) { | |
952310cc UB |
433 | /* increased in recv tasklet smc_cdc_msg_rcv() */ |
434 | smp_mb__before_atomic(); | |
435 | atomic_sub(copylen, &conn->bytes_to_rcv); | |
69cb7dc0 | 436 | /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ |
952310cc | 437 | smp_mb__after_atomic(); |
de8474eb SR |
438 | if (msg && smc_rx_update_consumer(smc, cons, copylen)) |
439 | goto out; | |
952310cc UB |
440 | } |
441 | } while (read_remaining); | |
442 | out: | |
443 | return read_done; | |
444 | } | |
445 | ||
446 | /* Initialize receive properties on connection establishment. NB: not __init! */ | |
447 | void smc_rx_init(struct smc_sock *smc) | |
448 | { | |
b51fa1b1 | 449 | smc->sk.sk_data_ready = smc_rx_wake_up; |
9014db20 | 450 | atomic_set(&smc->conn.splice_pending, 0); |
de8474eb | 451 | smc->conn.urg_state = SMC_URG_READ; |
952310cc | 452 | } |