Commit | Line | Data |
---|---|---|
00e0f34c AG |
1 | /* |
2 | * Copyright (c) 2006 Oracle. All rights reserved. | |
3 | * | |
4 | * This software is available to you under a choice of one of two | |
5 | * licenses. You may choose to be licensed under the terms of the GNU | |
6 | * General Public License (GPL) Version 2, available from the file | |
7 | * COPYING in the main directory of this source tree, or the | |
8 | * OpenIB.org BSD license below: | |
9 | * | |
10 | * Redistribution and use in source and binary forms, with or | |
11 | * without modification, are permitted provided that the following | |
12 | * conditions are met: | |
13 | * | |
14 | * - Redistributions of source code must retain the above | |
15 | * copyright notice, this list of conditions and the following | |
16 | * disclaimer. | |
17 | * | |
18 | * - Redistributions in binary form must reproduce the above | |
19 | * copyright notice, this list of conditions and the following | |
20 | * disclaimer in the documentation and/or other materials | |
21 | * provided with the distribution. | |
22 | * | |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
30 | * SOFTWARE. | |
31 | * | |
32 | */ | |
33 | #include <linux/kernel.h> | |
34 | #include <linux/random.h> | |
bc3b2d7f | 35 | #include <linux/export.h> |
00e0f34c AG |
36 | |
37 | #include "rds.h" | |
38 | ||
39 | /* | |
40 | * All of connection management is simplified by serializing it through | |
41 | * work queues that execute in a connection managing thread. | |
42 | * | |
43 | * TCP wants to send acks through sendpage() in response to data_ready(), | |
44 | * but it needs a process context to do so. | |
45 | * | |
46 | * The receive paths need to allocate but can't drop packets (!) so we have | |
47 | * a thread around to block allocating if the receive fast path sees an | |
48 | * allocation failure. | |
49 | */ | |
50 | ||
51 | /* Grand Unified Theory of connection life cycle: | |
52 | * At any point in time, the connection can be in one of these states: | |
53 | * DOWN, CONNECTING, UP, DISCONNECTING, ERROR | |
54 | * | |
55 | * The following transitions are possible: | |
56 | * ANY -> ERROR | |
57 | * UP -> DISCONNECTING | |
58 | * ERROR -> DISCONNECTING | |
59 | * DISCONNECTING -> DOWN | |
60 | * DOWN -> CONNECTING | |
61 | * CONNECTING -> UP | |
62 | * | |
63 | * Transition to state DISCONNECTING/DOWN: | |
64 | * - Inside the shutdown worker; synchronizes with xmit path | |
0f4b1c7e | 65 | * through RDS_IN_XMIT, and with connection management callbacks |
00e0f34c AG |
66 | * via c_cm_lock. |
67 | * | |
68 | * For receive callbacks, we rely on the underlying transport | |
69 | * (TCP, IB/RDMA) to provide the necessary synchronisation. | |
70 | */ | |
/* Workqueue ("krdsd") that the delayed connect, send and receive work in this file is queued on; created by rds_threads_init() and destroyed by rds_threads_exit(). */
71 | struct workqueue_struct *rds_wq; |
616b757a | 72 | EXPORT_SYMBOL_GPL(rds_wq); |
00e0f34c | 73 | |
0cb43965 | 74 | void rds_connect_path_complete(struct rds_conn_path *cp, int curr) |
00e0f34c | 75 | { |
0cb43965 | 76 | if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { |
00e0f34c AG |
77 | printk(KERN_WARNING "%s: Cannot transition to state UP, " |
78 | "current state is %d\n", | |
79 | __func__, | |
0cb43965 | 80 | atomic_read(&cp->cp_state)); |
aed20a53 | 81 | rds_conn_path_drop(cp, false); |
00e0f34c AG |
82 | return; |
83 | } | |
84 | ||
85 | rdsdebug("conn %p for %pI4 to %pI4 complete\n", | |
0cb43965 | 86 | cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); |
00e0f34c | 87 | |
0cb43965 SV |
88 | cp->cp_reconnect_jiffies = 0; |
89 | set_bit(0, &cp->cp_conn->c_map_queued); | |
3db6e0d1 | 90 | rcu_read_lock(); |
ebeeb1ad | 91 | if (!rds_destroy_pending(cp->cp_conn)) { |
3db6e0d1 SV |
92 | queue_delayed_work(rds_wq, &cp->cp_send_w, 0); |
93 | queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); | |
94 | } | |
95 | rcu_read_unlock(); | |
00e0f34c | 96 | } |
9c79440e SV |
97 | EXPORT_SYMBOL_GPL(rds_connect_path_complete); |
98 | ||
99 | void rds_connect_complete(struct rds_connection *conn) | |
100 | { | |
0cb43965 | 101 | rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); |
9c79440e | 102 | } |
616b757a | 103 | EXPORT_SYMBOL_GPL(rds_connect_complete); |
00e0f34c AG |
104 | |
105 | /* | |
106 | * This random exponential backoff is relied on to eventually resolve racing | |
107 | * connects. | |
108 | * | |
109 | * If connect attempts race then both parties drop both connections and come | |
110 | * here to wait for a random amount of time before trying again. Eventually | |
111 | * the backoff range will be so much greater than the time it takes to | |
112 | * establish a connection that one of the pair will establish the connection | |
113 | * before the other's random delay fires. | |
114 | * | |
115 | * Connection attempts that arrive while a connection is already established | |
116 | * are also considered to be racing connects. This lets a connection from | |
117 | * a rebooted machine replace an existing stale connection before the transport | |
118 | * notices that the connection has failed. | |
119 | * | |
120 | * We should *always* start with a random backoff; otherwise a broken connection | |
121 | * will always take several iterations to be re-established. | |
122 | */ | |
0cb43965 | 123 | void rds_queue_reconnect(struct rds_conn_path *cp) |
00e0f34c AG |
124 | { |
125 | unsigned long rand; | |
0cb43965 | 126 | struct rds_connection *conn = cp->cp_conn; |
00e0f34c AG |
127 | |
128 | rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", | |
129 | conn, &conn->c_laddr, &conn->c_faddr, | |
0cb43965 | 130 | cp->cp_reconnect_jiffies); |
00e0f34c | 131 | |
8315011a SV |
132 | /* let peer with smaller addr initiate reconnect, to avoid duels */ |
133 | if (conn->c_trans->t_type == RDS_TRANS_TCP && | |
00354de5 | 134 | !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) |
8315011a SV |
135 | return; |
136 | ||
0cb43965 SV |
137 | set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); |
138 | if (cp->cp_reconnect_jiffies == 0) { | |
139 | cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; | |
3db6e0d1 | 140 | rcu_read_lock(); |
ebeeb1ad | 141 | if (!rds_destroy_pending(cp->cp_conn)) |
3db6e0d1 SV |
142 | queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); |
143 | rcu_read_unlock(); | |
00e0f34c AG |
144 | return; |
145 | } | |
146 | ||
147 | get_random_bytes(&rand, sizeof(rand)); | |
148 | rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", | |
0cb43965 | 149 | rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, |
00e0f34c | 150 | conn, &conn->c_laddr, &conn->c_faddr); |
3db6e0d1 | 151 | rcu_read_lock(); |
ebeeb1ad | 152 | if (!rds_destroy_pending(cp->cp_conn)) |
3db6e0d1 SV |
153 | queue_delayed_work(rds_wq, &cp->cp_conn_w, |
154 | rand % cp->cp_reconnect_jiffies); | |
155 | rcu_read_unlock(); | |
00e0f34c | 156 | |
0cb43965 | 157 | cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, |
00e0f34c AG |
158 | rds_sysctl_reconnect_max_jiffies); |
159 | } | |
160 | ||
161 | void rds_connect_worker(struct work_struct *work) | |
162 | { | |
0cb43965 SV |
163 | struct rds_conn_path *cp = container_of(work, |
164 | struct rds_conn_path, | |
165 | cp_conn_w.work); | |
166 | struct rds_connection *conn = cp->cp_conn; | |
00e0f34c AG |
167 | int ret; |
168 | ||
00354de5 SV |
169 | if (cp->cp_index > 0 && |
170 | !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) | |
5916e2c1 | 171 | return; |
0cb43965 | 172 | clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); |
b04e8554 SV |
173 | ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); |
174 | if (ret) { | |
175 | ret = conn->c_trans->conn_path_connect(cp); | |
00e0f34c AG |
176 | rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", |
177 | conn, &conn->c_laddr, &conn->c_faddr, ret); | |
178 | ||
179 | if (ret) { | |
0cb43965 SV |
180 | if (rds_conn_path_transition(cp, |
181 | RDS_CONN_CONNECTING, | |
182 | RDS_CONN_DOWN)) | |
183 | rds_queue_reconnect(cp); | |
00e0f34c | 184 | else |
9c7cbcf5 | 185 | rds_conn_path_error(cp, "connect failed\n"); |
00e0f34c AG |
186 | } |
187 | } | |
188 | } | |
189 | ||
00e0f34c AG |
190 | void rds_send_worker(struct work_struct *work) |
191 | { | |
0cb43965 SV |
192 | struct rds_conn_path *cp = container_of(work, |
193 | struct rds_conn_path, | |
194 | cp_send_w.work); | |
00e0f34c AG |
195 | int ret; |
196 | ||
0cb43965 SV |
197 | if (rds_conn_path_state(cp) == RDS_CONN_UP) { |
198 | clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); | |
1f9ecd7e | 199 | ret = rds_send_xmit(cp); |
db6526dc | 200 | cond_resched(); |
0cb43965 | 201 | rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); |
00e0f34c AG |
202 | switch (ret) { |
203 | case -EAGAIN: | |
204 | rds_stats_inc(s_send_immediate_retry); | |
0cb43965 | 205 | queue_delayed_work(rds_wq, &cp->cp_send_w, 0); |
00e0f34c AG |
206 | break; |
207 | case -ENOMEM: | |
208 | rds_stats_inc(s_send_delayed_retry); | |
0cb43965 | 209 | queue_delayed_work(rds_wq, &cp->cp_send_w, 2); |
00e0f34c AG |
210 | default: |
211 | break; | |
212 | } | |
213 | } | |
214 | } | |
215 | ||
216 | void rds_recv_worker(struct work_struct *work) | |
217 | { | |
0cb43965 SV |
218 | struct rds_conn_path *cp = container_of(work, |
219 | struct rds_conn_path, | |
220 | cp_recv_w.work); | |
00e0f34c AG |
221 | int ret; |
222 | ||
0cb43965 | 223 | if (rds_conn_path_state(cp) == RDS_CONN_UP) { |
2da43c4a | 224 | ret = cp->cp_conn->c_trans->recv_path(cp); |
0cb43965 | 225 | rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); |
00e0f34c AG |
226 | switch (ret) { |
227 | case -EAGAIN: | |
228 | rds_stats_inc(s_recv_immediate_retry); | |
0cb43965 | 229 | queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); |
00e0f34c AG |
230 | break; |
231 | case -ENOMEM: | |
232 | rds_stats_inc(s_recv_delayed_retry); | |
0cb43965 | 233 | queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); |
00e0f34c AG |
234 | default: |
235 | break; | |
236 | } | |
237 | } | |
238 | } | |
239 | ||
2dc39357 AG |
240 | void rds_shutdown_worker(struct work_struct *work) |
241 | { | |
0cb43965 SV |
242 | struct rds_conn_path *cp = container_of(work, |
243 | struct rds_conn_path, | |
244 | cp_down_w); | |
2dc39357 | 245 | |
d769ef81 | 246 | rds_conn_shutdown(cp); |
2dc39357 AG |
247 | } |
248 | ||
00e0f34c AG |
249 | void rds_threads_exit(void) |
250 | { | |
251 | destroy_workqueue(rds_wq); | |
252 | } | |
253 | ||
ef87b7ea | 254 | int rds_threads_init(void) |
00e0f34c | 255 | { |
80c51be5 | 256 | rds_wq = create_singlethread_workqueue("krdsd"); |
8690bfa1 | 257 | if (!rds_wq) |
00e0f34c AG |
258 | return -ENOMEM; |
259 | ||
260 | return 0; | |
261 | } |