Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | * | |
36 | * libcfs/libcfs/workitem.c | |
37 | * | |
38 | * Author: Isaac Huang <isaac@clusterfs.com> | |
39 | * Liang Zhen <zhen.liang@sun.com> | |
40 | */ | |
41 | ||
42 | #define DEBUG_SUBSYSTEM S_LNET | |
43 | ||
9fdaf8c0 | 44 | #include "../../include/linux/libcfs/libcfs.h" |
d7e09d03 PT |
45 | |
46 | #define CFS_WS_NAME_LEN 16 | |
47 | ||
/*
 * A workitem scheduler: a queue of pending workitems served by a pool of
 * kernel threads (see cfs_wi_scheduler()), optionally bound to one CPT.
 */
typedef struct cfs_wi_sched {
	struct list_head	ws_list;	/* chain on cfs_wi_data::wi_scheds */
	/** serialised workitems; protects ws_runq/ws_rerunq/ws_nscheduled */
	spinlock_t		ws_lock;
	/** where scheduler threads sleep when ws_runq is empty */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems pending execution */
	struct list_head	ws_runq;
	/** rescheduled running-workitems, a workitem can be rescheduled
	 * while running in wi_action(), but we don't to execute it again
	 * unless it returns from wi_action(), so we put it on ws_rerunq
	 * while rescheduling, and move it to runq after it returns
	 * from wi_action() */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler; NULL means no CPU affinity */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity (CFS_CPT_ANY allowed) */
	int			ws_cpt;
	/** number of scheduled workitems (on ws_runq + ws_rerunq) */
	int			ws_nscheduled;
	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name, always NUL-terminated (CFS_WS_NAME_LEN bytes) */
	char			ws_name[CFS_WS_NAME_LEN];
} cfs_wi_sched_t;
77 | ||
/* Module-global workitem state; all fields protected by wi_glock. */
static struct cfs_workitem_data {
	/** serialize access to wi_scheds / wi_init / wi_stopping */
	spinlock_t		wi_glock;
	/** list of all schedulers (cfs_wi_sched::ws_list) */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;
88 | ||
/* Acquire the per-scheduler queue lock. */
static inline void
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
{
	spin_lock(&sched->ws_lock);
}
94 | ||
/* Release the per-scheduler queue lock. */
static inline void
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
{
	spin_unlock(&sched->ws_lock);
}
100 | ||
101 | static inline int | |
102 | cfs_wi_sched_cansleep(cfs_wi_sched_t *sched) | |
103 | { | |
104 | cfs_wi_sched_lock(sched); | |
105 | if (sched->ws_stopping) { | |
106 | cfs_wi_sched_unlock(sched); | |
107 | return 0; | |
108 | } | |
109 | ||
110 | if (!list_empty(&sched->ws_runq)) { | |
111 | cfs_wi_sched_unlock(sched); | |
112 | return 0; | |
113 | } | |
114 | cfs_wi_sched_unlock(sched); | |
115 | return 1; | |
116 | } | |
117 | ||
118 | ||
/* XXX:
 * 0. it only works when called from wi->wi_action.
 * 1. when it returns no one shall try to schedule the workitem.
 *
 * Detach workitem \a wi from \a sched: cancel any pending reschedule and
 * poison wi_scheduled so a later cfs_wi_schedule() would trip its
 * list_empty() LASSERT instead of silently re-queueing a dead item.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	/* only legal from inside wi_action(), hence wi must be running */
	LASSERT(wi->wi_running);
	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;
	}

	LASSERT(list_empty(&wi->wi_list));

	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
	cfs_wi_sched_unlock(sched);

	return;
}
EXPORT_SYMBOL(cfs_wi_exit);
148 | ||
149 | /** | |
150 | * cancel schedule request of workitem \a wi | |
151 | */ | |
152 | int | |
153 | cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) | |
154 | { | |
155 | int rc; | |
156 | ||
157 | LASSERT(!in_interrupt()); /* because we use plain spinlock */ | |
158 | LASSERT(!sched->ws_stopping); | |
159 | ||
160 | /* | |
161 | * return 0 if it's running already, otherwise return 1, which | |
162 | * means the workitem will not be scheduled and will not have | |
163 | * any race with wi_action. | |
164 | */ | |
165 | cfs_wi_sched_lock(sched); | |
166 | ||
167 | rc = !(wi->wi_running); | |
168 | ||
169 | if (wi->wi_scheduled) { /* cancel pending schedules */ | |
170 | LASSERT(!list_empty(&wi->wi_list)); | |
171 | list_del_init(&wi->wi_list); | |
172 | ||
173 | LASSERT(sched->ws_nscheduled > 0); | |
174 | sched->ws_nscheduled--; | |
175 | ||
176 | wi->wi_scheduled = 0; | |
177 | } | |
178 | ||
179 | LASSERT (list_empty(&wi->wi_list)); | |
180 | ||
181 | cfs_wi_sched_unlock(sched); | |
182 | return rc; | |
183 | } | |
184 | EXPORT_SYMBOL(cfs_wi_deschedule); | |
185 | ||
186 | /* | |
187 | * Workitem scheduled with (serial == 1) is strictly serialised not only with | |
188 | * itself, but also with others scheduled this way. | |
189 | * | |
190 | * Now there's only one static serialised queue, but in the future more might | |
191 | * be added, and even dynamic creation of serialised queues might be supported. | |
192 | */ | |
193 | void | |
194 | cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) | |
195 | { | |
196 | LASSERT(!in_interrupt()); /* because we use plain spinlock */ | |
197 | LASSERT(!sched->ws_stopping); | |
198 | ||
199 | cfs_wi_sched_lock(sched); | |
200 | ||
201 | if (!wi->wi_scheduled) { | |
202 | LASSERT (list_empty(&wi->wi_list)); | |
203 | ||
204 | wi->wi_scheduled = 1; | |
205 | sched->ws_nscheduled++; | |
206 | if (!wi->wi_running) { | |
207 | list_add_tail(&wi->wi_list, &sched->ws_runq); | |
208 | wake_up(&sched->ws_waitq); | |
209 | } else { | |
210 | list_add(&wi->wi_list, &sched->ws_rerunq); | |
211 | } | |
212 | } | |
213 | ||
214 | LASSERT (!list_empty(&wi->wi_list)); | |
215 | cfs_wi_sched_unlock(sched); | |
216 | return; | |
217 | } | |
218 | EXPORT_SYMBOL(cfs_wi_schedule); | |
219 | ||
220 | ||
/*
 * Main loop of a scheduler thread: drain ws_runq, running each workitem's
 * wi_action() with ws_lock dropped, and sleep on ws_waitq when idle.
 * Runs until sched->ws_stopping is set.  Returns 0 (kthread exit code).
 */
static int
cfs_wi_scheduler (void *arg)
{
	struct cfs_wi_sched	*sched = (cfs_wi_sched_t *)arg;

	cfs_block_allsigs();

	/* CPT affinity scheduler? */
	if (sched->ws_cptab != NULL)
		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);

	/* announce startup: clear ws_starting so the creator can proceed */
	spin_lock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	cfs_wi_sched_lock(sched);

	while (!sched->ws_stopping) {
		int	     nloops = 0;
		int	     rc;
		cfs_workitem_t *wi;

		/* run at most CFS_WI_RESCHED items before yielding the CPU */
		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					cfs_workitem_t, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_running   = 1;
			wi->wi_scheduled = 0;


			/* drop the lock: wi_action() may sleep or reschedule */
			cfs_wi_sched_unlock(sched);
			nloops++;

			rc = (*wi->wi_action) (wi);

			cfs_wi_sched_lock(sched);
			if (rc != 0) /* WI should be dead, even be freed! */
				continue;

			wi->wi_running = 0;
			/* empty wi_list means it was not rescheduled meanwhile */
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi is rescheduled, should be on rerunq now, we
			 * move it to runq so it can run action now */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			cfs_wi_sched_unlock(sched);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cond_resched();
			cfs_wi_sched_lock(sched);
			continue;
		}

		/* queue drained: sleep until woken or asked to stop */
		cfs_wi_sched_unlock(sched);
		cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
				!cfs_wi_sched_cansleep(sched), rc);
		cfs_wi_sched_lock(sched);
	}

	cfs_wi_sched_unlock(sched);

	/* announce exit so cfs_wi_sched_destroy()/shutdown can finish */
	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}
304 | ||
305 | ||
/*
 * Tear down scheduler \a sched: flag it stopping, wake all its threads,
 * poll until every thread has exited, then unlink and free it.
 * Safe against a concurrent destroy: the second caller returns early.
 */
void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
	int	i;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		CDEBUG(D_INFO, "%s is in progress of stopping\n",
		       sched->ws_name);
		spin_unlock(&cfs_wi_data.wi_glock);
		return;
	}

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	i = 2;
	wake_up_all(&sched->ws_waitq);

	/* poll ~20x/sec until all scheduler threads have exited;
	 * warn at power-of-two iterations to avoid log flooding */
	spin_lock(&cfs_wi_data.wi_glock);
	while (sched->ws_nthreads > 0) {
		CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
		       "waiting for %d threads of WI sched[%s] to terminate\n",
		       sched->ws_nthreads, sched->ws_name);

		spin_unlock(&cfs_wi_data.wi_glock);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(cfs_time_seconds(1) / 20);
		spin_lock(&cfs_wi_data.wi_glock);
	}

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);
	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);
350 | ||
351 | int | |
352 | cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, | |
353 | int cpt, int nthrs, struct cfs_wi_sched **sched_pp) | |
354 | { | |
355 | struct cfs_wi_sched *sched; | |
356 | int rc; | |
357 | ||
358 | LASSERT(cfs_wi_data.wi_init); | |
359 | LASSERT(!cfs_wi_data.wi_stopping); | |
360 | LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || | |
361 | (cpt >= 0 && cpt < cfs_cpt_number(cptab))); | |
362 | ||
363 | LIBCFS_ALLOC(sched, sizeof(*sched)); | |
364 | if (sched == NULL) | |
365 | return -ENOMEM; | |
366 | ||
367 | strncpy(sched->ws_name, name, CFS_WS_NAME_LEN); | |
299ef8cd | 368 | sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0'; |
d7e09d03 PT |
369 | sched->ws_cptab = cptab; |
370 | sched->ws_cpt = cpt; | |
371 | ||
372 | spin_lock_init(&sched->ws_lock); | |
373 | init_waitqueue_head(&sched->ws_waitq); | |
374 | INIT_LIST_HEAD(&sched->ws_runq); | |
375 | INIT_LIST_HEAD(&sched->ws_rerunq); | |
376 | INIT_LIST_HEAD(&sched->ws_list); | |
377 | ||
378 | rc = 0; | |
379 | while (nthrs > 0) { | |
380 | char name[16]; | |
68b636b6 GKH |
381 | struct task_struct *task; |
382 | ||
d7e09d03 PT |
383 | spin_lock(&cfs_wi_data.wi_glock); |
384 | while (sched->ws_starting > 0) { | |
385 | spin_unlock(&cfs_wi_data.wi_glock); | |
386 | schedule(); | |
387 | spin_lock(&cfs_wi_data.wi_glock); | |
388 | } | |
389 | ||
390 | sched->ws_starting++; | |
391 | spin_unlock(&cfs_wi_data.wi_glock); | |
392 | ||
393 | if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { | |
6879807c | 394 | snprintf(name, sizeof(name), "%s_%02d_%02u", |
d7e09d03 PT |
395 | sched->ws_name, sched->ws_cpt, |
396 | sched->ws_nthreads); | |
397 | } else { | |
6879807c | 398 | snprintf(name, sizeof(name), "%s_%02u", |
d7e09d03 PT |
399 | sched->ws_name, sched->ws_nthreads); |
400 | } | |
401 | ||
9edf0f67 | 402 | task = kthread_run(cfs_wi_scheduler, sched, "%s", name); |
d7e09d03 PT |
403 | if (!IS_ERR(task)) { |
404 | nthrs--; | |
405 | continue; | |
406 | } | |
407 | rc = PTR_ERR(task); | |
408 | ||
409 | CERROR("Failed to create thread for WI scheduler %s: %d\n", | |
410 | name, rc); | |
411 | ||
412 | spin_lock(&cfs_wi_data.wi_glock); | |
413 | ||
414 | /* make up for cfs_wi_sched_destroy */ | |
415 | list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); | |
416 | sched->ws_starting--; | |
417 | ||
418 | spin_unlock(&cfs_wi_data.wi_glock); | |
419 | ||
420 | cfs_wi_sched_destroy(sched); | |
421 | return rc; | |
422 | } | |
423 | spin_lock(&cfs_wi_data.wi_glock); | |
424 | list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); | |
425 | spin_unlock(&cfs_wi_data.wi_glock); | |
426 | ||
427 | *sched_pp = sched; | |
428 | return 0; | |
429 | } | |
430 | EXPORT_SYMBOL(cfs_wi_sched_create); | |
431 | ||
432 | int | |
433 | cfs_wi_startup(void) | |
434 | { | |
435 | memset(&cfs_wi_data, 0, sizeof(cfs_wi_data)); | |
436 | ||
437 | spin_lock_init(&cfs_wi_data.wi_glock); | |
438 | INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); | |
439 | cfs_wi_data.wi_init = 1; | |
440 | ||
441 | return 0; | |
442 | } | |
443 | ||
/*
 * Shut down the whole workitem module: flag every scheduler stopping,
 * wake their threads, wait for all threads to exit, then free every
 * scheduler.  Assumes no concurrent create/destroy while it runs.
 */
void
cfs_wi_shutdown (void)
{
	struct cfs_wi_sched	*sched;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);
	}

	/* poll ~20x/sec until each scheduler's threads have all exited */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
		spin_unlock(&cfs_wi_data.wi_glock);
	}
	/* all threads gone; free every scheduler */
	while (!list_empty(&cfs_wi_data.wi_scheds)) {
		sched = list_entry(cfs_wi_data.wi_scheds.next,
				   struct cfs_wi_sched, ws_list);
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));
	}

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;
}