Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
1dc563a6 | 30 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
31 | */ |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | * | |
36 | * Implementation of cl_page for OSC layer. | |
37 | * | |
38 | * Author: Nikita Danilov <nikita.danilov@sun.com> | |
39 | */ | |
40 | ||
41 | #define DEBUG_SUBSYSTEM S_OSC | |
42 | ||
43 | #include "osc_cl_internal.h" | |
44 | ||
45 | static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del); | |
46 | static void osc_lru_add(struct client_obd *cli, struct osc_page *opg); | |
47 | static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, | |
48 | struct osc_page *opg); | |
49 | ||
50 | /** \addtogroup osc | |
51 | * @{ | |
52 | */ | |
53 | ||
54 | /* | |
55 | * Comment out osc_page_protected because it may sleep inside the | |
56 | * client_obd_list_lock. | |
57 | * client_obd_list_lock -> osc_ap_completion -> osc_completion -> | |
58 | * -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base | |
59 | * -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep. | |
60 | */ | |
61 | #if 0 | |
62 | static int osc_page_is_dlocked(const struct lu_env *env, | |
63 | const struct osc_page *opg, | |
64 | enum cl_lock_mode mode, int pending, int unref) | |
65 | { | |
66 | struct cl_page *page; | |
67 | struct osc_object *obj; | |
68 | struct osc_thread_info *info; | |
69 | struct ldlm_res_id *resname; | |
70 | struct lustre_handle *lockh; | |
71 | ldlm_policy_data_t *policy; | |
72 | ldlm_mode_t dlmmode; | |
875332d4 | 73 | __u64 flags; |
d7e09d03 PT |
74 | |
75 | might_sleep(); | |
76 | ||
77 | info = osc_env_info(env); | |
78 | resname = &info->oti_resname; | |
79 | policy = &info->oti_policy; | |
80 | lockh = &info->oti_handle; | |
81 | page = opg->ops_cl.cpl_page; | |
82 | obj = cl2osc(opg->ops_cl.cpl_obj); | |
83 | ||
84 | flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED; | |
85 | if (pending) | |
86 | flags |= LDLM_FL_CBPENDING; | |
87 | ||
88 | dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW; | |
89 | osc_lock_build_res(env, obj, resname); | |
90 | osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index); | |
91 | return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, | |
92 | dlmmode, &flags, NULL, lockh, unref); | |
93 | } | |
94 | ||
95 | /** | |
96 | * Checks an invariant that a page in the cache is covered by a lock, as | |
97 | * needed. | |
98 | */ | |
99 | static int osc_page_protected(const struct lu_env *env, | |
100 | const struct osc_page *opg, | |
101 | enum cl_lock_mode mode, int unref) | |
102 | { | |
103 | struct cl_object_header *hdr; | |
104 | struct cl_lock *scan; | |
105 | struct cl_page *page; | |
106 | struct cl_lock_descr *descr; | |
107 | int result; | |
108 | ||
109 | LINVRNT(!opg->ops_temp); | |
110 | ||
111 | page = opg->ops_cl.cpl_page; | |
112 | if (page->cp_owner != NULL && | |
113 | cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER) | |
114 | /* | |
115 | * If IO is done without locks (liblustre, or lloop), lock is | |
116 | * not required. | |
117 | */ | |
118 | result = 1; | |
119 | else | |
120 | /* otherwise check for a DLM lock */ | |
121 | result = osc_page_is_dlocked(env, opg, mode, 1, unref); | |
122 | if (result == 0) { | |
123 | /* maybe this page is a part of a lockless io? */ | |
124 | hdr = cl_object_header(opg->ops_cl.cpl_obj); | |
125 | descr = &osc_env_info(env)->oti_descr; | |
126 | descr->cld_mode = mode; | |
127 | descr->cld_start = page->cp_index; | |
128 | descr->cld_end = page->cp_index; | |
129 | spin_lock(&hdr->coh_lock_guard); | |
130 | list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { | |
131 | /* | |
132 | * Lock-less sub-lock has to be either in HELD state | |
133 | * (when io is actively going on), or in CACHED state, | |
134 | * when top-lock is being unlocked: | |
135 | * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse(). | |
136 | */ | |
137 | if ((scan->cll_state == CLS_HELD || | |
138 | scan->cll_state == CLS_CACHED) && | |
139 | cl_lock_ext_match(&scan->cll_descr, descr)) { | |
140 | struct osc_lock *olck; | |
141 | ||
142 | olck = osc_lock_at(scan); | |
143 | result = osc_lock_is_lockless(olck); | |
144 | break; | |
145 | } | |
146 | } | |
147 | spin_unlock(&hdr->coh_lock_guard); | |
148 | } | |
149 | return result; | |
150 | } | |
151 | #else | |
152 | static int osc_page_protected(const struct lu_env *env, | |
153 | const struct osc_page *opg, | |
154 | enum cl_lock_mode mode, int unref) | |
155 | { | |
156 | return 1; | |
157 | } | |
158 | #endif | |
159 | ||
160 | /***************************************************************************** | |
161 | * | |
162 | * Page operations. | |
163 | * | |
164 | */ | |
165 | static void osc_page_fini(const struct lu_env *env, | |
166 | struct cl_page_slice *slice) | |
167 | { | |
168 | struct osc_page *opg = cl2osc_page(slice); | |
50ffcb7e | 169 | |
d7e09d03 PT |
170 | CDEBUG(D_TRACE, "%p\n", opg); |
171 | LASSERT(opg->ops_lock == NULL); | |
172 | } | |
173 | ||
174 | static void osc_page_transfer_get(struct osc_page *opg, const char *label) | |
175 | { | |
176 | struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); | |
177 | ||
178 | LASSERT(!opg->ops_transfer_pinned); | |
179 | cl_page_get(page); | |
180 | lu_ref_add_atomic(&page->cp_reference, label, page); | |
181 | opg->ops_transfer_pinned = 1; | |
182 | } | |
183 | ||
184 | static void osc_page_transfer_put(const struct lu_env *env, | |
185 | struct osc_page *opg) | |
186 | { | |
187 | struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); | |
188 | ||
189 | if (opg->ops_transfer_pinned) { | |
190 | lu_ref_del(&page->cp_reference, "transfer", page); | |
191 | opg->ops_transfer_pinned = 0; | |
192 | cl_page_put(env, page); | |
193 | } | |
194 | } | |
195 | ||
196 | /** | |
197 | * This is called once for every page when it is submitted for a transfer | |
198 | * either opportunistic (osc_page_cache_add()), or immediate | |
199 | * (osc_page_submit()). | |
200 | */ | |
201 | static void osc_page_transfer_add(const struct lu_env *env, | |
202 | struct osc_page *opg, enum cl_req_type crt) | |
203 | { | |
204 | struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); | |
205 | ||
206 | /* ops_lru and ops_inflight share the same field, so take it from LRU | |
207 | * first and then use it as inflight. */ | |
208 | osc_lru_del(osc_cli(obj), opg, false); | |
209 | ||
210 | spin_lock(&obj->oo_seatbelt); | |
211 | list_add(&opg->ops_inflight, &obj->oo_inflight[crt]); | |
212 | opg->ops_submitter = current; | |
213 | spin_unlock(&obj->oo_seatbelt); | |
214 | } | |
215 | ||
216 | static int osc_page_cache_add(const struct lu_env *env, | |
217 | const struct cl_page_slice *slice, | |
218 | struct cl_io *io) | |
219 | { | |
29ac6840 | 220 | struct osc_io *oio = osc_env_io(env); |
d7e09d03 PT |
221 | struct osc_page *opg = cl2osc_page(slice); |
222 | int result; | |
d7e09d03 PT |
223 | |
224 | LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0)); | |
225 | ||
226 | osc_page_transfer_get(opg, "transfer\0cache"); | |
227 | result = osc_queue_async_io(env, io, opg); | |
228 | if (result != 0) | |
229 | osc_page_transfer_put(env, opg); | |
230 | else | |
231 | osc_page_transfer_add(env, opg, CRT_WRITE); | |
232 | ||
233 | /* for sync write, kernel will wait for this page to be flushed before | |
234 | * osc_io_end() is called, so release it earlier. | |
235 | * for mkwrite(), it's known there is no further pages. */ | |
236 | if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) { | |
237 | if (oio->oi_active != NULL) { | |
238 | osc_extent_release(env, oio->oi_active); | |
239 | oio->oi_active = NULL; | |
240 | } | |
241 | } | |
242 | ||
0a3bdb00 | 243 | return result; |
d7e09d03 PT |
244 | } |
245 | ||
246 | void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj, | |
247 | pgoff_t start, pgoff_t end) | |
248 | { | |
ec83e611 | 249 | memset(policy, 0, sizeof(*policy)); |
d7e09d03 | 250 | policy->l_extent.start = cl_offset(obj, start); |
29ac6840 | 251 | policy->l_extent.end = cl_offset(obj, end + 1) - 1; |
d7e09d03 PT |
252 | } |
253 | ||
254 | static int osc_page_addref_lock(const struct lu_env *env, | |
255 | struct osc_page *opg, | |
256 | struct cl_lock *lock) | |
257 | { | |
258 | struct osc_lock *olock; | |
29ac6840 | 259 | int rc; |
d7e09d03 PT |
260 | |
261 | LASSERT(opg->ops_lock == NULL); | |
262 | ||
263 | olock = osc_lock_at(lock); | |
264 | if (atomic_inc_return(&olock->ols_pageref) <= 0) { | |
265 | atomic_dec(&olock->ols_pageref); | |
266 | rc = -ENODATA; | |
267 | } else { | |
268 | cl_lock_get(lock); | |
269 | opg->ops_lock = lock; | |
270 | rc = 0; | |
271 | } | |
272 | return rc; | |
273 | } | |
274 | ||
275 | static void osc_page_putref_lock(const struct lu_env *env, | |
276 | struct osc_page *opg) | |
277 | { | |
29ac6840 | 278 | struct cl_lock *lock = opg->ops_lock; |
d7e09d03 PT |
279 | struct osc_lock *olock; |
280 | ||
281 | LASSERT(lock != NULL); | |
282 | olock = osc_lock_at(lock); | |
283 | ||
284 | atomic_dec(&olock->ols_pageref); | |
285 | opg->ops_lock = NULL; | |
286 | ||
287 | cl_lock_put(env, lock); | |
288 | } | |
289 | ||
290 | static int osc_page_is_under_lock(const struct lu_env *env, | |
291 | const struct cl_page_slice *slice, | |
292 | struct cl_io *unused) | |
293 | { | |
294 | struct cl_lock *lock; | |
29ac6840 | 295 | int result = -ENODATA; |
d7e09d03 | 296 | |
d7e09d03 PT |
297 | lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page, |
298 | NULL, 1, 0); | |
299 | if (lock != NULL) { | |
300 | if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0) | |
301 | result = -EBUSY; | |
302 | cl_lock_put(env, lock); | |
303 | } | |
0a3bdb00 | 304 | return result; |
d7e09d03 PT |
305 | } |
306 | ||
307 | static void osc_page_disown(const struct lu_env *env, | |
308 | const struct cl_page_slice *slice, | |
309 | struct cl_io *io) | |
310 | { | |
311 | struct osc_page *opg = cl2osc_page(slice); | |
312 | ||
313 | if (unlikely(opg->ops_lock)) | |
314 | osc_page_putref_lock(env, opg); | |
315 | } | |
316 | ||
317 | static void osc_page_completion_read(const struct lu_env *env, | |
318 | const struct cl_page_slice *slice, | |
319 | int ioret) | |
320 | { | |
29ac6840 | 321 | struct osc_page *opg = cl2osc_page(slice); |
d7e09d03 PT |
322 | struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); |
323 | ||
324 | if (likely(opg->ops_lock)) | |
325 | osc_page_putref_lock(env, opg); | |
326 | osc_lru_add(osc_cli(obj), opg); | |
327 | } | |
328 | ||
329 | static void osc_page_completion_write(const struct lu_env *env, | |
330 | const struct cl_page_slice *slice, | |
331 | int ioret) | |
332 | { | |
29ac6840 | 333 | struct osc_page *opg = cl2osc_page(slice); |
d7e09d03 PT |
334 | struct osc_object *obj = cl2osc(slice->cpl_obj); |
335 | ||
336 | osc_lru_add(osc_cli(obj), opg); | |
337 | } | |
338 | ||
/*
 * Placeholder installed as cpo_cache_add for CRT_READ in osc_page_ops:
 * being called at all is treated as a bug ("cached read?").
 * The return statement only follows LBUG().
 */
static int osc_page_fail(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 struct cl_io *unused)
{
	/* Cached read? */
	LBUG();
	return 0;
}
349 | ||
d7e09d03 PT |
/* Printing helper: "-" for an empty list head, "+" for a non-empty one. */
static const char *osc_list(struct list_head *head)
{
	if (list_empty(head))
		return "-";

	return "+";
}
354 | ||
a649ad1d | 355 | static inline unsigned long osc_submit_duration(struct osc_page *opg) |
d7e09d03 PT |
356 | { |
357 | if (opg->ops_submit_time == 0) | |
358 | return 0; | |
359 | ||
360 | return (cfs_time_current() - opg->ops_submit_time); | |
361 | } | |
362 | ||
363 | static int osc_page_print(const struct lu_env *env, | |
364 | const struct cl_page_slice *slice, | |
365 | void *cookie, lu_printer_t printer) | |
366 | { | |
29ac6840 | 367 | struct osc_page *opg = cl2osc_page(slice); |
d7e09d03 | 368 | struct osc_async_page *oap = &opg->ops_oap; |
29ac6840 CH |
369 | struct osc_object *obj = cl2osc(slice->cpl_obj); |
370 | struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; | |
d7e09d03 | 371 | |
2d00bd17 | 372 | return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %s %p %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n", |
d7e09d03 PT |
373 | opg, |
374 | /* 1 */ | |
375 | oap->oap_magic, oap->oap_cmd, | |
376 | oap->oap_interrupted, | |
377 | osc_list(&oap->oap_pending_item), | |
378 | osc_list(&oap->oap_rpc_item), | |
379 | /* 2 */ | |
380 | oap->oap_obj_off, oap->oap_page_off, oap->oap_count, | |
381 | oap->oap_async_flags, oap->oap_brw_flags, | |
382 | oap->oap_request, oap->oap_cli, obj, | |
383 | /* 3 */ | |
384 | osc_list(&opg->ops_inflight), | |
385 | opg->ops_submitter, opg->ops_transfer_pinned, | |
386 | osc_submit_duration(opg), opg->ops_srvlock, | |
387 | /* 4 */ | |
388 | cli->cl_r_in_flight, cli->cl_w_in_flight, | |
389 | cli->cl_max_rpcs_in_flight, | |
390 | cli->cl_avail_grant, | |
391 | osc_list(&cli->cl_cache_waiters), | |
392 | osc_list(&cli->cl_loi_ready_list), | |
393 | osc_list(&cli->cl_loi_hp_ready_list), | |
394 | osc_list(&cli->cl_loi_write_list), | |
395 | osc_list(&cli->cl_loi_read_list), | |
396 | /* 5 */ | |
397 | osc_list(&obj->oo_ready_item), | |
398 | osc_list(&obj->oo_hp_ready_item), | |
399 | osc_list(&obj->oo_write_item), | |
400 | osc_list(&obj->oo_read_item), | |
401 | atomic_read(&obj->oo_nr_reads), | |
402 | osc_list(&obj->oo_reading_exts), | |
403 | atomic_read(&obj->oo_nr_writes), | |
404 | osc_list(&obj->oo_hp_exts), | |
405 | osc_list(&obj->oo_urgent_exts)); | |
406 | } | |
407 | ||
408 | static void osc_page_delete(const struct lu_env *env, | |
409 | const struct cl_page_slice *slice) | |
410 | { | |
29ac6840 | 411 | struct osc_page *opg = cl2osc_page(slice); |
d7e09d03 PT |
412 | struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); |
413 | int rc; | |
414 | ||
415 | LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1)); | |
416 | ||
d7e09d03 PT |
417 | CDEBUG(D_TRACE, "%p\n", opg); |
418 | osc_page_transfer_put(env, opg); | |
419 | rc = osc_teardown_async_page(env, obj, opg); | |
420 | if (rc) { | |
421 | CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page), | |
422 | "Trying to teardown failed: %d\n", rc); | |
423 | LASSERT(0); | |
424 | } | |
425 | ||
426 | spin_lock(&obj->oo_seatbelt); | |
427 | if (opg->ops_submitter != NULL) { | |
428 | LASSERT(!list_empty(&opg->ops_inflight)); | |
429 | list_del_init(&opg->ops_inflight); | |
430 | opg->ops_submitter = NULL; | |
431 | } | |
432 | spin_unlock(&obj->oo_seatbelt); | |
433 | ||
434 | osc_lru_del(osc_cli(obj), opg, true); | |
d7e09d03 PT |
435 | } |
436 | ||
437 | void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice, | |
438 | int from, int to) | |
439 | { | |
29ac6840 | 440 | struct osc_page *opg = cl2osc_page(slice); |
d7e09d03 PT |
441 | struct osc_async_page *oap = &opg->ops_oap; |
442 | ||
443 | LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); | |
444 | ||
445 | opg->ops_from = from; | |
29ac6840 | 446 | opg->ops_to = to; |
d7e09d03 PT |
447 | spin_lock(&oap->oap_lock); |
448 | oap->oap_async_flags |= ASYNC_COUNT_STABLE; | |
449 | spin_unlock(&oap->oap_lock); | |
450 | } | |
451 | ||
452 | static int osc_page_cancel(const struct lu_env *env, | |
453 | const struct cl_page_slice *slice) | |
454 | { | |
455 | struct osc_page *opg = cl2osc_page(slice); | |
456 | int rc = 0; | |
457 | ||
458 | LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); | |
459 | ||
460 | /* Check if the transferring against this page | |
461 | * is completed, or not even queued. */ | |
462 | if (opg->ops_transfer_pinned) | |
463 | /* FIXME: may not be interrupted.. */ | |
464 | rc = osc_cancel_async_page(env, opg); | |
465 | LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); | |
466 | return rc; | |
467 | } | |
468 | ||
/*
 * cpo_flush method: thin wrapper that forwards to osc_flush_async_page()
 * and returns its result.
 */
static int osc_page_flush(const struct lu_env *env,
			  const struct cl_page_slice *slice,
			  struct cl_io *io)
{
	return osc_flush_async_page(env, io, cl2osc_page(slice));
}
479 | ||
480 | static const struct cl_page_operations osc_page_ops = { | |
481 | .cpo_fini = osc_page_fini, | |
482 | .cpo_print = osc_page_print, | |
483 | .cpo_delete = osc_page_delete, | |
484 | .cpo_is_under_lock = osc_page_is_under_lock, | |
485 | .cpo_disown = osc_page_disown, | |
486 | .io = { | |
487 | [CRT_READ] = { | |
488 | .cpo_cache_add = osc_page_fail, | |
489 | .cpo_completion = osc_page_completion_read | |
490 | }, | |
491 | [CRT_WRITE] = { | |
492 | .cpo_cache_add = osc_page_cache_add, | |
493 | .cpo_completion = osc_page_completion_write | |
494 | } | |
495 | }, | |
496 | .cpo_clip = osc_page_clip, | |
497 | .cpo_cancel = osc_page_cancel, | |
498 | .cpo_flush = osc_page_flush | |
499 | }; | |
500 | ||
501 | int osc_page_init(const struct lu_env *env, struct cl_object *obj, | |
502 | struct cl_page *page, struct page *vmpage) | |
503 | { | |
504 | struct osc_object *osc = cl2osc(obj); | |
29ac6840 | 505 | struct osc_page *opg = cl_object_page_slice(obj, page); |
d7e09d03 PT |
506 | int result; |
507 | ||
508 | opg->ops_from = 0; | |
29ac6840 | 509 | opg->ops_to = PAGE_CACHE_SIZE; |
d7e09d03 PT |
510 | |
511 | result = osc_prep_async_page(osc, opg, vmpage, | |
512 | cl_offset(obj, page->cp_index)); | |
513 | if (result == 0) { | |
514 | struct osc_io *oio = osc_env_io(env); | |
50ffcb7e | 515 | |
d7e09d03 PT |
516 | opg->ops_srvlock = osc_io_srvlock(oio); |
517 | cl_page_slice_add(page, &opg->ops_cl, obj, | |
518 | &osc_page_ops); | |
519 | } | |
520 | /* | |
521 | * Cannot assert osc_page_protected() here as read-ahead | |
522 | * creates temporary pages outside of a lock. | |
523 | */ | |
524 | /* ops_inflight and ops_lru are the same field, but it doesn't | |
525 | * hurt to initialize it twice :-) */ | |
526 | INIT_LIST_HEAD(&opg->ops_inflight); | |
527 | INIT_LIST_HEAD(&opg->ops_lru); | |
528 | ||
529 | /* reserve an LRU space for this page */ | |
530 | if (page->cp_type == CPT_CACHEABLE && result == 0) | |
531 | result = osc_lru_reserve(env, osc, opg); | |
532 | ||
533 | return result; | |
534 | } | |
535 | ||
536 | /** | |
537 | * Helper function called by osc_io_submit() for every page in an immediate | |
538 | * transfer (i.e., transferred synchronously). | |
539 | */ | |
540 | void osc_page_submit(const struct lu_env *env, struct osc_page *opg, | |
541 | enum cl_req_type crt, int brw_flags) | |
542 | { | |
543 | struct osc_async_page *oap = &opg->ops_oap; | |
29ac6840 | 544 | struct osc_object *obj = oap->oap_obj; |
d7e09d03 PT |
545 | |
546 | LINVRNT(osc_page_protected(env, opg, | |
547 | crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1)); | |
548 | ||
2d00bd17 JP |
549 | LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n", |
550 | oap, oap->oap_magic); | |
d7e09d03 PT |
551 | LASSERT(oap->oap_async_flags & ASYNC_READY); |
552 | LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); | |
553 | ||
29ac6840 CH |
554 | oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; |
555 | oap->oap_page_off = opg->ops_from; | |
556 | oap->oap_count = opg->ops_to - opg->ops_from; | |
40daf5ce | 557 | oap->oap_brw_flags = brw_flags | OBD_BRW_SYNC; |
d7e09d03 PT |
558 | |
559 | if (!client_is_remote(osc_export(obj)) && | |
2eb90a75 | 560 | capable(CFS_CAP_SYS_RESOURCE)) { |
d7e09d03 PT |
561 | oap->oap_brw_flags |= OBD_BRW_NOQUOTA; |
562 | oap->oap_cmd |= OBD_BRW_NOQUOTA; | |
563 | } | |
564 | ||
565 | opg->ops_submit_time = cfs_time_current(); | |
566 | osc_page_transfer_get(opg, "transfer\0imm"); | |
567 | osc_page_transfer_add(env, opg, crt); | |
568 | } | |
569 | ||
570 | /* --------------- LRU page management ------------------ */ | |
571 | ||
572 | /* OSC is a natural place to manage LRU pages as applications are specialized | |
573 | * to write OSC by OSC. Ideally, if one OSC is used more frequently it should | |
574 | * occupy more LRU slots. On the other hand, we should avoid using up all LRU | |
575 | * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep | |
576 | * for free LRU slots - this will be very bad so the algorithm requires each | |
577 | * OSC to free slots voluntarily to maintain a reasonable number of free slots | |
578 | * at any time. | |
579 | */ | |
580 | ||
2f5723a1 | 581 | static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq); |
d7e09d03 PT |
582 | static atomic_t osc_lru_waiters = ATOMIC_INIT(0); |
583 | /* LRU pages are freed in batch mode. OSC should at least free this | |
584 | * number of pages to avoid running out of LRU budget, and.. */ | |
585 | static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT); /* 2M */ | |
11d66e89 | 586 | /* ...free at most this number, otherwise it would take too long to finish. */ |
d7e09d03 PT |
587 | static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */ |
588 | ||
589 | /* Check if we can free LRU slots from this OSC. If there exists LRU waiters, | |
590 | * we should free slots aggressively. In this way, slots are freed in a steady | |
591 | * step to maintain fairness among OSCs. | |
592 | * | |
593 | * Return how many LRU pages should be freed. */ | |
594 | static int osc_cache_too_much(struct client_obd *cli) | |
595 | { | |
596 | struct cl_client_cache *cache = cli->cl_cache; | |
597 | int pages = atomic_read(&cli->cl_lru_in_list) >> 1; | |
598 | ||
599 | if (atomic_read(&osc_lru_waiters) > 0 && | |
600 | atomic_read(cli->cl_lru_left) < lru_shrink_max) | |
601 | /* drop lru pages aggressively */ | |
602 | return min(pages, lru_shrink_max); | |
603 | ||
604 | /* if it's going to run out LRU slots, we should free some, but not | |
11d66e89 | 605 | * too much to maintain fairness among OSCs. */ |
d7e09d03 PT |
606 | if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) { |
607 | unsigned long tmp; | |
608 | ||
609 | tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users); | |
610 | if (pages > tmp) | |
611 | return min(pages, lru_shrink_max); | |
612 | ||
613 | return pages > lru_shrink_min ? lru_shrink_min : 0; | |
614 | } | |
615 | ||
616 | return 0; | |
617 | } | |
618 | ||
619 | /* Return how many pages are not discarded in @pvec. */ | |
620 | static int discard_pagevec(const struct lu_env *env, struct cl_io *io, | |
621 | struct cl_page **pvec, int max_index) | |
622 | { | |
623 | int count; | |
624 | int i; | |
625 | ||
626 | for (count = 0, i = 0; i < max_index; i++) { | |
627 | struct cl_page *page = pvec[i]; | |
50ffcb7e | 628 | |
d7e09d03 PT |
629 | if (cl_page_own_try(env, io, page) == 0) { |
630 | /* free LRU page only if nobody is using it. | |
631 | * This check is necessary to avoid freeing the pages | |
632 | * having already been removed from LRU and pinned | |
633 | * for IO. */ | |
634 | if (!cl_page_in_use(page)) { | |
635 | cl_page_unmap(env, io, page); | |
636 | cl_page_discard(env, io, page); | |
637 | ++count; | |
638 | } | |
639 | cl_page_disown(env, io, page); | |
640 | } | |
641 | cl_page_put(env, page); | |
642 | pvec[i] = NULL; | |
643 | } | |
644 | return max_index - count; | |
645 | } | |
646 | ||
647 | /** | |
648 | * Drop @target of pages from LRU at most. | |
649 | */ | |
650 | int osc_lru_shrink(struct client_obd *cli, int target) | |
651 | { | |
652 | struct cl_env_nest nest; | |
653 | struct lu_env *env; | |
654 | struct cl_io *io; | |
655 | struct cl_object *clobj = NULL; | |
656 | struct cl_page **pvec; | |
657 | struct osc_page *opg; | |
658 | int maxscan = 0; | |
659 | int count = 0; | |
660 | int index = 0; | |
661 | int rc = 0; | |
d7e09d03 PT |
662 | |
663 | LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0); | |
664 | if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0) | |
0a3bdb00 | 665 | return 0; |
d7e09d03 PT |
666 | |
667 | env = cl_env_nested_get(&nest); | |
668 | if (IS_ERR(env)) | |
0a3bdb00 | 669 | return PTR_ERR(env); |
d7e09d03 PT |
670 | |
671 | pvec = osc_env_info(env)->oti_pvec; | |
672 | io = &osc_env_info(env)->oti_io; | |
673 | ||
674 | client_obd_list_lock(&cli->cl_lru_list_lock); | |
675 | atomic_inc(&cli->cl_lru_shrinkers); | |
676 | maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list)); | |
677 | while (!list_empty(&cli->cl_lru_list)) { | |
678 | struct cl_page *page; | |
679 | ||
680 | if (--maxscan < 0) | |
681 | break; | |
682 | ||
683 | opg = list_entry(cli->cl_lru_list.next, struct osc_page, | |
684 | ops_lru); | |
685 | page = cl_page_top(opg->ops_cl.cpl_page); | |
686 | if (cl_page_in_use_noref(page)) { | |
687 | list_move_tail(&opg->ops_lru, &cli->cl_lru_list); | |
688 | continue; | |
689 | } | |
690 | ||
691 | LASSERT(page->cp_obj != NULL); | |
692 | if (clobj != page->cp_obj) { | |
693 | struct cl_object *tmp = page->cp_obj; | |
694 | ||
695 | cl_object_get(tmp); | |
696 | client_obd_list_unlock(&cli->cl_lru_list_lock); | |
697 | ||
698 | if (clobj != NULL) { | |
699 | count -= discard_pagevec(env, io, pvec, index); | |
700 | index = 0; | |
701 | ||
702 | cl_io_fini(env, io); | |
703 | cl_object_put(env, clobj); | |
704 | clobj = NULL; | |
705 | } | |
706 | ||
707 | clobj = tmp; | |
708 | io->ci_obj = clobj; | |
709 | io->ci_ignore_layout = 1; | |
710 | rc = cl_io_init(env, io, CIT_MISC, clobj); | |
711 | ||
712 | client_obd_list_lock(&cli->cl_lru_list_lock); | |
713 | ||
714 | if (rc != 0) | |
715 | break; | |
716 | ||
717 | ++maxscan; | |
718 | continue; | |
719 | } | |
720 | ||
721 | /* move this page to the end of list as it will be discarded | |
722 | * soon. The page will be finally removed from LRU list in | |
723 | * osc_page_delete(). */ | |
724 | list_move_tail(&opg->ops_lru, &cli->cl_lru_list); | |
725 | ||
726 | /* it's okay to grab a refcount here w/o holding lock because | |
727 | * it has to grab cl_lru_list_lock to delete the page. */ | |
728 | cl_page_get(page); | |
729 | pvec[index++] = page; | |
730 | if (++count >= target) | |
731 | break; | |
732 | ||
733 | if (unlikely(index == OTI_PVEC_SIZE)) { | |
734 | client_obd_list_unlock(&cli->cl_lru_list_lock); | |
735 | count -= discard_pagevec(env, io, pvec, index); | |
736 | index = 0; | |
737 | ||
738 | client_obd_list_lock(&cli->cl_lru_list_lock); | |
739 | } | |
740 | } | |
741 | client_obd_list_unlock(&cli->cl_lru_list_lock); | |
742 | ||
743 | if (clobj != NULL) { | |
744 | count -= discard_pagevec(env, io, pvec, index); | |
745 | ||
746 | cl_io_fini(env, io); | |
747 | cl_object_put(env, clobj); | |
748 | } | |
749 | cl_env_nested_put(&nest, env); | |
750 | ||
751 | atomic_dec(&cli->cl_lru_shrinkers); | |
0a3bdb00 | 752 | return count > 0 ? count : rc; |
d7e09d03 PT |
753 | } |
754 | ||
755 | static void osc_lru_add(struct client_obd *cli, struct osc_page *opg) | |
756 | { | |
757 | bool wakeup = false; | |
758 | ||
759 | if (!opg->ops_in_lru) | |
760 | return; | |
761 | ||
762 | atomic_dec(&cli->cl_lru_busy); | |
763 | client_obd_list_lock(&cli->cl_lru_list_lock); | |
764 | if (list_empty(&opg->ops_lru)) { | |
765 | list_move_tail(&opg->ops_lru, &cli->cl_lru_list); | |
766 | atomic_inc_return(&cli->cl_lru_in_list); | |
767 | wakeup = atomic_read(&osc_lru_waiters) > 0; | |
768 | } | |
769 | client_obd_list_unlock(&cli->cl_lru_list_lock); | |
770 | ||
771 | if (wakeup) { | |
772 | osc_lru_shrink(cli, osc_cache_too_much(cli)); | |
773 | wake_up_all(&osc_lru_waitq); | |
774 | } | |
775 | } | |
776 | ||
777 | /* delete page from LRUlist. The page can be deleted from LRUlist for two | |
778 | * reasons: redirtied or deleted from page cache. */ | |
779 | static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del) | |
780 | { | |
781 | if (opg->ops_in_lru) { | |
782 | client_obd_list_lock(&cli->cl_lru_list_lock); | |
783 | if (!list_empty(&opg->ops_lru)) { | |
784 | LASSERT(atomic_read(&cli->cl_lru_in_list) > 0); | |
785 | list_del_init(&opg->ops_lru); | |
786 | atomic_dec(&cli->cl_lru_in_list); | |
787 | if (!del) | |
788 | atomic_inc(&cli->cl_lru_busy); | |
789 | } else if (del) { | |
790 | LASSERT(atomic_read(&cli->cl_lru_busy) > 0); | |
791 | atomic_dec(&cli->cl_lru_busy); | |
792 | } | |
793 | client_obd_list_unlock(&cli->cl_lru_list_lock); | |
794 | if (del) { | |
795 | atomic_inc(cli->cl_lru_left); | |
796 | /* this is a great place to release more LRU pages if | |
797 | * this osc occupies too many LRU pages and kernel is | |
798 | * stealing one of them. | |
799 | * cl_lru_shrinkers is to avoid recursive call in case | |
800 | * we're already in the context of osc_lru_shrink(). */ | |
cad6fafa BJ |
801 | if (atomic_read(&cli->cl_lru_shrinkers) == 0 && |
802 | !memory_pressure_get()) | |
d7e09d03 PT |
803 | osc_lru_shrink(cli, osc_cache_too_much(cli)); |
804 | wake_up(&osc_lru_waitq); | |
805 | } | |
806 | } else { | |
807 | LASSERT(list_empty(&opg->ops_lru)); | |
808 | } | |
809 | } | |
810 | ||
811 | static inline int max_to_shrink(struct client_obd *cli) | |
812 | { | |
813 | return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max); | |
814 | } | |
815 | ||
816 | static int osc_lru_reclaim(struct client_obd *cli) | |
817 | { | |
818 | struct cl_client_cache *cache = cli->cl_cache; | |
819 | int max_scans; | |
820 | int rc; | |
821 | ||
822 | LASSERT(cache != NULL); | |
d7e09d03 PT |
823 | |
824 | rc = osc_lru_shrink(cli, lru_shrink_min); | |
825 | if (rc != 0) { | |
826 | CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n", | |
827 | cli->cl_import->imp_obd->obd_name, rc, cli); | |
828 | return rc; | |
829 | } | |
830 | ||
831 | CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n", | |
832 | cli->cl_import->imp_obd->obd_name, cli, | |
833 | atomic_read(&cli->cl_lru_in_list), | |
834 | atomic_read(&cli->cl_lru_busy)); | |
835 | ||
836 | /* Reclaim LRU slots from other client_obd as it can't free enough | |
837 | * from its own. This should rarely happen. */ | |
838 | spin_lock(&cache->ccc_lru_lock); | |
0df83c18 HN |
839 | LASSERT(!list_empty(&cache->ccc_lru)); |
840 | ||
d7e09d03 PT |
841 | cache->ccc_lru_shrinkers++; |
842 | list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); | |
843 | ||
844 | max_scans = atomic_read(&cache->ccc_users); | |
845 | while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { | |
846 | cli = list_entry(cache->ccc_lru.next, struct client_obd, | |
847 | cl_lru_osc); | |
848 | ||
849 | CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n", | |
850 | cli->cl_import->imp_obd->obd_name, cli, | |
851 | atomic_read(&cli->cl_lru_in_list), | |
852 | atomic_read(&cli->cl_lru_busy)); | |
853 | ||
854 | list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); | |
855 | if (atomic_read(&cli->cl_lru_in_list) > 0) { | |
856 | spin_unlock(&cache->ccc_lru_lock); | |
857 | ||
858 | rc = osc_lru_shrink(cli, max_to_shrink(cli)); | |
859 | spin_lock(&cache->ccc_lru_lock); | |
860 | if (rc != 0) | |
861 | break; | |
862 | } | |
863 | } | |
864 | spin_unlock(&cache->ccc_lru_lock); | |
865 | ||
866 | CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n", | |
867 | cli->cl_import->imp_obd->obd_name, cli, rc); | |
868 | return rc; | |
869 | } | |
870 | ||
871 | static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, | |
872 | struct osc_page *opg) | |
873 | { | |
874 | struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); | |
875 | struct client_obd *cli = osc_cli(obj); | |
876 | int rc = 0; | |
d7e09d03 PT |
877 | |
878 | if (cli->cl_cache == NULL) /* shall not be in LRU */ | |
0a3bdb00 | 879 | return 0; |
d7e09d03 PT |
880 | |
881 | LASSERT(atomic_read(cli->cl_lru_left) >= 0); | |
305ec768 | 882 | while (!atomic_add_unless(cli->cl_lru_left, -1, 0)) { |
d7e09d03 PT |
883 | int gen; |
884 | ||
885 | /* run out of LRU spaces, try to drop some by itself */ | |
886 | rc = osc_lru_reclaim(cli); | |
887 | if (rc < 0) | |
888 | break; | |
889 | if (rc > 0) | |
890 | continue; | |
891 | ||
892 | cond_resched(); | |
893 | ||
894 | /* slowest case, all of caching pages are busy, notifying | |
895 | * other OSCs that we're lack of LRU slots. */ | |
896 | atomic_inc(&osc_lru_waiters); | |
897 | ||
898 | gen = atomic_read(&cli->cl_lru_in_list); | |
899 | rc = l_wait_event(osc_lru_waitq, | |
900 | atomic_read(cli->cl_lru_left) > 0 || | |
901 | (atomic_read(&cli->cl_lru_in_list) > 0 && | |
902 | gen != atomic_read(&cli->cl_lru_in_list)), | |
903 | &lwi); | |
904 | ||
905 | atomic_dec(&osc_lru_waiters); | |
906 | if (rc < 0) | |
907 | break; | |
908 | } | |
909 | ||
910 | if (rc >= 0) { | |
911 | atomic_inc(&cli->cl_lru_busy); | |
912 | opg->ops_in_lru = 1; | |
913 | rc = 0; | |
914 | } | |
915 | ||
0a3bdb00 | 916 | return rc; |
d7e09d03 PT |
917 | } |
918 | ||
919 | /** @} osc */ |