/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, 2015, Intel Corporation.
 *
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * osc cache management.
 *
 * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
 */

#define DEBUG_SUBSYSTEM S_OSC

#include "osc_cl_internal.h"
#include "osc_internal.h"

static int extent_debug; /* set to true for more debugging */

static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
                           int state);
static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
                              struct osc_async_page *oap, int sent, int rc);
static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
                          int cmd);
static int osc_refresh_count(const struct lu_env *env,
                             struct osc_async_page *oap, int cmd);
static int osc_io_unplug_async(const struct lu_env *env,
                               struct client_obd *cli, struct osc_object *osc);
static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
                           unsigned int lost_grant);

static void osc_extent_tree_dump0(int level, struct osc_object *obj,
                                  const char *func, int line);
#define osc_extent_tree_dump(lvl, obj) \
        osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)

/** \addtogroup osc
 * @{
 */

/* ------------------ osc extent ------------------ */
static inline char *ext_flags(struct osc_extent *ext, char *flags)
{
        char *buf = flags;
        *buf++ = ext->oe_rw ? 'r' : 'w';
        if (ext->oe_intree)
                *buf++ = 'i';
        if (ext->oe_srvlock)
                *buf++ = 's';
        if (ext->oe_hp)
                *buf++ = 'h';
        if (ext->oe_urgent)
                *buf++ = 'u';
        if (ext->oe_memalloc)
                *buf++ = 'm';
        if (ext->oe_trunc_pending)
                *buf++ = 't';
        if (ext->oe_fsync_wait)
                *buf++ = 'Y';
        *buf = 0;
        return flags;
}
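
/*
 * Illustration (added for clarity, not in the original source): for a
 * write extent that is in the rbtree and marked urgent, ext_flags() fills
 * the caller's buffer with "wiu"; a read extent with no other flags set
 * yields just "r".  The buffer passed in must therefore hold at least
 * 9 bytes (8 flag characters plus the terminating NUL).
 */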

static inline char list_empty_marker(struct list_head *list)
{
        return list_empty(list) ? '-' : '+';
}

#define EXTSTR "[%lu -> %lu/%lu]"
#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end

static const char *oes_strings[] = {
        "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL };

#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {                     \
        struct osc_extent *__ext = (extent);                            \
        char __buf[16];                                                 \
                                                                        \
        CDEBUG(lvl,                                                     \
               "extent %p@{" EXTSTR ", "                                \
               "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,     \
               /* ----- extent part 0 ----- */                          \
               __ext, EXTPARA(__ext),                                   \
               /* ----- part 1 ----- */                                 \
               atomic_read(&__ext->oe_refc),                            \
               atomic_read(&__ext->oe_users),                           \
               list_empty_marker(&__ext->oe_link),                      \
               oes_strings[__ext->oe_state], ext_flags(__ext, __buf),   \
               __ext->oe_obj,                                           \
               /* ----- part 2 ----- */                                 \
               __ext->oe_grants, __ext->oe_nr_pages,                    \
               list_empty_marker(&__ext->oe_pages),                     \
               waitqueue_active(&__ext->oe_waitq) ? '+' : '-',          \
               __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner,      \
               /* ----- part 4 ----- */                                 \
               ## __VA_ARGS__);                                         \
} while (0)

#undef EASSERTF
#define EASSERTF(expr, ext, fmt, args...) do {                          \
        if (!(expr)) {                                                  \
                OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args);           \
                osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);           \
                LASSERT(expr);                                          \
        }                                                               \
} while (0)

#undef EASSERT
#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")

static inline struct osc_extent *rb_extent(struct rb_node *n)
{
        if (!n)
                return NULL;

        return container_of(n, struct osc_extent, oe_node);
}

static inline struct osc_extent *next_extent(struct osc_extent *ext)
{
        if (!ext)
                return NULL;

        LASSERT(ext->oe_intree);
        return rb_extent(rb_next(&ext->oe_node));
}

static inline struct osc_extent *prev_extent(struct osc_extent *ext)
{
        if (!ext)
                return NULL;

        LASSERT(ext->oe_intree);
        return rb_extent(rb_prev(&ext->oe_node));
}

static inline struct osc_extent *first_extent(struct osc_object *obj)
{
        return rb_extent(rb_first(&obj->oo_root));
}

/* object must be locked by caller. */
static int osc_extent_sanity_check0(struct osc_extent *ext,
                                    const char *func, const int line)
{
        struct osc_object *obj = ext->oe_obj;
        struct osc_async_page *oap;
        int page_count;
        int rc = 0;

        if (!osc_object_is_locked(obj)) {
                rc = 9;
                goto out;
        }

        if (ext->oe_state >= OES_STATE_MAX) {
                rc = 10;
                goto out;
        }

        if (atomic_read(&ext->oe_refc) <= 0) {
                rc = 20;
                goto out;
        }

        if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) {
                rc = 30;
                goto out;
        }

        switch (ext->oe_state) {
        case OES_INV:
                if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
                        rc = 35;
                else
                        rc = 0;
                goto out;
        case OES_ACTIVE:
                if (atomic_read(&ext->oe_users) == 0) {
                        rc = 40;
                        goto out;
                }
                if (ext->oe_hp) {
                        rc = 50;
                        goto out;
                }
                if (ext->oe_fsync_wait && !ext->oe_urgent) {
                        rc = 55;
                        goto out;
                }
                break;
        case OES_CACHE:
                if (ext->oe_grants == 0) {
                        rc = 60;
                        goto out;
                }
                if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) {
                        rc = 65;
                        goto out;
                }
        default:
                if (atomic_read(&ext->oe_users) > 0) {
                        rc = 70;
                        goto out;
                }
        }

        if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) {
                rc = 80;
                goto out;
        }

        if (!ext->oe_osclock && ext->oe_grants > 0) {
                rc = 90;
                goto out;
        }

        if (ext->oe_osclock) {
                struct cl_lock_descr *descr;

                descr = &ext->oe_osclock->cll_descr;
                if (!(descr->cld_start <= ext->oe_start &&
                      descr->cld_end >= ext->oe_max_end)) {
                        rc = 100;
                        goto out;
                }
        }

        if (ext->oe_nr_pages > ext->oe_mppr) {
                rc = 105;
                goto out;
        }

        /* Do not verify page list if extent is in RPC. This is because an
         * in-RPC extent is supposed to be exclusively accessible w/o lock.
         */
        if (ext->oe_state > OES_CACHE) {
                rc = 0;
                goto out;
        }

        if (!extent_debug) {
                rc = 0;
                goto out;
        }

        page_count = 0;
        list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
                pgoff_t index = oap2cl_page(oap)->cp_index;
                ++page_count;
                if (index > ext->oe_end || index < ext->oe_start) {
                        rc = 110;
                        goto out;
                }
        }
        if (page_count != ext->oe_nr_pages) {
                rc = 120;
                goto out;
        }

out:
        if (rc != 0)
                OSC_EXTENT_DUMP(D_ERROR, ext,
                                "%s:%d sanity check %p failed with rc = %d\n",
                                func, line, ext, rc);
        return rc;
}
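
/*
 * Note (added for clarity, not in the original source): each failure site
 * above returns a distinct rc value (9, 10, 20, ... 120), so the
 * "sanity check ... failed with rc = %d" message in the extent dump
 * identifies exactly which invariant was violated without needing a
 * source line number.
 */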

#define sanity_check_nolock(ext) \
        osc_extent_sanity_check0(ext, __func__, __LINE__)

#define sanity_check(ext) ({                            \
        int __res;                                      \
        osc_object_lock((ext)->oe_obj);                 \
        __res = sanity_check_nolock(ext);               \
        osc_object_unlock((ext)->oe_obj);               \
        __res;                                          \
})

/**
 * sanity check - to make sure there is no overlapped extent in the tree.
 */
static int osc_extent_is_overlapped(struct osc_object *obj,
                                    struct osc_extent *ext)
{
        struct osc_extent *tmp;

        LASSERT(osc_object_is_locked(obj));

        if (!extent_debug)
                return 0;

        for (tmp = first_extent(obj); tmp; tmp = next_extent(tmp)) {
                if (tmp == ext)
                        continue;
                if (tmp->oe_end >= ext->oe_start &&
                    tmp->oe_start <= ext->oe_end)
                        return 1;
        }
        return 0;
}

static void osc_extent_state_set(struct osc_extent *ext, int state)
{
        LASSERT(osc_object_is_locked(ext->oe_obj));
        LASSERT(state >= OES_INV && state < OES_STATE_MAX);

        /* Never try to sanity check a state changing extent :-) */
        /* LASSERT(sanity_check_nolock(ext) == 0); */

        /* TODO: validate the state machine */
        ext->oe_state = state;
        wake_up_all(&ext->oe_waitq);
}

static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
{
        struct osc_extent *ext;

        ext = kmem_cache_alloc(osc_extent_kmem, GFP_NOFS | __GFP_ZERO);
        if (!ext)
                return NULL;

        RB_CLEAR_NODE(&ext->oe_node);
        ext->oe_obj = obj;
        atomic_set(&ext->oe_refc, 1);
        atomic_set(&ext->oe_users, 0);
        INIT_LIST_HEAD(&ext->oe_link);
        ext->oe_state = OES_INV;
        INIT_LIST_HEAD(&ext->oe_pages);
        init_waitqueue_head(&ext->oe_waitq);
        ext->oe_osclock = NULL;

        return ext;
}

static void osc_extent_free(struct osc_extent *ext)
{
        kmem_cache_free(osc_extent_kmem, ext);
}

static struct osc_extent *osc_extent_get(struct osc_extent *ext)
{
        LASSERT(atomic_read(&ext->oe_refc) >= 0);
        atomic_inc(&ext->oe_refc);
        return ext;
}

static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
{
        LASSERT(atomic_read(&ext->oe_refc) > 0);
        if (atomic_dec_and_test(&ext->oe_refc)) {
                LASSERT(list_empty(&ext->oe_link));
                LASSERT(atomic_read(&ext->oe_users) == 0);
                LASSERT(ext->oe_state == OES_INV);
                LASSERT(!ext->oe_intree);

                if (ext->oe_osclock) {
                        cl_lock_put(env, ext->oe_osclock);
                        ext->oe_osclock = NULL;
                }
                osc_extent_free(ext);
        }
}

/**
 * osc_extent_put_trust() is a special version of osc_extent_put() for use
 * when it's known that the caller is not the last user. This addresses the
 * problem of lacking a lu_env ;-).
 */
static void osc_extent_put_trust(struct osc_extent *ext)
{
        LASSERT(atomic_read(&ext->oe_refc) > 1);
        LASSERT(osc_object_is_locked(ext->oe_obj));
        atomic_dec(&ext->oe_refc);
}

/**
 * Return the extent which includes pgoff @index, or return the greatest
 * previous extent in the tree.
 */
static struct osc_extent *osc_extent_search(struct osc_object *obj,
                                            pgoff_t index)
{
        struct rb_node *n = obj->oo_root.rb_node;
        struct osc_extent *tmp, *p = NULL;

        LASSERT(osc_object_is_locked(obj));
        while (n) {
                tmp = rb_extent(n);
                if (index < tmp->oe_start) {
                        n = n->rb_left;
                } else if (index > tmp->oe_end) {
                        p = rb_extent(n);
                        n = n->rb_right;
                } else {
                        return tmp;
                }
        }
        return p;
}
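
/*
 * Illustration (hypothetical tree, not from the original source): with
 * extents [0 -> 15] and [32 -> 47] in the rbtree, osc_extent_search(obj, 40)
 * returns [32 -> 47] since it covers index 40, while
 * osc_extent_search(obj, 20) returns [0 -> 15] -- the greatest extent lying
 * entirely before the index.  Callers such as osc_extent_lookup() must
 * therefore re-check that the returned extent actually covers the index.
 */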

/*
 * Return the extent covering @index, otherwise return NULL.
 * The caller must hold the object lock.
 */
static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
                                            pgoff_t index)
{
        struct osc_extent *ext;

        ext = osc_extent_search(obj, index);
        if (ext && ext->oe_start <= index && index <= ext->oe_end)
                return osc_extent_get(ext);
        return NULL;
}

/* the caller must hold the object lock. */
static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
{
        struct rb_node **n = &obj->oo_root.rb_node;
        struct rb_node *parent = NULL;
        struct osc_extent *tmp;

        LASSERT(ext->oe_intree == 0);
        LASSERT(ext->oe_obj == obj);
        LASSERT(osc_object_is_locked(obj));
        while (*n) {
                tmp = rb_extent(*n);
                parent = *n;

                if (ext->oe_end < tmp->oe_start)
                        n = &(*n)->rb_left;
                else if (ext->oe_start > tmp->oe_end)
                        n = &(*n)->rb_right;
                else
                        EASSERTF(0, tmp, EXTSTR, EXTPARA(ext));
        }
        rb_link_node(&ext->oe_node, parent, n);
        rb_insert_color(&ext->oe_node, &obj->oo_root);
        osc_extent_get(ext);
        ext->oe_intree = 1;
}

/* the caller must hold the object lock. */
static void osc_extent_erase(struct osc_extent *ext)
{
        struct osc_object *obj = ext->oe_obj;

        LASSERT(osc_object_is_locked(obj));
        if (ext->oe_intree) {
                rb_erase(&ext->oe_node, &obj->oo_root);
                ext->oe_intree = 0;
                /* rbtree held a refcount */
                osc_extent_put_trust(ext);
        }
}

static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
{
        struct osc_object *obj = ext->oe_obj;

        LASSERT(osc_object_is_locked(obj));
        LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
        if (ext->oe_state == OES_CACHE) {
                osc_extent_state_set(ext, OES_ACTIVE);
                osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
        }
        atomic_inc(&ext->oe_users);
        list_del_init(&ext->oe_link);
        return osc_extent_get(ext);
}

static void __osc_extent_remove(struct osc_extent *ext)
{
        LASSERT(osc_object_is_locked(ext->oe_obj));
        LASSERT(list_empty(&ext->oe_pages));
        osc_extent_erase(ext);
        list_del_init(&ext->oe_link);
        osc_extent_state_set(ext, OES_INV);
        OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
}

static void osc_extent_remove(struct osc_extent *ext)
{
        struct osc_object *obj = ext->oe_obj;

        osc_object_lock(obj);
        __osc_extent_remove(ext);
        osc_object_unlock(obj);
}

/**
 * This function is used to merge extents to get better performance. It checks
 * if @cur and @victim are contiguous at the chunk level.
 */
static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
                            struct osc_extent *victim)
{
        struct osc_object *obj = cur->oe_obj;
        pgoff_t chunk_start;
        pgoff_t chunk_end;
        int ppc_bits;

        LASSERT(cur->oe_state == OES_CACHE);
        LASSERT(osc_object_is_locked(obj));
        if (!victim)
                return -EINVAL;

        if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
                return -EBUSY;

        if (cur->oe_max_end != victim->oe_max_end)
                return -ERANGE;

        LASSERT(cur->oe_osclock == victim->oe_osclock);
        ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
        chunk_start = cur->oe_start >> ppc_bits;
        chunk_end = cur->oe_end >> ppc_bits;
        if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
            chunk_end + 1 != victim->oe_start >> ppc_bits)
                return -ERANGE;

        OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);

        cur->oe_start = min(cur->oe_start, victim->oe_start);
        cur->oe_end = max(cur->oe_end, victim->oe_end);
        cur->oe_grants += victim->oe_grants;
        cur->oe_nr_pages += victim->oe_nr_pages;
        /* only the following bits are needed to merge */
        cur->oe_urgent |= victim->oe_urgent;
        cur->oe_memalloc |= victim->oe_memalloc;
        list_splice_init(&victim->oe_pages, &cur->oe_pages);
        list_del_init(&victim->oe_link);
        victim->oe_nr_pages = 0;

        osc_extent_get(victim);
        __osc_extent_remove(victim);
        osc_extent_put(env, victim);

        OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
        return 0;
}
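
/*
 * Worked example (illustrative values, not from the original source):
 * assuming 4 KiB pages and 64 KiB chunks, ppc_bits = 4.  A current extent
 * covering pages [32 -> 47] occupies chunk 2; a victim covering pages
 * [48 -> 63] occupies chunk 3.  Since chunk_end + 1 equals the victim's
 * start chunk, the -ERANGE test passes and the two merge into [32 -> 63].
 * Had the victim covered pages [64 -> 79] (chunk 4), the chunks would not
 * be adjacent and the merge would be refused.
 */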

/**
 * Drop user count of osc_extent, and unplug IO asynchronously.
 */
void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
{
        struct osc_object *obj = ext->oe_obj;

        LASSERT(atomic_read(&ext->oe_users) > 0);
        LASSERT(sanity_check(ext) == 0);
        LASSERT(ext->oe_grants > 0);

        if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
                LASSERT(ext->oe_state == OES_ACTIVE);
                if (ext->oe_trunc_pending) {
                        /* a truncate process is waiting for this extent.
                         * This may happen due to a race, check
                         * osc_cache_truncate_start().
                         */
                        osc_extent_state_set(ext, OES_TRUNC);
                        ext->oe_trunc_pending = 0;
                } else {
                        osc_extent_state_set(ext, OES_CACHE);
                        osc_update_pending(obj, OBD_BRW_WRITE,
                                           ext->oe_nr_pages);

                        /* try to merge the previous and next extent. */
                        osc_extent_merge(env, ext, prev_extent(ext));
                        osc_extent_merge(env, ext, next_extent(ext));

                        if (ext->oe_urgent)
                                list_move_tail(&ext->oe_link,
                                               &obj->oo_urgent_exts);
                }
                osc_object_unlock(obj);

                osc_io_unplug_async(env, osc_cli(obj), obj);
        }
        osc_extent_put(env, ext);
}

static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
{
        return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
}

/**
 * Find or create an extent which includes @index, the core function to
 * manage the extent tree.
 */
static struct osc_extent *osc_extent_find(const struct lu_env *env,
                                          struct osc_object *obj, pgoff_t index,
                                          int *grants)
{
        struct client_obd *cli = osc_cli(obj);
        struct cl_lock *lock;
        struct osc_extent *cur;
        struct osc_extent *ext;
        struct osc_extent *conflict = NULL;
        struct osc_extent *found = NULL;
        pgoff_t chunk;
        pgoff_t max_end;
        int max_pages; /* max_pages_per_rpc */
        int chunksize;
        int ppc_bits; /* pages per chunk bits */
        int chunk_mask;
        int rc;

        cur = osc_extent_alloc(obj);
        if (!cur)
                return ERR_PTR(-ENOMEM);

        lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
        LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);

        LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
        ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
        chunk_mask = ~((1 << ppc_bits) - 1);
        chunksize = 1 << cli->cl_chunkbits;
        chunk = index >> ppc_bits;

        /* align end to the RPC edge; RPC size may not be a power-of-2 integer. */
        max_pages = cli->cl_max_pages_per_rpc;
        LASSERT((max_pages & ~chunk_mask) == 0);
        max_end = index - (index % max_pages) + max_pages - 1;
        max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);

        /* initialize the new extent from the parameters so far */
        cur->oe_max_end = max_end;
        cur->oe_start = index & chunk_mask;
        cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
        if (cur->oe_start < lock->cll_descr.cld_start)
                cur->oe_start = lock->cll_descr.cld_start;
        if (cur->oe_end > max_end)
                cur->oe_end = max_end;
        cur->oe_osclock = lock;
        cur->oe_grants = 0;
        cur->oe_mppr = max_pages;

        /* grants have been allocated by the caller */
        LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
                 "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
        LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));
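
        /*
         * Worked example of the alignment above (illustrative values, not
         * from the original source): assume 4 KiB pages, 64 KiB chunks
         * (ppc_bits = 4, chunk_mask = ~15) and max_pages = 256.  For
         * index = 37: chunk = 2, oe_start = 32 and oe_end = 47, i.e. the
         * new extent initially spans exactly the chunk holding the page,
         * while max_end = 255 -- the last page of the 1 MiB RPC slot
         * containing the index (before being clamped by the lock boundary).
         */
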
restart:
        osc_object_lock(obj);
        ext = osc_extent_search(obj, cur->oe_start);
        if (!ext)
                ext = first_extent(obj);
        while (ext) {
                loff_t ext_chk_start = ext->oe_start >> ppc_bits;
                loff_t ext_chk_end = ext->oe_end >> ppc_bits;

                LASSERT(sanity_check_nolock(ext) == 0);
                if (chunk > ext_chk_end + 1)
                        break;

                /* if covered by different locks, no chance to match */
                if (lock != ext->oe_osclock) {
                        EASSERTF(!overlapped(ext, cur), ext,
                                 EXTSTR, EXTPARA(cur));

                        ext = next_extent(ext);
                        continue;
                }

                /* discontiguous chunks? */
                if (chunk + 1 < ext_chk_start) {
                        ext = next_extent(ext);
                        continue;
                }

                /* ok, from now on, ext and cur have these attrs:
                 * 1. covered by the same lock
                 * 2. contiguous at chunk level or overlapping.
                 */

                if (overlapped(ext, cur)) {
                        /* cur is the minimum unit, so overlapping means
                         * full containment.
                         */
                        EASSERTF((ext->oe_start <= cur->oe_start &&
                                  ext->oe_end >= cur->oe_end),
                                 ext, EXTSTR, EXTPARA(cur));

                        if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
                                /* for simplicity, we wait for this extent to
                                 * finish before going forward.
                                 */
                                conflict = osc_extent_get(ext);
                                break;
                        }

                        found = osc_extent_hold(ext);
                        break;
                }

                /* non-overlapped extent */
                if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
                        /* we can't do anything for a non OES_CACHE extent, or
                         * if there is someone waiting for this extent to be
                         * flushed, try the next one.
                         */
                        ext = next_extent(ext);
                        continue;
                }

                /* check if they belong to the same rpc slot before trying to
                 * merge. the extents are not overlapped and contiguous at
                 * chunk level to get here.
                 */
                if (ext->oe_max_end != max_end) {
                        /* if they don't belong to the same RPC slot or
                         * max_pages_per_rpc has ever changed, do not merge.
                         */
                        ext = next_extent(ext);
                        continue;
                }

                /* it's required that an extent must be contiguous at chunk
                 * level so that we know the whole extent is covered by grant
                 * (the pages in the extent are NOT required to be contiguous).
                 * Otherwise, it would be too difficult to know which
                 * chunks have grants allocated.
                 */

                /* try to do a front merge - extend ext's start */
                if (chunk + 1 == ext_chk_start) {
                        /* ext must be chunk size aligned */
                        EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);

                        /* pull ext's start back to cover cur */
                        ext->oe_start = cur->oe_start;
                        ext->oe_grants += chunksize;
                        *grants -= chunksize;

                        found = osc_extent_hold(ext);
                } else if (chunk == ext_chk_end + 1) {
                        /* rear merge */
                        ext->oe_end = cur->oe_end;
                        ext->oe_grants += chunksize;
                        *grants -= chunksize;

                        /* try to merge with the next one because we just
                         * filled in a gap
                         */
                        if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
                                /* we can save extent tax from the next extent */
                                *grants += cli->cl_extent_tax;

                        found = osc_extent_hold(ext);
                }
                if (found)
                        break;

                ext = next_extent(ext);
        }

        osc_extent_tree_dump(D_CACHE, obj);
        if (found) {
                LASSERT(!conflict);
                if (!IS_ERR(found)) {
                        LASSERT(found->oe_osclock == cur->oe_osclock);
                        OSC_EXTENT_DUMP(D_CACHE, found,
                                        "found caching ext for %lu.\n", index);
                }
        } else if (!conflict) {
                /* create a new extent */
                EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
                cur->oe_grants = chunksize + cli->cl_extent_tax;
                *grants -= cur->oe_grants;
                LASSERT(*grants >= 0);

                cur->oe_state = OES_CACHE;
                found = osc_extent_hold(cur);
                osc_extent_insert(obj, cur);
                OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
                                index, lock->cll_descr.cld_end);
        }
        osc_object_unlock(obj);

        if (conflict) {
                LASSERT(!found);

                /* waiting for IO to finish. Please notice that it's impossible
                 * to be an OES_TRUNC extent.
                 */
                rc = osc_extent_wait(env, conflict, OES_INV);
                osc_extent_put(env, conflict);
                conflict = NULL;
                if (rc < 0) {
                        found = ERR_PTR(rc);
                        goto out;
                }

                goto restart;
        }

out:
        osc_extent_put(env, cur);
        LASSERT(*grants >= 0);
        return found;
}

/**
 * Called when IO is finished to an extent.
 */
int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
                      int sent, int rc)
{
        struct client_obd *cli = osc_cli(ext->oe_obj);
        struct osc_async_page *oap;
        struct osc_async_page *tmp;
        int nr_pages = ext->oe_nr_pages;
        int lost_grant = 0;
        int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
        __u64 last_off = 0;
        int last_count = -1;

        OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");

        ext->oe_rc = rc ?: ext->oe_nr_pages;
        EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
        list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) {
                list_del_init(&oap->oap_rpc_item);
                list_del_init(&oap->oap_pending_item);
                if (last_off <= oap->oap_obj_off) {
                        last_off = oap->oap_obj_off;
                        last_count = oap->oap_count;
                }

                --ext->oe_nr_pages;
                osc_ap_completion(env, cli, oap, sent, rc);
        }
        EASSERT(ext->oe_nr_pages == 0, ext);

        if (!sent) {
                lost_grant = ext->oe_grants;
        } else if (blocksize < PAGE_CACHE_SIZE &&
                   last_count != PAGE_CACHE_SIZE) {
                /* For short writes we shouldn't count parts of pages that
                 * span a whole chunk on the OST side, or our accounting goes
                 * wrong. Should match the code in filter_grant_check.
                 */
                int offset = oap->oap_page_off & ~CFS_PAGE_MASK;
                int count = oap->oap_count + (offset & (blocksize - 1));
                int end = (offset + oap->oap_count) & (blocksize - 1);

                if (end)
                        count += blocksize - end;

                lost_grant = PAGE_CACHE_SIZE - count;
        }
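
        /*
         * Worked example of the short-write accounting above (illustrative
         * values, not from the original source): with a 4096-byte page and
         * blocksize = 1024, a write of count = 100 bytes at page offset 0
         * rounds up to one full block: count becomes 100 + (1024 - 100) =
         * 1024, so lost_grant = 4096 - 1024 = 3072 bytes are handed back,
         * matching the single block the server actually charged for.
         */
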
        if (ext->oe_grants > 0)
                osc_free_grant(cli, nr_pages, lost_grant);

        osc_extent_remove(ext);
        /* put the refcount for RPC */
        osc_extent_put(env, ext);
        return 0;
}

static int extent_wait_cb(struct osc_extent *ext, int state)
{
        int ret;

        osc_object_lock(ext->oe_obj);
        ret = ext->oe_state == state;
        osc_object_unlock(ext->oe_obj);

        return ret;
}

/**
 * Wait for the extent's state to become @state.
 */
static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
                           int state)
{
        struct osc_object *obj = ext->oe_obj;
        struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
                                                  LWI_ON_SIGNAL_NOOP, NULL);
        int rc = 0;

        osc_object_lock(obj);
        LASSERT(sanity_check_nolock(ext) == 0);
        /* `Kick' this extent only if the caller is waiting for it to be
         * written out.
         */
        if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp &&
            !ext->oe_trunc_pending) {
                if (ext->oe_state == OES_ACTIVE) {
                        ext->oe_urgent = 1;
                } else if (ext->oe_state == OES_CACHE) {
                        ext->oe_urgent = 1;
                        osc_extent_hold(ext);
                        rc = 1;
                }
        }
        osc_object_unlock(obj);
        if (rc == 1)
                osc_extent_release(env, ext);

        /* wait for the extent until its state becomes @state */
        rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
        if (rc == -ETIMEDOUT) {
                OSC_EXTENT_DUMP(D_ERROR, ext,
                                "%s: wait ext to %d timedout, recovery in progress?\n",
                                osc_export(obj)->exp_obd->obd_name, state);

                lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
                rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
                                  &lwi);
        }
        if (rc == 0 && ext->oe_rc < 0)
                rc = ext->oe_rc;
        return rc;
}

/**
 * Discard pages with index greater than @trunc_index. If @ext is overlapped
 * with @trunc_index, then a partial truncate happens.
 */
static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
                               bool partial)
{
        struct cl_env_nest nest;
        struct lu_env *env;
        struct cl_io *io;
        struct osc_object *obj = ext->oe_obj;
        struct client_obd *cli = osc_cli(obj);
        struct osc_async_page *oap;
        struct osc_async_page *tmp;
        int pages_in_chunk = 0;
        int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
        __u64 trunc_chunk = trunc_index >> ppc_bits;
        int grants = 0;
        int nr_pages = 0;
        int rc = 0;

        LASSERT(sanity_check(ext) == 0);
        EASSERT(ext->oe_state == OES_TRUNC, ext);
        EASSERT(!ext->oe_urgent, ext);

        /* Request a new lu_env.
         * We can't use the env from osc_cache_truncate_start() because
         * it's from lov_io_sub and not fully initialized.
         */
        env = cl_env_nested_get(&nest);
        io = &osc_env_info(env)->oti_io;
        io->ci_obj = cl_object_top(osc2cl(obj));
        rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
        if (rc < 0)
                goto out;

        /* discard all pages with index greater than trunc_index */
        list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) {
                struct cl_page *sub = oap2cl_page(oap);
                struct cl_page *page = cl_page_top(sub);

                LASSERT(list_empty(&oap->oap_rpc_item));

                /* only discard the pages with their index greater than
                 * trunc_index, and ...
                 */
                if (sub->cp_index < trunc_index ||
                    (sub->cp_index == trunc_index && partial)) {
                        /* accounting how many pages remain in the chunk
                         * so that we can calculate grants correctly.
                         */
                        if (sub->cp_index >> ppc_bits == trunc_chunk)
                                ++pages_in_chunk;
                        continue;
                }

                list_del_init(&oap->oap_pending_item);

                cl_page_get(page);
                lu_ref_add(&page->cp_reference, "truncate", current);

                if (cl_page_own(env, io, page) == 0) {
                        cl_page_unmap(env, io, page);
                        cl_page_discard(env, io, page);
                        cl_page_disown(env, io, page);
                } else {
                        LASSERT(page->cp_state == CPS_FREEING);
                        LASSERT(0);
                }

                lu_ref_del(&page->cp_reference, "truncate", current);
                cl_page_put(env, page);

                --ext->oe_nr_pages;
                ++nr_pages;
        }
        EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
                      ext->oe_nr_pages == 0),
                 ext, "trunc_index %lu, partial %d\n", trunc_index, partial);

        osc_object_lock(obj);
        if (ext->oe_nr_pages == 0) {
                LASSERT(pages_in_chunk == 0);
                grants = ext->oe_grants;
                ext->oe_grants = 0;
        } else { /* calculate how many grants we can free */
                int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
                pgoff_t last_index;

                /* if there are no pages in this chunk, we can also free grants
                 * for the last chunk
                 */
                if (pages_in_chunk == 0) {
                        /* if this is the 1st chunk and no pages in this chunk,
                         * ext->oe_nr_pages must be zero, so we should be in
                         * the other if-clause.
                         */
                        LASSERT(trunc_chunk > 0);
                        --trunc_chunk;
                        ++chunks;
                }

                /* this is what we can free from this extent */
                grants = chunks << cli->cl_chunkbits;
                ext->oe_grants -= grants;
                last_index = ((trunc_chunk + 1) << ppc_bits) - 1;
                ext->oe_end = min(last_index, ext->oe_max_end);
                LASSERT(ext->oe_end >= ext->oe_start);
                LASSERT(ext->oe_grants > 0);
        }
        osc_object_unlock(obj);

        if (grants > 0 || nr_pages > 0)
                osc_free_grant(cli, nr_pages, grants);

out:
        cl_io_fini(env, io);
        cl_env_nested_put(&nest, env);
        return rc;
}
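
/*
 * Worked example for the grant-freeing arithmetic above (illustrative
 * values, not from the original source): with 64 KiB chunks and 4 KiB
 * pages (ppc_bits = 4), take an extent covering pages [0 -> 47] (chunks
 * 0..2) truncated at trunc_index = 20 (chunk 1).  chunks = 2 - 1 = 1, so
 * one chunk's worth of grant (64 KiB) is freed if pages remain in chunk 1;
 * if chunk 1 ends up empty, trunc_chunk drops to 0, chunks becomes 2, and
 * 128 KiB is freed.  oe_end is then pulled back to the last page of the
 * chunk that still holds pages.
 */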

/**
 * This function is used to make the extent prepared for transfer.
 * A race with a flushing page - ll_writepage() - has to be handled
 * cautiously.
 */
static int osc_extent_make_ready(const struct lu_env *env,
                                 struct osc_extent *ext)
{
        struct osc_async_page *oap;
        struct osc_async_page *last = NULL;
        struct osc_object *obj = ext->oe_obj;
        int page_count = 0;
        int rc;

        /* we're going to grab page lock, so object lock must not be taken. */
        LASSERT(sanity_check(ext) == 0);
        /* in locking state, any process should not touch this extent. */
        EASSERT(ext->oe_state == OES_LOCKING, ext);
        EASSERT(ext->oe_owner, ext);

        OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");

        list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
                ++page_count;
                if (!last || last->oap_obj_off < oap->oap_obj_off)
                        last = oap;

                /* checking ASYNC_READY is race safe */
                if ((oap->oap_async_flags & ASYNC_READY) != 0)
                        continue;

                rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
                switch (rc) {
                case 0:
                        spin_lock(&oap->oap_lock);
                        oap->oap_async_flags |= ASYNC_READY;
                        spin_unlock(&oap->oap_lock);
                        break;
                case -EALREADY:
                        LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
                        break;
                default:
                        LASSERTF(0, "unknown return code: %d\n", rc);
                }
        }

        LASSERT(page_count == ext->oe_nr_pages);
        LASSERT(last);
        /* the last page is the only one whose count needs to be refreshed
         * against the size of the file.
         */
        if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
                last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
                LASSERT(last->oap_count > 0);
                LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
                last->oap_async_flags |= ASYNC_COUNT_STABLE;
        }

        /* for the rest of the pages, we don't need to call osc_refresh_count()
         * because it's known they are not the last page
         */
        list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
                if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
                        oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
                        oap->oap_async_flags |= ASYNC_COUNT_STABLE;
                }
        }

        osc_object_lock(obj);
        osc_extent_state_set(ext, OES_RPC);
        osc_object_unlock(obj);
        /* get a refcount for RPC. */
        osc_extent_get(ext);

        return 0;
}

/**
 * Quick and simple version of osc_extent_find(). This function is frequently
 * called to expand the extent for the same IO. To expand the extent, the
 * page index must be in the same chunk as, or the chunk right after,
 * ext->oe_end.
 */
static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
{
        struct osc_object *obj = ext->oe_obj;
        struct client_obd *cli = osc_cli(obj);
        struct osc_extent *next;
        int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
        pgoff_t chunk = index >> ppc_bits;
        pgoff_t end_chunk;
        pgoff_t end_index;
        int chunksize = 1 << cli->cl_chunkbits;
        int rc = 0;

        LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
        osc_object_lock(obj);
        LASSERT(sanity_check_nolock(ext) == 0);
        end_chunk = ext->oe_end >> ppc_bits;
        if (chunk > end_chunk + 1) {
                rc = -ERANGE;
                goto out;
        }

        if (end_chunk >= chunk) {
                rc = 0;
                goto out;
        }

        LASSERT(end_chunk + 1 == chunk);
        /* try to expand this extent to cover @index */
        end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);

        next = next_extent(ext);
        if (next && next->oe_start <= end_index) {
                /* complex mode - overlapped with the next extent,
                 * this case will be handled by osc_extent_find()
                 */
                rc = -EAGAIN;
                goto out;
        }

        ext->oe_end = end_index;
        ext->oe_grants += chunksize;
        *grants -= chunksize;
        LASSERT(*grants >= 0);
        EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
                 "overlapped after expanding for %lu.\n", index);

out:
        osc_object_unlock(obj);
        return rc;
}
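
/*
 * Illustration (hypothetical numbers, not from the original source): with
 * ppc_bits = 4, an extent ending at page 47 (end_chunk = 2) can absorb a
 * write at index 50 (chunk 3): oe_end grows to 63 and one chunksize of
 * grant is consumed.  A write at index 70 (chunk 4) is not adjacent and
 * returns -ERANGE, leaving the caller to take the full osc_extent_find()
 * path instead.
 */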

static void osc_extent_tree_dump0(int level, struct osc_object *obj,
                                  const char *func, int line)
{
        struct osc_extent *ext;
        int cnt;

        CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
               obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);

        /* osc_object_lock(obj); */
        cnt = 1;
        for (ext = first_extent(obj); ext; ext = next_extent(ext))
                OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++);

        cnt = 1;
        list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
                OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++);

        cnt = 1;
        list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
                OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++);

        cnt = 1;
        list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
                OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++);
        /* osc_object_unlock(obj); */
}

/* ------------------ osc extent end ------------------ */

static inline int osc_is_ready(struct osc_object *osc)
{
        return !list_empty(&osc->oo_ready_item) ||
               !list_empty(&osc->oo_hp_ready_item);
}

#define OSC_IO_DEBUG(OSC, STR, args...)                                 \
        CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \
               (OSC), osc_is_ready(OSC),                                \
               list_empty_marker(&(OSC)->oo_hp_ready_item),             \
               list_empty_marker(&(OSC)->oo_ready_item),                \
               atomic_read(&(OSC)->oo_nr_writes),                       \
               list_empty_marker(&(OSC)->oo_hp_exts),                   \
               list_empty_marker(&(OSC)->oo_urgent_exts),               \
               atomic_read(&(OSC)->oo_nr_reads),                        \
               list_empty_marker(&(OSC)->oo_reading_exts),              \
               ##args)

static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
                          int cmd)
{
        struct osc_page *opg = oap2osc_page(oap);
        struct cl_page *page = cl_page_top(oap2cl_page(oap));
        int result;

        LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */

        result = cl_page_make_ready(env, page, CRT_WRITE);
        if (result == 0)
                opg->ops_submit_time = cfs_time_current();
        return result;
}

static int osc_refresh_count(const struct lu_env *env,
                             struct osc_async_page *oap, int cmd)
{
        struct osc_page *opg = oap2osc_page(oap);
        struct cl_page *page = oap2cl_page(oap);
        struct cl_object *obj;
        struct cl_attr *attr = &osc_env_info(env)->oti_attr;

        int result;
        loff_t kms;

        /* readpage queues with _COUNT_STABLE, shouldn't get here. */
        LASSERT(!(cmd & OBD_BRW_READ));
        obj = opg->ops_cl.cpl_obj;

        cl_object_attr_lock(obj);
        result = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);
        if (result < 0)
                return result;
        kms = attr->cat_kms;
        if (cl_offset(obj, page->cp_index) >= kms)
                /* catch race with truncate */
                return 0;
        else if (cl_offset(obj, page->cp_index + 1) > kms)
                /* catch sub-page write at end of file */
                return kms % PAGE_CACHE_SIZE;
        else
                return PAGE_CACHE_SIZE;
}
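
/*
 * Illustration (hypothetical numbers, not from the original source): with
 * 4 KiB pages and a known minimum size (kms) of 10000 bytes, pages 0 and 1
 * lie entirely below kms and get a full PAGE_CACHE_SIZE count; page 2
 * straddles kms and gets 10000 % 4096 = 1808 bytes; page 3 starts at or
 * beyond kms and gets 0, catching a racing truncate.
 */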

static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
                          int cmd, int rc)
{
        struct osc_page *opg = oap2osc_page(oap);
        struct cl_page *page = cl_page_top(oap2cl_page(oap));
        struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
        enum cl_req_type crt;
        int srvlock;

        cmd &= ~OBD_BRW_NOQUOTA;
        LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ));
        LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
        LASSERT(opg->ops_transfer_pinned);

        /*
         * page->cp_req can be NULL if io submission failed before
         * cl_req was allocated.
         */
        if (page->cp_req)
                cl_req_page_done(env, page);
        LASSERT(!page->cp_req);

        crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
        /* Clear opg->ops_transfer_pinned before VM lock is released. */
        opg->ops_transfer_pinned = 0;

        spin_lock(&obj->oo_seatbelt);
        LASSERT(opg->ops_submitter);
        LASSERT(!list_empty(&opg->ops_inflight));
        list_del_init(&opg->ops_inflight);
        opg->ops_submitter = NULL;
        spin_unlock(&obj->oo_seatbelt);

        opg->ops_submit_time = 0;
        srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;

        /* statistic */
        if (rc == 0 && srvlock) {
                struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev;
                struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
                int bytes = oap->oap_count;

                if (crt == CRT_READ)
                        stats->os_lockless_reads += bytes;
                else
                        stats->os_lockless_writes += bytes;
        }

        /*
         * This has to be the last operation with the page, as locks are
         * released in cl_page_completion() and nothing except for the
         * reference counter protects page from concurrent reclaim.
         */
        lu_ref_del(&page->cp_reference, "transfer", page);

        cl_page_completion(env, page, crt, rc);

        return 0;
}

#define OSC_DUMP_GRANT(cli, fmt, args...) do {                          \
        struct client_obd *__tmp = (cli);                               \
        CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "      \
               "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt, \
               __tmp->cl_import->imp_obd->obd_name,                     \
               __tmp->cl_dirty, __tmp->cl_dirty_max,                    \
               atomic_read(&obd_dirty_pages), obd_max_dirty_pages,      \
               __tmp->cl_lost_grant, __tmp->cl_avail_grant,             \
               __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args); \
} while (0)

/* caller must hold loi_list_lock */
static void osc_consume_write_grant(struct client_obd *cli,
                                    struct brw_page *pga)
{
        assert_spin_locked(&cli->cl_loi_list_lock.lock);
        LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
        atomic_inc(&obd_dirty_pages);
        cli->cl_dirty += PAGE_CACHE_SIZE;
        pga->flag |= OBD_BRW_FROM_GRANT;
        CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
               PAGE_CACHE_SIZE, pga, pga->pg);
        osc_update_next_shrink(cli);
}

/* the companion to osc_consume_write_grant, called when a brw has completed.
 * must be called with the loi lock held.
 */
static void osc_release_write_grant(struct client_obd *cli,
                                    struct brw_page *pga)
{
        assert_spin_locked(&cli->cl_loi_list_lock.lock);
        if (!(pga->flag & OBD_BRW_FROM_GRANT))
                return;

        pga->flag &= ~OBD_BRW_FROM_GRANT;
        atomic_dec(&obd_dirty_pages);
        cli->cl_dirty -= PAGE_CACHE_SIZE;
        if (pga->flag & OBD_BRW_NOCACHE) {
                pga->flag &= ~OBD_BRW_NOCACHE;
                atomic_dec(&obd_dirty_transit_pages);
                cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
        }
}

/**
 * To avoid sleeping with the object lock held, it's good for us to allocate
 * enough grants before entering the critical section.
 *
 * client_obd_list_lock held by the caller
 */
static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
{
        int rc = -EDQUOT;

        if (cli->cl_avail_grant >= bytes) {
                cli->cl_avail_grant -= bytes;
                cli->cl_reserved_grant += bytes;
                rc = 0;
        }
        return rc;
}
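
/*
 * Illustration (hypothetical numbers, not from the original source): with
 * cl_avail_grant = 1 MiB, osc_reserve_grant(cli, 128 << 10) moves 128 KiB
 * from cl_avail_grant to cl_reserved_grant and returns 0; a second call
 * asking for 2 MiB leaves both counters untouched and returns -EDQUOT.
 * Each successful reservation is expected to be balanced later by the
 * unreserve/free paths below.
 */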

static void __osc_unreserve_grant(struct client_obd *cli,
                                  unsigned int reserved, unsigned int unused)
{
        /* it's quite normal for us to get more grant than reserved.
         * Think of a case where two extents get merged by adding a new
         * chunk: we save one extent tax. If the extent tax is greater than
         * one chunk, we can save even more grant by adding a new chunk.
         */
        cli->cl_reserved_grant -= reserved;
        if (unused > reserved) {
                cli->cl_avail_grant += reserved;
                cli->cl_lost_grant += unused - reserved;
        } else {
                cli->cl_avail_grant += unused;
        }
}
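
/*
 * Illustration of the "unused > reserved" branch (hypothetical numbers,
 * not from the original source): a caller reserved 4 KiB but, thanks to an
 * extent merge that saved one extent tax, ends up not using 6 KiB.  The
 * 4 KiB goes back to cl_avail_grant while the extra 2 KiB is booked in
 * cl_lost_grant, to be returned to the OST later.
 */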

static void osc_unreserve_grant(struct client_obd *cli,
                                unsigned int reserved, unsigned int unused)
{
        client_obd_list_lock(&cli->cl_loi_list_lock);
        __osc_unreserve_grant(cli, reserved, unused);
        if (unused > 0)
                osc_wake_cache_waiters(cli);
        client_obd_list_unlock(&cli->cl_loi_list_lock);
}

/**
 * Free grant after IO is finished or canceled.
 *
 * @lost_grant is used to remember how many grants we have allocated but not
 * used; we should return these grants to the OST. There are two cases where
 * grants can be lost:
 * 1. truncate;
 * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was
 *    written. In this case the OST may use fewer chunks to serve this partial
 *    write. OSTs don't actually know the page size on the client side, so
 *    clients have to calculate lost grant by the blocksize on the OST.
 *    See filter_grant_check() for details.
 */
static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
                           unsigned int lost_grant)
{
        int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;

        client_obd_list_lock(&cli->cl_loi_list_lock);
        atomic_sub(nr_pages, &obd_dirty_pages);
        cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
        cli->cl_lost_grant += lost_grant;
        if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
                /* borrow some grant from truncate to avoid the case that
                 * truncate uses up all avail grant
                 */
                cli->cl_lost_grant -= grant;
                cli->cl_avail_grant += grant;
        }
        osc_wake_cache_waiters(cli);
        client_obd_list_unlock(&cli->cl_loi_list_lock);
        CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
               lost_grant, cli->cl_lost_grant,
               cli->cl_avail_grant, cli->cl_dirty);
}
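
/*
 * Note on the "borrow" above (added for clarity, not in the original
 * source): if available grant has dropped below one chunk-plus-tax while
 * lost grant (e.g. from truncates) has accumulated at least that much,
 * one allocation unit is shifted from cl_lost_grant back to
 * cl_avail_grant so that a heavy truncate workload cannot starve new
 * cached writes.
 */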

/**
 * The companion to osc_enter_cache(), called when @oap is no longer part of
 * the dirty accounting due to error.
 */
static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
{
        client_obd_list_lock(&cli->cl_loi_list_lock);
        osc_release_write_grant(cli, &oap->oap_brw_page);
        client_obd_list_unlock(&cli->cl_loi_list_lock);
}

/**
 * Non-blocking version of osc_enter_cache() that consumes grant only when it
 * is available.
 */
static int osc_enter_cache_try(struct client_obd *cli,
                               struct osc_async_page *oap,
                               int bytes, int transient)
{
        int rc;

        OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);

        rc = osc_reserve_grant(cli, bytes);
        if (rc < 0)
                return 0;

        if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max &&
            atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
                osc_consume_write_grant(cli, &oap->oap_brw_page);
                if (transient) {
                        cli->cl_dirty_transit += PAGE_CACHE_SIZE;
                        atomic_inc(&obd_dirty_transit_pages);
                        oap->oap_brw_flags |= OBD_BRW_NOCACHE;
                }
                rc = 1;
        } else {
                __osc_unreserve_grant(cli, bytes, bytes);
                rc = 0;
        }
        return rc;
}
1529 | ||
1530 | static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) | |
1531 | { | |
1532 | int rc; | |
50ffcb7e | 1533 | |
d7e09d03 PT |
1534 | client_obd_list_lock(&cli->cl_loi_list_lock); |
1535 | rc = list_empty(&ocw->ocw_entry); | |
1536 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1537 | return rc; | |
1538 | } | |
1539 | ||
1540 | /** | |
1541 | * The main entry to reserve dirty page accounting. Usually the grant reserved | |
1542 | * in this function will be freed in bulk in osc_free_grant(), unless adding | |
1543 | * to the osc cache fails, in which case it will be freed in osc_exit_cache(). | |
1544 | * | |
1545 | * The process will be put to sleep if it has already run out of grant. | |
1546 | */ | |
1547 | static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, | |
1548 | struct osc_async_page *oap, int bytes) | |
1549 | { | |
1550 | struct osc_object *osc = oap->oap_obj; | |
29ac6840 | 1551 | struct lov_oinfo *loi = osc->oo_oinfo; |
d7e09d03 PT |
1552 | struct osc_cache_waiter ocw; |
1553 | struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); | |
1554 | int rc = -EDQUOT; | |
d7e09d03 PT |
1555 | |
1556 | OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); | |
1557 | ||
1558 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1559 | ||
1560 | /* force the caller to try sync io. this can jump the list | |
30aa9c52 OD |
1561 | * of queued writes and create a discontiguous rpc stream |
1562 | */ | |
d7e09d03 PT |
1563 | if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || |
1564 | cli->cl_dirty_max < PAGE_CACHE_SIZE || | |
490e0e89 JL |
1565 | cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) { |
1566 | rc = -EDQUOT; | |
1567 | goto out; | |
1568 | } | |
d7e09d03 PT |
1569 | |
1570 | /* Hopefully normal case - cache space and write credits available */ | |
490e0e89 JL |
1571 | if (osc_enter_cache_try(cli, oap, bytes, 0)) { |
1572 | rc = 0; | |
1573 | goto out; | |
1574 | } | |
d7e09d03 PT |
1575 | |
1576 | /* We can get here for two reasons: too many dirty pages in cache, or | |
1577 | * we have run out of grants. In both cases we should write dirty pages out. | |
1578 | * Adding a cache waiter will trigger urgent write-out no matter what | |
1579 | * RPC size will be. | |
1580 | * The exit condition is: no available grants and no dirty pages cached; | |
30aa9c52 OD |
1581 | * that really means there is no space on the OST. |
1582 | */ | |
d7e09d03 PT |
1583 | init_waitqueue_head(&ocw.ocw_waitq); |
1584 | ocw.ocw_oap = oap; | |
1585 | ocw.ocw_grant = bytes; | |
1586 | while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { | |
1587 | list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); | |
1588 | ocw.ocw_rc = 0; | |
1589 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1590 | ||
1591 | osc_io_unplug_async(env, cli, NULL); | |
1592 | ||
1593 | CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", | |
1594 | cli->cl_import->imp_obd->obd_name, &ocw, oap); | |
1595 | ||
1596 | rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); | |
1597 | ||
1598 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1599 | ||
1600 | /* l_wait_event was interrupted by a signal */ | |
1601 | if (rc < 0) { | |
1602 | list_del_init(&ocw.ocw_entry); | |
490e0e89 | 1603 | goto out; |
d7e09d03 PT |
1604 | } |
1605 | ||
1606 | LASSERT(list_empty(&ocw.ocw_entry)); | |
1607 | rc = ocw.ocw_rc; | |
1608 | ||
1609 | if (rc != -EDQUOT) | |
490e0e89 JL |
1610 | goto out; |
1611 | if (osc_enter_cache_try(cli, oap, bytes, 0)) { | |
1612 | rc = 0; | |
1613 | goto out; | |
1614 | } | |
d7e09d03 | 1615 | } |
d7e09d03 PT |
1616 | out: |
1617 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1618 | OSC_DUMP_GRANT(cli, "returned %d.\n", rc); | |
0a3bdb00 | 1619 | return rc; |
d7e09d03 PT |
1620 | } |
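/*
 * Illustrative sketch, not part of the original file: the shape of the
 * enter-cache protocol above, modeled in userspace with a pthread
 * condition variable standing in for the ocw waitqueue. try_consume()
 * is an assumed stand-in for osc_enter_cache_try().
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t grant_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t grant_cond = PTHREAD_COND_INITIALIZER;
static long avail_grant;

static bool try_consume(long bytes)
{
	if (avail_grant >= bytes) {	/* fast path: credit available */
		avail_grant -= bytes;
		return true;
	}
	return false;
}

static int enter_cache(long bytes)
{
	pthread_mutex_lock(&grant_lock);
	while (!try_consume(bytes)) {
		/* the real code kicks urgent writeback here, then sleeps
		 * until osc_wake_cache_waiters() reports returned grant */
		pthread_cond_wait(&grant_cond, &grant_lock);
	}
	pthread_mutex_unlock(&grant_lock);
	return 0;
}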
1621 | ||
1622 | /* caller must hold loi_list_lock */ | |
1623 | void osc_wake_cache_waiters(struct client_obd *cli) | |
1624 | { | |
1625 | struct list_head *l, *tmp; | |
1626 | struct osc_cache_waiter *ocw; | |
1627 | ||
d7e09d03 PT |
1628 | list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { |
1629 | ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); | |
1630 | list_del_init(&ocw->ocw_entry); | |
1631 | ||
1632 | ocw->ocw_rc = -EDQUOT; | |
1633 | /* we can't dirty more */ | |
c52f69c5 OD |
1634 | if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) || |
1635 | (atomic_read(&obd_dirty_pages) + 1 > | |
1636 | obd_max_dirty_pages)) { | |
2d00bd17 JP |
1637 | CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n", |
1638 | cli->cl_dirty, | |
d7e09d03 PT |
1639 | cli->cl_dirty_max, obd_max_dirty_pages); |
1640 | goto wakeup; | |
1641 | } | |
1642 | ||
1643 | ocw->ocw_rc = 0; | |
1644 | if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) | |
1645 | ocw->ocw_rc = -EDQUOT; | |
1646 | ||
1647 | wakeup: | |
1648 | CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", | |
1649 | ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); | |
1650 | ||
1651 | wake_up(&ocw->ocw_waitq); | |
1652 | } | |
d7e09d03 PT |
1653 | } |
1654 | ||
1655 | static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) | |
1656 | { | |
1657 | int hprpc = !!list_empty(&osc->oo_hp_exts); | |
50ffcb7e | 1658 | |
d7e09d03 PT |
1659 | return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; |
1660 | } | |
1661 | ||
1662 | /* This maintains the lists of pending pages to read/write for a given object | |
1663 | * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() | |
30aa9c52 OD |
1664 | * to quickly find objects that are ready to send an RPC. |
1665 | */ | |
d7e09d03 PT |
1666 | static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, |
1667 | int cmd) | |
1668 | { | |
1669 | int invalid_import = 0; | |
d7e09d03 PT |
1670 | |
1671 | /* if we have an invalid import we want to drain the queued pages | |
1672 | * by forcing them through rpcs that immediately fail and complete | |
1673 | * the pages. recovery relies on this to empty the queued pages | |
30aa9c52 OD |
1674 | * before canceling the locks and evicting the llite pages |
1675 | */ | |
7f1ae4c0 | 1676 | if (!cli->cl_import || cli->cl_import->imp_invalid) |
d7e09d03 PT |
1677 | invalid_import = 1; |
1678 | ||
1679 | if (cmd & OBD_BRW_WRITE) { | |
1680 | if (atomic_read(&osc->oo_nr_writes) == 0) | |
0a3bdb00 | 1681 | return 0; |
d7e09d03 PT |
1682 | if (invalid_import) { |
1683 | CDEBUG(D_CACHE, "invalid import forcing RPC\n"); | |
0a3bdb00 | 1684 | return 1; |
d7e09d03 PT |
1685 | } |
1686 | if (!list_empty(&osc->oo_hp_exts)) { | |
1687 | CDEBUG(D_CACHE, "high prio request forcing RPC\n"); | |
0a3bdb00 | 1688 | return 1; |
d7e09d03 PT |
1689 | } |
1690 | if (!list_empty(&osc->oo_urgent_exts)) { | |
1691 | CDEBUG(D_CACHE, "urgent request forcing RPC\n"); | |
0a3bdb00 | 1692 | return 1; |
d7e09d03 PT |
1693 | } |
1694 | /* trigger a write rpc stream as long as there are dirtiers | |
1695 | * waiting for space. as they're waiting, they're not going to | |
30aa9c52 OD |
1696 | * create more pages to coalesce with what's waiting. |
1697 | */ | |
d7e09d03 PT |
1698 | if (!list_empty(&cli->cl_cache_waiters)) { |
1699 | CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); | |
0a3bdb00 | 1700 | return 1; |
d7e09d03 PT |
1701 | } |
1702 | if (atomic_read(&osc->oo_nr_writes) >= | |
1703 | cli->cl_max_pages_per_rpc) | |
0a3bdb00 | 1704 | return 1; |
d7e09d03 PT |
1705 | } else { |
1706 | if (atomic_read(&osc->oo_nr_reads) == 0) | |
0a3bdb00 | 1707 | return 0; |
d7e09d03 PT |
1708 | if (invalid_import) { |
1709 | CDEBUG(D_CACHE, "invalid import forcing RPC\n"); | |
0a3bdb00 | 1710 | return 1; |
d7e09d03 PT |
1711 | } |
1712 | /* all reads are urgent. */ | |
1713 | if (!list_empty(&osc->oo_reading_exts)) | |
0a3bdb00 | 1714 | return 1; |
d7e09d03 PT |
1715 | } |
1716 | ||
0a3bdb00 | 1717 | return 0; |
d7e09d03 PT |
1718 | } |
1719 | ||
1720 | static void osc_update_pending(struct osc_object *obj, int cmd, int delta) | |
1721 | { | |
1722 | struct client_obd *cli = osc_cli(obj); | |
50ffcb7e | 1723 | |
d7e09d03 PT |
1724 | if (cmd & OBD_BRW_WRITE) { |
1725 | atomic_add(delta, &obj->oo_nr_writes); | |
1726 | atomic_add(delta, &cli->cl_pending_w_pages); | |
1727 | LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); | |
1728 | } else { | |
1729 | atomic_add(delta, &obj->oo_nr_reads); | |
1730 | atomic_add(delta, &cli->cl_pending_r_pages); | |
1731 | LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); | |
1732 | } | |
1733 | OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); | |
1734 | } | |
1735 | ||
1736 | static int osc_makes_hprpc(struct osc_object *obj) | |
1737 | { | |
1738 | return !list_empty(&obj->oo_hp_exts); | |
1739 | } | |
1740 | ||
1741 | static void on_list(struct list_head *item, struct list_head *list, int should_be_on) | |
1742 | { | |
1743 | if (list_empty(item) && should_be_on) | |
1744 | list_add_tail(item, list); | |
1745 | else if (!list_empty(item) && !should_be_on) | |
1746 | list_del_init(item); | |
1747 | } | |
1748 | ||
1749 | /* maintain the osc's cli list membership invariants so that osc_send_oap_rpc | |
30aa9c52 OD |
1750 | * can find pages to build into rpcs quickly |
1751 | */ | |
d7e09d03 PT |
1752 | static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) |
1753 | { | |
1754 | if (osc_makes_hprpc(osc)) { | |
1755 | /* HP rpc */ | |
1756 | on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); | |
1757 | on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); | |
1758 | } else { | |
1759 | on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); | |
1760 | on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, | |
1761 | osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || | |
1762 | osc_makes_rpc(cli, osc, OBD_BRW_READ)); | |
1763 | } | |
1764 | ||
1765 | on_list(&osc->oo_write_item, &cli->cl_loi_write_list, | |
1766 | atomic_read(&osc->oo_nr_writes) > 0); | |
1767 | ||
1768 | on_list(&osc->oo_read_item, &cli->cl_loi_read_list, | |
1769 | atomic_read(&osc->oo_nr_reads) > 0); | |
1770 | ||
1771 | return osc_is_ready(osc); | |
1772 | } | |
1773 | ||
1774 | static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) | |
1775 | { | |
1776 | int is_ready; | |
1777 | ||
1778 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1779 | is_ready = __osc_list_maint(cli, osc); | |
1780 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1781 | ||
1782 | return is_ready; | |
1783 | } | |
1784 | ||
11d66e89 | 1785 | /* this is trying to propagate async writeback errors back up to the |
d7e09d03 PT |
1786 | * application. When an async write fails we record the error code for later | |
1787 | * in case the app does an fsync. As long as errors persist we force future | |
1788 | * rpcs to be sync so that the app can get a sync error and break the cycle | |
30aa9c52 OD |
1789 | * of queueing pages for which writeback will fail. |
1790 | */ | |
d7e09d03 PT |
1791 | static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, |
1792 | int rc) | |
1793 | { | |
1794 | if (rc) { | |
1795 | if (!ar->ar_rc) | |
1796 | ar->ar_rc = rc; | |
1797 | ||
1798 | ar->ar_force_sync = 1; | |
1799 | ar->ar_min_xid = ptlrpc_sample_next_xid(); | |
1800 | return; | |
1801 | ||
1802 | } | |
1803 | ||
1804 | if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) | |
1805 | ar->ar_force_sync = 0; | |
1806 | } | |
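/*
 * Illustrative sketch, not part of the original file: the force-sync
 * cycle encoded above. The xid values are invented for the example; in
 * reality ar_min_xid comes from ptlrpc_sample_next_xid() at failure time.
 */
static void example_ar_cycle(struct osc_async_rc *ar)
{
	osc_process_ar(ar, 100, -ENOSPC);	/* failure: ar_force_sync set,
						 * ar_min_xid sampled (say 101) */
	osc_process_ar(ar, 100, 0);		/* older xid succeeds: stay sync */
	osc_process_ar(ar, 101, 0);		/* xid >= ar_min_xid: sync off */
}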
1807 | ||
d7e09d03 | 1808 | /* this must be called holding the loi list lock to give coverage to exit_cache, |
30aa9c52 OD |
1809 | * async_flag maintenance, and oap_request |
1810 | */ | |
d7e09d03 PT |
1811 | static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, |
1812 | struct osc_async_page *oap, int sent, int rc) | |
1813 | { | |
1814 | struct osc_object *osc = oap->oap_obj; | |
29ac6840 | 1815 | struct lov_oinfo *loi = osc->oo_oinfo; |
d7e09d03 PT |
1816 | __u64 xid = 0; |
1817 | ||
7f1ae4c0 | 1818 | if (oap->oap_request) { |
d7e09d03 PT |
1819 | xid = ptlrpc_req_xid(oap->oap_request); |
1820 | ptlrpc_req_finished(oap->oap_request); | |
1821 | oap->oap_request = NULL; | |
1822 | } | |
1823 | ||
1824 | /* As the transfer for this page is being done, clear the flags */ | |
1825 | spin_lock(&oap->oap_lock); | |
1826 | oap->oap_async_flags = 0; | |
1827 | spin_unlock(&oap->oap_lock); | |
1828 | oap->oap_interrupted = 0; | |
1829 | ||
1830 | if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { | |
1831 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1832 | osc_process_ar(&cli->cl_ar, xid, rc); | |
1833 | osc_process_ar(&loi->loi_ar, xid, rc); | |
1834 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1835 | } | |
1836 | ||
1837 | rc = osc_completion(env, oap, oap->oap_cmd, rc); | |
1838 | if (rc) | |
1839 | CERROR("completion on oap %p obj %p returns %d.\n", | |
1840 | oap, osc, rc); | |
d7e09d03 PT |
1841 | } |
1842 | ||
1843 | /** | |
1844 | * Try to add extent to one RPC. We need to think about the following things: | |
1845 | * - # of pages must not be over max_pages_per_rpc | |
1846 | * - extent must be compatible with previous ones | |
1847 | */ | |
1848 | static int try_to_add_extent_for_io(struct client_obd *cli, | |
1849 | struct osc_extent *ext, struct list_head *rpclist, | |
1850 | int *pc, unsigned int *max_pages) | |
1851 | { | |
1852 | struct osc_extent *tmp; | |
c00266e3 AB |
1853 | struct osc_async_page *oap = list_first_entry(&ext->oe_pages, |
1854 | struct osc_async_page, | |
1855 | oap_pending_item); | |
d7e09d03 PT |
1856 | |
1857 | EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), | |
1858 | ext); | |
1859 | ||
1860 | *max_pages = max(ext->oe_mppr, *max_pages); | |
1861 | if (*pc + ext->oe_nr_pages > *max_pages) | |
0a3bdb00 | 1862 | return 0; |
d7e09d03 PT |
1863 | |
1864 | list_for_each_entry(tmp, rpclist, oe_link) { | |
c00266e3 AB |
1865 | struct osc_async_page *oap2; |
1866 | ||
1867 | oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page, | |
1868 | oap_pending_item); | |
d7e09d03 | 1869 | EASSERT(tmp->oe_owner == current, tmp); |
c00266e3 AB |
1870 | if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) { |
1871 | CDEBUG(D_CACHE, | |
1872 | "Do not permit different types of IO in one RPC\n"); | |
1873 | return 0; | |
1874 | } | |
d7e09d03 PT |
1875 | |
1876 | if (tmp->oe_srvlock != ext->oe_srvlock || | |
1877 | !tmp->oe_grants != !ext->oe_grants) | |
0a3bdb00 | 1878 | return 0; |
d7e09d03 PT |
1879 | |
1880 | /* remove break for strict check */ | |
1881 | break; | |
1882 | } | |
1883 | ||
1884 | *pc += ext->oe_nr_pages; | |
1885 | list_move_tail(&ext->oe_link, rpclist); | |
1886 | ext->oe_owner = current; | |
0a3bdb00 | 1887 | return 1; |
d7e09d03 PT |
1888 | } |
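/*
 * Illustrative sketch, not part of the original file: the compatibility
 * rules above, restated as a standalone predicate over two extents that
 * would share one RPC. It mirrors the checks in try_to_add_extent_for_io();
 * the per-page cp_type check is left out for brevity.
 */
static bool extents_rpc_compatible(const struct osc_extent *a,
				   const struct osc_extent *b)
{
	/* both lockless (srvlock) or both not */
	if (a->oe_srvlock != b->oe_srvlock)
		return false;
	/* both consuming grant, or both grant-free (e.g. sync pages) */
	if (!a->oe_grants != !b->oe_grants)
		return false;
	return true;
}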
1889 | ||
1890 | /** | |
1891 | * In order to prevent multiple ptlrpcd threads from breaking contiguous | |
1892 | * extents, get_write_extents() takes all appropriate extents atomically. | |
1893 | * | |
1894 | * The following policy is used to collect extents for IO: | |
1895 | * 1. Add as many HP extents as possible; | |
1896 | * 2. Add the first urgent extent in the urgent extent list and take it out | |
1897 | * of the urgent list; | |
1898 | * 3. Add subsequent extents of this urgent extent; | |
1899 | * 4. If the urgent list is not empty, goto 2; | |
1900 | * 5. Traverse the extent tree from the 1st extent; | |
1901 | * 6. The above steps stop early once there is no space left in this RPC. | |
1902 | */ | |
1903 | static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) | |
1904 | { | |
1905 | struct client_obd *cli = osc_cli(obj); | |
1906 | struct osc_extent *ext; | |
1907 | int page_count = 0; | |
1908 | unsigned int max_pages = cli->cl_max_pages_per_rpc; | |
1909 | ||
1910 | LASSERT(osc_object_is_locked(obj)); | |
1911 | while (!list_empty(&obj->oo_hp_exts)) { | |
1912 | ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, | |
79910d7d | 1913 | oe_link); |
d7e09d03 PT |
1914 | LASSERT(ext->oe_state == OES_CACHE); |
1915 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1916 | &max_pages)) | |
1917 | return page_count; | |
1918 | EASSERT(ext->oe_nr_pages <= max_pages, ext); | |
1919 | } | |
1920 | if (page_count == max_pages) | |
1921 | return page_count; | |
1922 | ||
1923 | while (!list_empty(&obj->oo_urgent_exts)) { | |
1924 | ext = list_entry(obj->oo_urgent_exts.next, | |
79910d7d | 1925 | struct osc_extent, oe_link); |
d7e09d03 PT |
1926 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, |
1927 | &max_pages)) | |
1928 | return page_count; | |
1929 | ||
1930 | if (!ext->oe_intree) | |
1931 | continue; | |
1932 | ||
1933 | while ((ext = next_extent(ext)) != NULL) { | |
1934 | if ((ext->oe_state != OES_CACHE) || | |
1935 | (!list_empty(&ext->oe_link) && | |
7f1ae4c0 | 1936 | ext->oe_owner)) |
d7e09d03 PT |
1937 | continue; |
1938 | ||
1939 | if (!try_to_add_extent_for_io(cli, ext, rpclist, | |
1940 | &page_count, &max_pages)) | |
1941 | return page_count; | |
1942 | } | |
1943 | } | |
1944 | if (page_count == max_pages) | |
1945 | return page_count; | |
1946 | ||
1947 | ext = first_extent(obj); | |
7f1ae4c0 | 1948 | while (ext) { |
d7e09d03 PT |
1949 | if ((ext->oe_state != OES_CACHE) || |
1950 | /* this extent may already be in the current rpclist */ | |
7f1ae4c0 | 1951 | (!list_empty(&ext->oe_link) && ext->oe_owner)) { |
d7e09d03 PT |
1952 | ext = next_extent(ext); |
1953 | continue; | |
1954 | } | |
1955 | ||
1956 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1957 | &max_pages)) | |
1958 | return page_count; | |
1959 | ||
1960 | ext = next_extent(ext); | |
1961 | } | |
1962 | return page_count; | |
1963 | } | |
1964 | ||
1965 | static int | |
1966 | osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, | |
c5c4c6fa | 1967 | struct osc_object *osc) |
a161de86 | 1968 | __must_hold(osc) |
d7e09d03 PT |
1969 | { |
1970 | LIST_HEAD(rpclist); | |
1971 | struct osc_extent *ext; | |
1972 | struct osc_extent *tmp; | |
1973 | struct osc_extent *first = NULL; | |
21aef7d9 | 1974 | u32 page_count = 0; |
d7e09d03 PT |
1975 | int srvlock = 0; |
1976 | int rc = 0; | |
d7e09d03 PT |
1977 | |
1978 | LASSERT(osc_object_is_locked(osc)); | |
1979 | ||
1980 | page_count = get_write_extents(osc, &rpclist); | |
1981 | LASSERT(equi(page_count == 0, list_empty(&rpclist))); | |
1982 | ||
1983 | if (list_empty(&rpclist)) | |
0a3bdb00 | 1984 | return 0; |
d7e09d03 PT |
1985 | |
1986 | osc_update_pending(osc, OBD_BRW_WRITE, -page_count); | |
1987 | ||
1988 | list_for_each_entry(ext, &rpclist, oe_link) { | |
1989 | LASSERT(ext->oe_state == OES_CACHE || | |
1990 | ext->oe_state == OES_LOCK_DONE); | |
1991 | if (ext->oe_state == OES_CACHE) | |
1992 | osc_extent_state_set(ext, OES_LOCKING); | |
1993 | else | |
1994 | osc_extent_state_set(ext, OES_RPC); | |
1995 | } | |
1996 | ||
1997 | /* we're going to grab page lock, so release object lock because | |
30aa9c52 OD |
1998 | * lock order is page lock -> object lock. |
1999 | */ | |
d7e09d03 PT |
2000 | osc_object_unlock(osc); |
2001 | ||
2002 | list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { | |
2003 | if (ext->oe_state == OES_LOCKING) { | |
2004 | rc = osc_extent_make_ready(env, ext); | |
2005 | if (unlikely(rc < 0)) { | |
2006 | list_del_init(&ext->oe_link); | |
2007 | osc_extent_finish(env, ext, 0, rc); | |
2008 | continue; | |
2009 | } | |
2010 | } | |
7f1ae4c0 | 2011 | if (!first) { |
d7e09d03 PT |
2012 | first = ext; |
2013 | srvlock = ext->oe_srvlock; | |
2014 | } else { | |
2015 | LASSERT(srvlock == ext->oe_srvlock); | |
2016 | } | |
2017 | } | |
2018 | ||
2019 | if (!list_empty(&rpclist)) { | |
2020 | LASSERT(page_count > 0); | |
c5c4c6fa | 2021 | rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE); |
d7e09d03 PT |
2022 | LASSERT(list_empty(&rpclist)); |
2023 | } | |
2024 | ||
2025 | osc_object_lock(osc); | |
0a3bdb00 | 2026 | return rc; |
d7e09d03 PT |
2027 | } |
2028 | ||
2029 | /** | |
2030 | * prepare pages for ASYNC io and put pages in send queue. | |
2031 | * | |
2032 | * \param cmd OBD_BRW_* macros | |
2033 | * \param lop pending pages | |
2034 | * | |
2035 | * \return zero if no pages were added to the send queue. | |
2036 | * \return 1 if pages were successfully added to the send queue. | |
2037 | * \return negative on errors. | |
2038 | */ | |
2039 | static int | |
2040 | osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, | |
c5c4c6fa | 2041 | struct osc_object *osc) |
a161de86 | 2042 | __must_hold(osc) |
d7e09d03 PT |
2043 | { |
2044 | struct osc_extent *ext; | |
2045 | struct osc_extent *next; | |
2046 | LIST_HEAD(rpclist); | |
2047 | int page_count = 0; | |
2048 | unsigned int max_pages = cli->cl_max_pages_per_rpc; | |
2049 | int rc = 0; | |
d7e09d03 PT |
2050 | |
2051 | LASSERT(osc_object_is_locked(osc)); | |
79910d7d | 2052 | list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) { |
d7e09d03 PT |
2053 | EASSERT(ext->oe_state == OES_LOCK_DONE, ext); |
2054 | if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count, | |
2055 | &max_pages)) | |
2056 | break; | |
2057 | osc_extent_state_set(ext, OES_RPC); | |
2058 | EASSERT(ext->oe_nr_pages <= max_pages, ext); | |
2059 | } | |
2060 | LASSERT(page_count <= max_pages); | |
2061 | ||
2062 | osc_update_pending(osc, OBD_BRW_READ, -page_count); | |
2063 | ||
2064 | if (!list_empty(&rpclist)) { | |
2065 | osc_object_unlock(osc); | |
2066 | ||
2067 | LASSERT(page_count > 0); | |
c5c4c6fa | 2068 | rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ); |
d7e09d03 PT |
2069 | LASSERT(list_empty(&rpclist)); |
2070 | ||
2071 | osc_object_lock(osc); | |
2072 | } | |
0a3bdb00 | 2073 | return rc; |
d7e09d03 PT |
2074 | } |
2075 | ||
2076 | #define list_to_obj(list, item) ({ \ | |
2077 | struct list_head *__tmp = (list)->next; \ | |
29ac6840 | 2078 | list_del_init(__tmp); \ |
d7e09d03 PT |
2079 | list_entry(__tmp, struct osc_object, oo_##item); \ |
2080 | }) | |
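/*
 * Illustrative usage, not part of the original file: list_to_obj() is a
 * GNU statement expression that pops the first node off a list and maps
 * it back to its owning osc_object by member name, e.g.:
 *
 *	struct osc_object *osc =
 *		list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item);
 *
 * which expands to list_del_init() on the list's first node followed by
 * list_entry(node, struct osc_object, oo_hp_ready_item).
 */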
2081 | ||
2082 | /* This is called by osc_check_rpcs() to find which objects have pages that | |
30aa9c52 OD |
2083 | * we could be sending. These lists are maintained by osc_makes_rpc(). |
2084 | */ | |
d7e09d03 PT |
2085 | static struct osc_object *osc_next_obj(struct client_obd *cli) |
2086 | { | |
d7e09d03 PT |
2087 | /* First return objects that have blocked locks so that they |
2088 | * will be flushed quickly and other clients can get the lock, | |
30aa9c52 OD |
2089 | * then objects which have pages ready to be stuffed into RPCs |
2090 | */ | |
d7e09d03 | 2091 | if (!list_empty(&cli->cl_loi_hp_ready_list)) |
0a3bdb00 | 2092 | return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item); |
d7e09d03 | 2093 | if (!list_empty(&cli->cl_loi_ready_list)) |
0a3bdb00 | 2094 | return list_to_obj(&cli->cl_loi_ready_list, ready_item); |
d7e09d03 PT |
2095 | |
2096 | /* then if we have cache waiters, return all objects with queued | |
2097 | * writes. This is especially important when many small files | |
2098 | * have filled up the cache and not been fired into rpcs because | |
30aa9c52 OD |
2099 | * they don't pass the nr_pending/object threshold |
2100 | */ | |
d7e09d03 PT |
2101 | if (!list_empty(&cli->cl_cache_waiters) && |
2102 | !list_empty(&cli->cl_loi_write_list)) | |
0a3bdb00 | 2103 | return list_to_obj(&cli->cl_loi_write_list, write_item); |
d7e09d03 PT |
2104 | |
2105 | /* then return all queued objects when we have an invalid import | |
30aa9c52 OD |
2106 | * so that they get flushed |
2107 | */ | |
7f1ae4c0 | 2108 | if (!cli->cl_import || cli->cl_import->imp_invalid) { |
d7e09d03 | 2109 | if (!list_empty(&cli->cl_loi_write_list)) |
0a3bdb00 | 2110 | return list_to_obj(&cli->cl_loi_write_list, write_item); |
d7e09d03 | 2111 | if (!list_empty(&cli->cl_loi_read_list)) |
0a3bdb00 | 2112 | return list_to_obj(&cli->cl_loi_read_list, read_item); |
d7e09d03 | 2113 | } |
0a3bdb00 | 2114 | return NULL; |
d7e09d03 PT |
2115 | } |
2116 | ||
2117 | /* called with the loi list lock held */ | |
c5c4c6fa | 2118 | static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) |
a161de86 | 2119 | __must_hold(&cli->cl_loi_list_lock) |
d7e09d03 PT |
2120 | { |
2121 | struct osc_object *osc; | |
2122 | int rc = 0; | |
d7e09d03 PT |
2123 | |
2124 | while ((osc = osc_next_obj(cli)) != NULL) { | |
2125 | struct cl_object *obj = osc2cl(osc); | |
631abc6e | 2126 | struct lu_ref_link link; |
d7e09d03 PT |
2127 | |
2128 | OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); | |
2129 | ||
2130 | if (osc_max_rpc_in_flight(cli, osc)) { | |
2131 | __osc_list_maint(cli, osc); | |
2132 | break; | |
2133 | } | |
2134 | ||
2135 | cl_object_get(obj); | |
2136 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
631abc6e JH |
2137 | lu_object_ref_add_at(&obj->co_lu, &link, "check", |
2138 | current); | |
d7e09d03 PT |
2139 | |
2140 | /* attempt some read/write balancing by alternating between | |
2141 | * reads and writes in an object. The makes_rpc checks here | |
2142 | * would be redundant if we were getting read/write work items | |
2143 | * instead of objects. We don't want send_oap_rpc to drain a | |
2144 | * partial read pending queue when we're given this object to | |
30aa9c52 OD |
2145 | * do write io while there are cache waiters |
2146 | */ | |
d7e09d03 PT |
2147 | osc_object_lock(osc); |
2148 | if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { | |
c5c4c6fa | 2149 | rc = osc_send_write_rpc(env, cli, osc); |
d7e09d03 PT |
2150 | if (rc < 0) { |
2151 | CERROR("Write request failed with %d\n", rc); | |
2152 | ||
2153 | /* osc_send_write_rpc failed, mostly because of | |
2154 | * memory pressure. | |
2155 | * | |
2156 | * We can't break here, because if: | |
2157 | * - a page was submitted by osc_io_submit, so | |
2158 | * the page is locked; | |
2159 | * - no request is in flight; and | |
2160 | * - there is no subsequent request, | |
2161 | * then the system will be in a live-lock state, | |
2162 | * because there is no chance to call | |
2163 | * osc_io_unplug() and osc_check_rpcs() any | |
2164 | * more. pdflush can't help in this case, | |
2165 | * because it might be blocked at grabbing | |
2166 | * the page lock as we mentioned. | |
2167 | * | |
30aa9c52 OD |
2168 | * Anyway, continue to drain pages. |
2169 | */ | |
d7e09d03 PT |
2170 | /* break; */ |
2171 | } | |
2172 | } | |
2173 | if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { | |
c5c4c6fa | 2174 | rc = osc_send_read_rpc(env, cli, osc); |
d7e09d03 PT |
2175 | if (rc < 0) |
2176 | CERROR("Read request failed with %d\n", rc); | |
2177 | } | |
2178 | osc_object_unlock(osc); | |
2179 | ||
2180 | osc_list_maint(cli, osc); | |
631abc6e JH |
2181 | lu_object_ref_del_at(&obj->co_lu, &link, "check", |
2182 | current); | |
d7e09d03 PT |
2183 | cl_object_put(env, obj); |
2184 | ||
2185 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2186 | } | |
2187 | } | |
2188 | ||
2189 | static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, | |
c5c4c6fa | 2190 | struct osc_object *osc, int async) |
d7e09d03 | 2191 | { |
d7e09d03 PT |
2192 | int rc = 0; |
2193 | ||
7f1ae4c0 | 2194 | if (osc && osc_list_maint(cli, osc) == 0) |
cad6fafa BJ |
2195 | return 0; |
2196 | ||
2197 | if (!async) { | |
2198 | /* disable osc_lru_shrink() temporarily to avoid | |
30aa9c52 OD |
2199 | * potential stack overrun problem. LU-2859 |
2200 | */ | |
cad6fafa BJ |
2201 | atomic_inc(&cli->cl_lru_shrinkers); |
2202 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
c5c4c6fa | 2203 | osc_check_rpcs(env, cli); |
cad6fafa BJ |
2204 | client_obd_list_unlock(&cli->cl_loi_list_lock); |
2205 | atomic_dec(&cli->cl_lru_shrinkers); | |
2206 | } else { | |
2207 | CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); | |
7f1ae4c0 | 2208 | LASSERT(cli->cl_writeback_work); |
cad6fafa | 2209 | rc = ptlrpcd_queue_work(cli->cl_writeback_work); |
d7e09d03 | 2210 | } |
d7e09d03 PT |
2211 | return rc; |
2212 | } | |
2213 | ||
2214 | static int osc_io_unplug_async(const struct lu_env *env, | |
29ac6840 | 2215 | struct client_obd *cli, struct osc_object *osc) |
d7e09d03 | 2216 | { |
c5c4c6fa | 2217 | return osc_io_unplug0(env, cli, osc, 1); |
d7e09d03 PT |
2218 | } |
2219 | ||
2220 | void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, | |
c5c4c6fa | 2221 | struct osc_object *osc) |
d7e09d03 | 2222 | { |
c5c4c6fa | 2223 | (void)osc_io_unplug0(env, cli, osc, 0); |
d7e09d03 PT |
2224 | } |
2225 | ||
2226 | int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, | |
2227 | struct page *page, loff_t offset) | |
2228 | { | |
29ac6840 | 2229 | struct obd_export *exp = osc_export(osc); |
d7e09d03 | 2230 | struct osc_async_page *oap = &ops->ops_oap; |
d7e09d03 PT |
2231 | |
2232 | if (!page) | |
2233 | return cfs_size_round(sizeof(*oap)); | |
2234 | ||
2235 | oap->oap_magic = OAP_MAGIC; | |
2236 | oap->oap_cli = &exp->exp_obd->u.cli; | |
2237 | oap->oap_obj = osc; | |
2238 | ||
2239 | oap->oap_page = page; | |
2240 | oap->oap_obj_off = offset; | |
2241 | LASSERT(!(offset & ~CFS_PAGE_MASK)); | |
2242 | ||
2eb90a75 | 2243 | if (!client_is_remote(exp) && capable(CFS_CAP_SYS_RESOURCE)) |
d7e09d03 PT |
2244 | oap->oap_brw_flags = OBD_BRW_NOQUOTA; |
2245 | ||
2246 | INIT_LIST_HEAD(&oap->oap_pending_item); | |
2247 | INIT_LIST_HEAD(&oap->oap_rpc_item); | |
2248 | ||
2249 | spin_lock_init(&oap->oap_lock); | |
b0f5aad5 | 2250 | CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", |
d7e09d03 | 2251 | oap, page, oap->oap_obj_off); |
0a3bdb00 | 2252 | return 0; |
d7e09d03 PT |
2253 | } |
2254 | ||
2255 | int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, | |
2256 | struct osc_page *ops) | |
2257 | { | |
2258 | struct osc_io *oio = osc_env_io(env); | |
29ac6840 | 2259 | struct osc_extent *ext = NULL; |
d7e09d03 | 2260 | struct osc_async_page *oap = &ops->ops_oap; |
29ac6840 CH |
2261 | struct client_obd *cli = oap->oap_cli; |
2262 | struct osc_object *osc = oap->oap_obj; | |
d7e09d03 | 2263 | pgoff_t index; |
29ac6840 CH |
2264 | int grants = 0; |
2265 | int brw_flags = OBD_BRW_ASYNC; | |
2266 | int cmd = OBD_BRW_WRITE; | |
2267 | int need_release = 0; | |
2268 | int rc = 0; | |
d7e09d03 PT |
2269 | |
2270 | if (oap->oap_magic != OAP_MAGIC) | |
0a3bdb00 | 2271 | return -EINVAL; |
d7e09d03 | 2272 | |
7f1ae4c0 | 2273 | if (!cli->cl_import || cli->cl_import->imp_invalid) |
0a3bdb00 | 2274 | return -EIO; |
d7e09d03 PT |
2275 | |
2276 | if (!list_empty(&oap->oap_pending_item) || | |
2277 | !list_empty(&oap->oap_rpc_item)) | |
0a3bdb00 | 2278 | return -EBUSY; |
d7e09d03 PT |
2279 | |
2280 | /* Set the OBD_BRW_SRVLOCK before the page is queued. */ | |
2281 | brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; | |
2282 | if (!client_is_remote(osc_export(osc)) && | |
2eb90a75 | 2283 | capable(CFS_CAP_SYS_RESOURCE)) { |
d7e09d03 PT |
2284 | brw_flags |= OBD_BRW_NOQUOTA; |
2285 | cmd |= OBD_BRW_NOQUOTA; | |
2286 | } | |
2287 | ||
2288 | /* check if the file's owner/group is over quota */ | |
2289 | if (!(cmd & OBD_BRW_NOQUOTA)) { | |
2290 | struct cl_object *obj; | |
29ac6840 | 2291 | struct cl_attr *attr; |
d7e09d03 PT |
2292 | unsigned int qid[MAXQUOTAS]; |
2293 | ||
2294 | obj = cl_object_top(&osc->oo_cl); | |
2295 | attr = &osc_env_info(env)->oti_attr; | |
2296 | ||
2297 | cl_object_attr_lock(obj); | |
2298 | rc = cl_object_attr_get(env, obj, attr); | |
2299 | cl_object_attr_unlock(obj); | |
2300 | ||
2301 | qid[USRQUOTA] = attr->cat_uid; | |
2302 | qid[GRPQUOTA] = attr->cat_gid; | |
2303 | if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) | |
2304 | rc = -EDQUOT; | |
2305 | if (rc) | |
0a3bdb00 | 2306 | return rc; |
d7e09d03 PT |
2307 | } |
2308 | ||
2309 | oap->oap_cmd = cmd; | |
2310 | oap->oap_page_off = ops->ops_from; | |
2311 | oap->oap_count = ops->ops_to - ops->ops_from; | |
2312 | oap->oap_async_flags = 0; | |
2313 | oap->oap_brw_flags = brw_flags; | |
2314 | ||
2315 | OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", | |
2316 | oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); | |
2317 | ||
2318 | index = oap2cl_page(oap)->cp_index; | |
2319 | ||
2320 | /* Add this page into an extent by the following steps: | |
2321 | * 1. if there exists an active extent for this IO, most likely this page | |
2322 | * can be added to the active extent, though sometimes we need to | |
11d66e89 | 2323 | * expand the extent to accommodate this page; |
30aa9c52 OD |
2324 | * 2. otherwise, a new extent will be allocated. |
2325 | */ | |
d7e09d03 PT |
2326 | |
2327 | ext = oio->oi_active; | |
7f1ae4c0 | 2328 | if (ext && ext->oe_start <= index && ext->oe_max_end >= index) { |
d7e09d03 | 2329 | /* one chunk plus extent overhead must be enough to write this |
30aa9c52 OD |
2330 | * page |
2331 | */ | |
d7e09d03 PT |
2332 | grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; |
2333 | if (ext->oe_end >= index) | |
2334 | grants = 0; | |
2335 | ||
2336 | /* it doesn't need any grant to dirty this page */ | |
2337 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2338 | rc = osc_enter_cache_try(cli, oap, grants, 0); | |
2339 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
2340 | if (rc == 0) { /* try failed */ | |
2341 | grants = 0; | |
2342 | need_release = 1; | |
2343 | } else if (ext->oe_end < index) { | |
2344 | int tmp = grants; | |
2345 | /* try to expand this extent */ | |
2346 | rc = osc_extent_expand(ext, index, &tmp); | |
2347 | if (rc < 0) { | |
2348 | need_release = 1; | |
2349 | /* don't free reserved grant */ | |
2350 | } else { | |
2351 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2352 | "expanded for %lu.\n", index); | |
2353 | osc_unreserve_grant(cli, grants, tmp); | |
2354 | grants = 0; | |
2355 | } | |
2356 | } | |
2357 | rc = 0; | |
7f1ae4c0 | 2358 | } else if (ext) { |
d7e09d03 PT |
2359 | /* index is located outside of active extent */ |
2360 | need_release = 1; | |
2361 | } | |
2362 | if (need_release) { | |
2363 | osc_extent_release(env, ext); | |
2364 | oio->oi_active = NULL; | |
2365 | ext = NULL; | |
2366 | } | |
2367 | ||
7f1ae4c0 | 2368 | if (!ext) { |
d7e09d03 PT |
2369 | int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; |
2370 | ||
2371 | /* try to find new extent to cover this page */ | |
7f1ae4c0 | 2372 | LASSERT(!oio->oi_active); |
d7e09d03 | 2373 | /* we may have allocated grant for this page if we failed |
30aa9c52 OD |
2374 | * to expand the previous active extent. |
2375 | */ | |
d7e09d03 PT |
2376 | LASSERT(ergo(grants > 0, grants >= tmp)); |
2377 | ||
2378 | rc = 0; | |
2379 | if (grants == 0) { | |
2380 | /* we haven't allocated grant for this page. */ | |
2381 | rc = osc_enter_cache(env, cli, oap, tmp); | |
2382 | if (rc == 0) | |
2383 | grants = tmp; | |
2384 | } | |
2385 | ||
2386 | tmp = grants; | |
2387 | if (rc == 0) { | |
2388 | ext = osc_extent_find(env, osc, index, &tmp); | |
2389 | if (IS_ERR(ext)) { | |
2390 | LASSERT(tmp == grants); | |
2391 | osc_exit_cache(cli, oap); | |
2392 | rc = PTR_ERR(ext); | |
2393 | ext = NULL; | |
2394 | } else { | |
2395 | oio->oi_active = ext; | |
2396 | } | |
2397 | } | |
2398 | if (grants > 0) | |
2399 | osc_unreserve_grant(cli, grants, tmp); | |
2400 | } | |
2401 | ||
7f1ae4c0 OD |
2402 | LASSERT(ergo(rc == 0, ext)); |
2403 | if (ext) { | |
d7e09d03 PT |
2404 | EASSERTF(ext->oe_end >= index && ext->oe_start <= index, |
2405 | ext, "index = %lu.\n", index); | |
2406 | LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); | |
2407 | ||
2408 | osc_object_lock(osc); | |
2409 | if (ext->oe_nr_pages == 0) | |
2410 | ext->oe_srvlock = ops->ops_srvlock; | |
2411 | else | |
2412 | LASSERT(ext->oe_srvlock == ops->ops_srvlock); | |
2413 | ++ext->oe_nr_pages; | |
2414 | list_add_tail(&oap->oap_pending_item, &ext->oe_pages); | |
2415 | osc_object_unlock(osc); | |
2416 | } | |
0a3bdb00 | 2417 | return rc; |
d7e09d03 PT |
2418 | } |
2419 | ||
2420 | int osc_teardown_async_page(const struct lu_env *env, | |
2421 | struct osc_object *obj, struct osc_page *ops) | |
2422 | { | |
2423 | struct osc_async_page *oap = &ops->ops_oap; | |
29ac6840 | 2424 | struct osc_extent *ext = NULL; |
d7e09d03 | 2425 | int rc = 0; |
d7e09d03 PT |
2426 | |
2427 | LASSERT(oap->oap_magic == OAP_MAGIC); | |
2428 | ||
2429 | CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", | |
2430 | oap, ops, oap2cl_page(oap)->cp_index); | |
2431 | ||
2432 | osc_object_lock(obj); | |
2433 | if (!list_empty(&oap->oap_rpc_item)) { | |
2434 | CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); | |
2435 | rc = -EBUSY; | |
2436 | } else if (!list_empty(&oap->oap_pending_item)) { | |
2437 | ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); | |
2438 | /* only truncated pages are allowed to be taken out. | |
2439 | * See osc_extent_truncate() and osc_cache_truncate_start() | |
30aa9c52 OD |
2440 | * for details. |
2441 | */ | |
7f1ae4c0 | 2442 | if (ext && ext->oe_state != OES_TRUNC) { |
d7e09d03 PT |
2443 | OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", |
2444 | oap2cl_page(oap)->cp_index); | |
2445 | rc = -EBUSY; | |
2446 | } | |
2447 | } | |
2448 | osc_object_unlock(obj); | |
7f1ae4c0 | 2449 | if (ext) |
d7e09d03 | 2450 | osc_extent_put(env, ext); |
0a3bdb00 | 2451 | return rc; |
d7e09d03 PT |
2452 | } |
2453 | ||
2454 | /** | |
2455 | * This is called when a page is picked up by kernel to write out. | |
2456 | * | |
2457 | * We should find out the corresponding extent and add the whole extent | |
2458 | * into urgent list. The extent may be being truncated or used, handle it | |
2459 | * carefully. | |
2460 | */ | |
2461 | int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, | |
2462 | struct osc_page *ops) | |
2463 | { | |
29ac6840 CH |
2464 | struct osc_extent *ext = NULL; |
2465 | struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); | |
2466 | struct cl_page *cp = ops->ops_cl.cpl_page; | |
2467 | pgoff_t index = cp->cp_index; | |
d7e09d03 PT |
2468 | struct osc_async_page *oap = &ops->ops_oap; |
2469 | bool unplug = false; | |
2470 | int rc = 0; | |
d7e09d03 PT |
2471 | |
2472 | osc_object_lock(obj); | |
2473 | ext = osc_extent_lookup(obj, index); | |
7f1ae4c0 | 2474 | if (!ext) { |
d7e09d03 PT |
2475 | osc_extent_tree_dump(D_ERROR, obj); |
2476 | LASSERTF(0, "page index %lu is NOT covered.\n", index); | |
2477 | } | |
2478 | ||
2479 | switch (ext->oe_state) { | |
2480 | case OES_RPC: | |
2481 | case OES_LOCK_DONE: | |
2482 | CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), | |
2483 | "flush an in-rpc page?\n"); | |
2484 | LASSERT(0); | |
2485 | break; | |
2486 | case OES_LOCKING: | |
2487 | /* If we know this extent is being written out, we should abort | |
2488 | * so that the writer can make this page ready. Otherwise, there | |
2489 | * exists a deadlock problem because another process can wait for | |
2490 | * the page writeback bit while holding the page lock; meanwhile in | |
2491 | * vvp_page_make_ready(), we need to grab page lock before | |
30aa9c52 OD |
2492 | * really sending the RPC. |
2493 | */ | |
d7e09d03 PT |
2494 | case OES_TRUNC: |
2495 | /* race with truncate, page will be redirtied */ | |
15f13cde AK |
2496 | case OES_ACTIVE: |
2497 | /* The extent is active so we need to abort and let the caller | |
2498 | * re-dirty the page. If we continued on here, and we were the | |
2499 | * one making the extent active, we could deadlock waiting for | |
2500 | * the page writeback to clear but it won't because the extent | |
30aa9c52 OD |
2501 | * is active and won't be written out. |
2502 | */ | |
490e0e89 JL |
2503 | rc = -EAGAIN; |
2504 | goto out; | |
d7e09d03 PT |
2505 | default: |
2506 | break; | |
2507 | } | |
2508 | ||
2509 | rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); | |
2510 | if (rc) | |
490e0e89 | 2511 | goto out; |
d7e09d03 PT |
2512 | |
2513 | spin_lock(&oap->oap_lock); | |
2514 | oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; | |
2515 | spin_unlock(&oap->oap_lock); | |
2516 | ||
2517 | if (memory_pressure_get()) | |
2518 | ext->oe_memalloc = 1; | |
2519 | ||
2520 | ext->oe_urgent = 1; | |
2521 | if (ext->oe_state == OES_CACHE) { | |
2522 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2523 | "flush page %p make it urgent.\n", oap); | |
2524 | if (list_empty(&ext->oe_link)) | |
2525 | list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2526 | unplug = true; | |
2527 | } | |
2528 | rc = 0; | |
d7e09d03 PT |
2529 | |
2530 | out: | |
2531 | osc_object_unlock(obj); | |
2532 | osc_extent_put(env, ext); | |
2533 | if (unplug) | |
2534 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
2535 | return rc; | |
2536 | } | |
2537 | ||
2538 | /** | |
2539 | * this is called when a sync waiter receives an interruption. Its job is to | |
2540 | * get the caller woken as soon as possible. If its page hasn't been put in an | |
2541 | * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as | |
2542 | * desiring interruption which will forcefully complete the rpc once the rpc | |
2543 | * has timed out. | |
2544 | */ | |
2545 | int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) | |
2546 | { | |
2547 | struct osc_async_page *oap = &ops->ops_oap; | |
29ac6840 CH |
2548 | struct osc_object *obj = oap->oap_obj; |
2549 | struct client_obd *cli = osc_cli(obj); | |
2550 | struct osc_extent *ext; | |
2551 | struct osc_extent *found = NULL; | |
2552 | struct list_head *plist; | |
d7e09d03 | 2553 | pgoff_t index = oap2cl_page(oap)->cp_index; |
29ac6840 CH |
2554 | int rc = -EBUSY; |
2555 | int cmd; | |
d7e09d03 PT |
2556 | |
2557 | LASSERT(!oap->oap_interrupted); | |
2558 | oap->oap_interrupted = 1; | |
2559 | ||
2560 | /* Find out the caching extent */ | |
2561 | osc_object_lock(obj); | |
2562 | if (oap->oap_cmd & OBD_BRW_WRITE) { | |
2563 | plist = &obj->oo_urgent_exts; | |
29ac6840 | 2564 | cmd = OBD_BRW_WRITE; |
d7e09d03 PT |
2565 | } else { |
2566 | plist = &obj->oo_reading_exts; | |
29ac6840 | 2567 | cmd = OBD_BRW_READ; |
d7e09d03 PT |
2568 | } |
2569 | list_for_each_entry(ext, plist, oe_link) { | |
2570 | if (ext->oe_start <= index && ext->oe_end >= index) { | |
2571 | LASSERT(ext->oe_state == OES_LOCK_DONE); | |
2572 | /* For OES_LOCK_DONE state extent, it has already held | |
30aa9c52 OD |
2573 | * a refcount for RPC. |
2574 | */ | |
d7e09d03 PT |
2575 | found = osc_extent_get(ext); |
2576 | break; | |
2577 | } | |
2578 | } | |
7f1ae4c0 | 2579 | if (found) { |
d7e09d03 PT |
2580 | list_del_init(&found->oe_link); |
2581 | osc_update_pending(obj, cmd, -found->oe_nr_pages); | |
2582 | osc_object_unlock(obj); | |
2583 | ||
2584 | osc_extent_finish(env, found, 0, -EINTR); | |
2585 | osc_extent_put(env, found); | |
2586 | rc = 0; | |
2587 | } else { | |
2588 | osc_object_unlock(obj); | |
2589 | /* ok, it's been put in an rpc. only one oap gets a request | |
30aa9c52 OD |
2590 | * reference |
2591 | */ | |
7f1ae4c0 | 2592 | if (oap->oap_request) { |
d7e09d03 PT |
2593 | ptlrpc_mark_interrupted(oap->oap_request); |
2594 | ptlrpcd_wake(oap->oap_request); | |
2595 | ptlrpc_req_finished(oap->oap_request); | |
2596 | oap->oap_request = NULL; | |
2597 | } | |
2598 | } | |
2599 | ||
2600 | osc_list_maint(cli, obj); | |
0a3bdb00 | 2601 | return rc; |
d7e09d03 PT |
2602 | } |
2603 | ||
2604 | int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, | |
2605 | struct list_head *list, int cmd, int brw_flags) | |
2606 | { | |
29ac6840 CH |
2607 | struct client_obd *cli = osc_cli(obj); |
2608 | struct osc_extent *ext; | |
f13ab92e | 2609 | struct osc_async_page *oap, *tmp; |
29ac6840 CH |
2610 | int page_count = 0; |
2611 | int mppr = cli->cl_max_pages_per_rpc; | |
2612 | pgoff_t start = CL_PAGE_EOF; | |
2613 | pgoff_t end = 0; | |
d7e09d03 PT |
2614 | |
2615 | list_for_each_entry(oap, list, oap_pending_item) { | |
2616 | struct cl_page *cp = oap2cl_page(oap); | |
50ffcb7e | 2617 | |
d7e09d03 PT |
2618 | if (cp->cp_index > end) |
2619 | end = cp->cp_index; | |
2620 | if (cp->cp_index < start) | |
2621 | start = cp->cp_index; | |
2622 | ++page_count; | |
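/* grow the max-pages-per-rpc estimate: the shift amount is 1 only while
 * page_count has outgrown mppr, so mppr is doubled until it covers the
 * whole list (rounding up in powers of two from cl_max_pages_per_rpc) */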
2623 | mppr <<= (page_count > mppr); | |
2624 | } | |
2625 | ||
2626 | ext = osc_extent_alloc(obj); | |
7f1ae4c0 | 2627 | if (!ext) { |
f13ab92e | 2628 | list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { |
d7e09d03 PT |
2629 | list_del_init(&oap->oap_pending_item); |
2630 | osc_ap_completion(env, cli, oap, 0, -ENOMEM); | |
2631 | } | |
0a3bdb00 | 2632 | return -ENOMEM; |
d7e09d03 PT |
2633 | } |
2634 | ||
2635 | ext->oe_rw = !!(cmd & OBD_BRW_READ); | |
2636 | ext->oe_urgent = 1; | |
2637 | ext->oe_start = start; | |
2638 | ext->oe_end = ext->oe_max_end = end; | |
2639 | ext->oe_obj = obj; | |
2640 | ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); | |
2641 | ext->oe_nr_pages = page_count; | |
2642 | ext->oe_mppr = mppr; | |
2643 | list_splice_init(list, &ext->oe_pages); | |
2644 | ||
2645 | osc_object_lock(obj); | |
2646 | /* Reuse the initial refcount for RPC, don't drop it */ | |
2647 | osc_extent_state_set(ext, OES_LOCK_DONE); | |
2648 | if (cmd & OBD_BRW_WRITE) { | |
2649 | list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2650 | osc_update_pending(obj, OBD_BRW_WRITE, page_count); | |
2651 | } else { | |
2652 | list_add_tail(&ext->oe_link, &obj->oo_reading_exts); | |
2653 | osc_update_pending(obj, OBD_BRW_READ, page_count); | |
2654 | } | |
2655 | osc_object_unlock(obj); | |
2656 | ||
c61ac979 | 2657 | osc_io_unplug_async(env, cli, obj); |
0a3bdb00 | 2658 | return 0; |
d7e09d03 PT |
2659 | } |
2660 | ||
2661 | /** | |
2662 | * Called by osc_io_setattr_start() to freeze and destroy covering extents. | |
2663 | */ | |
2664 | int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, | |
2665 | struct osc_object *obj, __u64 size) | |
2666 | { | |
2667 | struct client_obd *cli = osc_cli(obj); | |
2668 | struct osc_extent *ext; | |
2669 | struct osc_extent *waiting = NULL; | |
2670 | pgoff_t index; | |
2671 | LIST_HEAD(list); | |
2672 | int result = 0; | |
2673 | bool partial; | |
d7e09d03 PT |
2674 | |
2675 | /* pages with index greater or equal to index will be truncated. */ | |
2676 | index = cl_index(osc2cl(obj), size); | |
2677 | partial = size > cl_offset(osc2cl(obj), index); | |
2678 | ||
2679 | again: | |
2680 | osc_object_lock(obj); | |
2681 | ext = osc_extent_search(obj, index); | |
7f1ae4c0 | 2682 | if (!ext) |
d7e09d03 PT |
2683 | ext = first_extent(obj); |
2684 | else if (ext->oe_end < index) | |
2685 | ext = next_extent(ext); | |
7f1ae4c0 | 2686 | while (ext) { |
d7e09d03 PT |
2687 | EASSERT(ext->oe_state != OES_TRUNC, ext); |
2688 | ||
2689 | if (ext->oe_state > OES_CACHE || ext->oe_urgent) { | |
2690 | /* if ext is in urgent state, it means there must exist | |
2691 | * a page already having been flushed by write_page(). | |
2692 | * We have to wait for this extent because we can't | |
30aa9c52 OD |
2693 | * truncate that page. |
2694 | */ | |
d7e09d03 PT |
2695 | LASSERT(!ext->oe_hp); |
2696 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2697 | "waiting for busy extent\n"); | |
2698 | waiting = osc_extent_get(ext); | |
2699 | break; | |
2700 | } | |
2701 | ||
b0f5aad5 | 2702 | OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); |
d7e09d03 PT |
2703 | |
2704 | osc_extent_get(ext); | |
2705 | if (ext->oe_state == OES_ACTIVE) { | |
2706 | /* though we grab inode mutex for write path, but we | |
2707 | * release it before releasing extent(in osc_io_end()), | |
2708 | * so there is a race window that an extent is still | |
30aa9c52 OD |
2709 | * in OES_ACTIVE when truncate starts. |
2710 | */ | |
d7e09d03 PT |
2711 | LASSERT(!ext->oe_trunc_pending); |
2712 | ext->oe_trunc_pending = 1; | |
2713 | } else { | |
2714 | EASSERT(ext->oe_state == OES_CACHE, ext); | |
2715 | osc_extent_state_set(ext, OES_TRUNC); | |
2716 | osc_update_pending(obj, OBD_BRW_WRITE, | |
2717 | -ext->oe_nr_pages); | |
2718 | } | |
2719 | EASSERT(list_empty(&ext->oe_link), ext); | |
2720 | list_add_tail(&ext->oe_link, &list); | |
2721 | ||
2722 | ext = next_extent(ext); | |
2723 | } | |
2724 | osc_object_unlock(obj); | |
2725 | ||
2726 | osc_list_maint(cli, obj); | |
2727 | ||
2728 | while (!list_empty(&list)) { | |
2729 | int rc; | |
2730 | ||
2731 | ext = list_entry(list.next, struct osc_extent, oe_link); | |
2732 | list_del_init(&ext->oe_link); | |
2733 | ||
2734 | /* extent may be in OES_ACTIVE state because inode mutex | |
30aa9c52 OD |
2735 | * is released before osc_io_end() in file write case |
2736 | */ | |
d7e09d03 PT |
2737 | if (ext->oe_state != OES_TRUNC) |
2738 | osc_extent_wait(env, ext, OES_TRUNC); | |
2739 | ||
2740 | rc = osc_extent_truncate(ext, index, partial); | |
2741 | if (rc < 0) { | |
2742 | if (result == 0) | |
2743 | result = rc; | |
2744 | ||
2745 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
2746 | "truncate error %d\n", rc); | |
2747 | } else if (ext->oe_nr_pages == 0) { | |
2748 | osc_extent_remove(ext); | |
2749 | } else { | |
2750 | /* this must be an overlapped extent which means only | |
2751 | * part of the pages in this extent have been truncated. | |
2752 | */ | |
2753 | EASSERTF(ext->oe_start <= index, ext, | |
2754 | "trunc index = %lu/%d.\n", index, partial); | |
2755 | /* fix index to skip this partially truncated extent */ | |
2756 | index = ext->oe_end + 1; | |
2757 | partial = false; | |
2758 | ||
2759 | /* we need to hold this extent in OES_TRUNC state so | |
2760 | * that no writeback will happen. This is to avoid | |
30aa9c52 OD |
2761 | * BUG 17397. |
2762 | */ | |
7f1ae4c0 | 2763 | LASSERT(!oio->oi_trunc); |
d7e09d03 PT |
2764 | oio->oi_trunc = osc_extent_get(ext); |
2765 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
b0f5aad5 | 2766 | "trunc at %llu\n", size); |
d7e09d03 PT |
2767 | } |
2768 | osc_extent_put(env, ext); | |
2769 | } | |
7f1ae4c0 | 2770 | if (waiting) { |
d7e09d03 PT |
2771 | int rc; |
2772 | ||
2773 | /* ignore the result of osc_extent_wait; the write initiator | |
30aa9c52 OD |
2774 | * should take care of it. |
2775 | */ | |
d7e09d03 PT |
2776 | rc = osc_extent_wait(env, waiting, OES_INV); |
2777 | if (rc < 0) | |
451721cc | 2778 | OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); |
d7e09d03 PT |
2779 | |
2780 | osc_extent_put(env, waiting); | |
2781 | waiting = NULL; | |
2782 | goto again; | |
2783 | } | |
0a3bdb00 | 2784 | return result; |
d7e09d03 PT |
2785 | } |
2786 | ||
2787 | /** | |
2788 | * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. | |
2789 | */ | |
2790 | void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, | |
2791 | struct osc_object *obj) | |
2792 | { | |
2793 | struct osc_extent *ext = oio->oi_trunc; | |
2794 | ||
2795 | oio->oi_trunc = NULL; | |
7f1ae4c0 | 2796 | if (ext) { |
d7e09d03 PT |
2797 | bool unplug = false; |
2798 | ||
2799 | EASSERT(ext->oe_nr_pages > 0, ext); | |
2800 | EASSERT(ext->oe_state == OES_TRUNC, ext); | |
2801 | EASSERT(!ext->oe_urgent, ext); | |
2802 | ||
2803 | OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); | |
2804 | osc_object_lock(obj); | |
2805 | osc_extent_state_set(ext, OES_CACHE); | |
2806 | if (ext->oe_fsync_wait && !ext->oe_urgent) { | |
2807 | ext->oe_urgent = 1; | |
2808 | list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2809 | unplug = true; | |
2810 | } | |
2811 | osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); | |
2812 | osc_object_unlock(obj); | |
2813 | osc_extent_put(env, ext); | |
2814 | ||
2815 | if (unplug) | |
2816 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
2817 | } | |
2818 | } | |
2819 | ||
2820 | /** | |
2821 | * Wait for extents in a specific range to be written out. | |
2822 | * The caller must have called osc_cache_writeback_range() to issue IO | |
2823 | * otherwise it will take a long time for this function to finish. | |
2824 | * | |
2825 | * Caller must hold inode_mutex, or cancel the exclusive dlm lock so that | |
2826 | * nobody else can dirty this range of file while we're waiting for | |
2827 | * extents to be written. | |
2828 | */ | |
2829 | int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, | |
2830 | pgoff_t start, pgoff_t end) | |
2831 | { | |
2832 | struct osc_extent *ext; | |
2833 | pgoff_t index = start; | |
29ac6840 | 2834 | int result = 0; |
d7e09d03 PT |
2835 | |
2836 | again: | |
2837 | osc_object_lock(obj); | |
2838 | ext = osc_extent_search(obj, index); | |
7f1ae4c0 | 2839 | if (!ext) |
d7e09d03 PT |
2840 | ext = first_extent(obj); |
2841 | else if (ext->oe_end < index) | |
2842 | ext = next_extent(ext); | |
7f1ae4c0 | 2843 | while (ext) { |
d7e09d03 PT |
2844 | int rc; |
2845 | ||
2846 | if (ext->oe_start > end) | |
2847 | break; | |
2848 | ||
2849 | if (!ext->oe_fsync_wait) { | |
2850 | ext = next_extent(ext); | |
2851 | continue; | |
2852 | } | |
2853 | ||
2854 | EASSERT(ergo(ext->oe_state == OES_CACHE, | |
2855 | ext->oe_hp || ext->oe_urgent), ext); | |
2856 | EASSERT(ergo(ext->oe_state == OES_ACTIVE, | |
2857 | !ext->oe_hp && ext->oe_urgent), ext); | |
2858 | ||
2859 | index = ext->oe_end + 1; | |
2860 | osc_extent_get(ext); | |
2861 | osc_object_unlock(obj); | |
2862 | ||
2863 | rc = osc_extent_wait(env, ext, OES_INV); | |
2864 | if (result == 0) | |
2865 | result = rc; | |
2866 | osc_extent_put(env, ext); | |
2867 | goto again; | |
2868 | } | |
2869 | osc_object_unlock(obj); | |
2870 | ||
2871 | OSC_IO_DEBUG(obj, "sync file range.\n"); | |
0a3bdb00 | 2872 | return result; |
d7e09d03 PT |
2873 | } |
2874 | ||
2875 | /** | |
2876 | * Called to write out a range of osc object. | |
2877 | * | |
2878 | * @hp: should be set if this is caused by lock cancellation; | |
2879 | * @discard: is set if dirty pages should be dropped - the file will be deleted | |
2880 | * or truncated; this implies there are no partially discarded extents. | |
2881 | * | |
2882 | * Return how many pages will be issued, or an error code if an error occurred. | |
2883 | */ | |
2884 | int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, | |
2885 | pgoff_t start, pgoff_t end, int hp, int discard) | |
2886 | { | |
2887 | struct osc_extent *ext; | |
2888 | LIST_HEAD(discard_list); | |
2889 | bool unplug = false; | |
2890 | int result = 0; | |
d7e09d03 PT |
2891 | |
2892 | osc_object_lock(obj); | |
2893 | ext = osc_extent_search(obj, start); | |
7f1ae4c0 | 2894 | if (!ext) |
d7e09d03 PT |
2895 | ext = first_extent(obj); |
2896 | else if (ext->oe_end < start) | |
2897 | ext = next_extent(ext); | |
7f1ae4c0 | 2898 | while (ext) { |
d7e09d03 PT |
2899 | if (ext->oe_start > end) |
2900 | break; | |
2901 | ||
2902 | ext->oe_fsync_wait = 1; | |
2903 | switch (ext->oe_state) { | |
2904 | case OES_CACHE: | |
2905 | result += ext->oe_nr_pages; | |
2906 | if (!discard) { | |
2907 | struct list_head *list = NULL; | |
50ffcb7e | 2908 | |
d7e09d03 PT |
2909 | if (hp) { |
2910 | EASSERT(!ext->oe_hp, ext); | |
2911 | ext->oe_hp = 1; | |
2912 | list = &obj->oo_hp_exts; | |
2913 | } else if (!ext->oe_urgent) { | |
2914 | ext->oe_urgent = 1; | |
2915 | list = &obj->oo_urgent_exts; | |
2916 | } | |
7f1ae4c0 | 2917 | if (list) |
d7e09d03 PT |
2918 | list_move_tail(&ext->oe_link, list); |
2919 | unplug = true; | |
2920 | } else { | |
2921 | /* the only discarder is lock cancelling, so | |
30aa9c52 OD |
2922 | * [start, end] must contain this extent |
2923 | */ | |
d7e09d03 PT |
2924 | EASSERT(ext->oe_start >= start && |
2925 | ext->oe_max_end <= end, ext); | |
2926 | osc_extent_state_set(ext, OES_LOCKING); | |
2927 | ext->oe_owner = current; | |
79910d7d | 2928 | list_move_tail(&ext->oe_link, &discard_list); |
d7e09d03 PT |
2929 | osc_update_pending(obj, OBD_BRW_WRITE, |
2930 | -ext->oe_nr_pages); | |
2931 | } | |
2932 | break; | |
2933 | case OES_ACTIVE: | |
2934 | /* It's pretty bad to wait for ACTIVE extents, because | |
2935 | * we don't know how long we will wait for them to be | |
2936 | * flushed since they may be blocked awaiting more | |
30aa9c52 OD |
2937 | * grants. We do this for the correctness of fsync. |
2938 | */ | |
d7e09d03 PT |
2939 | LASSERT(hp == 0 && discard == 0); |
2940 | ext->oe_urgent = 1; | |
2941 | break; | |
2942 | case OES_TRUNC: | |
2943 | /* this extent is being truncated, can't do anything | |
2944 | * for it now. it will be set to urgent after truncate | |
30aa9c52 OD |
2945 | * is finished in osc_cache_truncate_end(). |
2946 | */ | |
d7e09d03 PT |
2947 | default: |
2948 | break; | |
2949 | } | |
2950 | ext = next_extent(ext); | |
2951 | } | |
2952 | osc_object_unlock(obj); | |
2953 | ||
2954 | LASSERT(ergo(!discard, list_empty(&discard_list))); | |
2955 | if (!list_empty(&discard_list)) { | |
2956 | struct osc_extent *tmp; | |
2957 | int rc; | |
2958 | ||
2959 | osc_list_maint(osc_cli(obj), obj); | |
2960 | list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { | |
2961 | list_del_init(&ext->oe_link); | |
2962 | EASSERT(ext->oe_state == OES_LOCKING, ext); | |
2963 | ||
2964 | /* Discard caching pages. We don't actually write this | |
30aa9c52 OD |
2965 | * extent out but we complete it as if we did. |
2966 | */ | |
d7e09d03 PT |
2967 | rc = osc_extent_make_ready(env, ext); |
2968 | if (unlikely(rc < 0)) { | |
2969 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
2970 | "make_ready returned %d\n", rc); | |
2971 | if (result >= 0) | |
2972 | result = rc; | |
2973 | } | |
2974 | ||
2975 | /* finish the extent as if the pages were sent */ | |
2976 | osc_extent_finish(env, ext, 0, 0); | |
2977 | } | |
2978 | } | |
2979 | ||
2980 | if (unplug) | |
c5c4c6fa | 2981 | osc_io_unplug(env, osc_cli(obj), obj); |
d7e09d03 PT |
2982 | |
2983 | if (hp || discard) { | |
2984 | int rc; | |
50ffcb7e | 2985 | |
d7e09d03 PT |
2986 | rc = osc_cache_wait_range(env, obj, start, end); |
2987 | if (result >= 0 && rc < 0) | |
2988 | result = rc; | |
2989 | } | |
2990 | ||
2991 | OSC_IO_DEBUG(obj, "cache page out.\n"); | |
0a3bdb00 | 2992 | return result; |
d7e09d03 PT |
2993 | } |
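/*
 * Illustrative sketch, not part of the original file: how a flush-and-wait
 * caller could drive the two range helpers above. The wrapper function is
 * hypothetical; osc_cache_writeback_range() only waits internally when hp
 * or discard is set, so the plain write-out case waits explicitly.
 */
static int example_flush_range(const struct lu_env *env,
			       struct osc_object *obj,
			       pgoff_t start, pgoff_t end)
{
	int rc;

	/* hp=0: not a lock cancel; discard=0: actually write the data out */
	rc = osc_cache_writeback_range(env, obj, start, end, 0, 0);
	if (rc < 0)
		return rc;

	/* now wait for the extents in the range to reach OES_INV */
	return osc_cache_wait_range(env, obj, start, end);
}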
2994 | ||
2995 | /** @} osc */ |