writeback: get rid of pdflush completely
mm/backing-dev.c (linux-2.6-block.git)
#include <linux/wait.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>

void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);

struct backing_dev_info default_backing_dev_info = {
        .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state = 0,
        .capabilities = BDI_CAP_MAP_COPY,
        .unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

static struct class *bdi_class;
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
LIST_HEAD(bdi_pending_list);

static struct task_struct *sync_supers_tsk;
static struct timer_list sync_supers_timer;

static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
static void arm_supers_timer(void);

static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
        bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        unsigned long bdi_thresh;

        get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);

#define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
                   "BdiWriteback: %8lu kB\n"
                   "BdiReclaimable: %8lu kB\n"
                   "BdiDirtyThresh: %8lu kB\n"
                   "DirtyThresh: %8lu kB\n"
                   "BackgroundThresh: %8lu kB\n",
                   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
                   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
                   K(bdi_thresh),
                   K(dirty_thresh),
                   K(background_thresh));
#undef K

        return 0;
}

static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
        return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
        .open = bdi_debug_stats_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
        bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
                                               bdi, &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove(bdi->debug_stats);
        debugfs_remove(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned long read_ahead_kb;
        ssize_t ret = -EINVAL;

        read_ahead_kb = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
                ret = count;
        }
        return ret;
}

#define K(pages) ((pages) << (PAGE_SHIFT - 10))

#define BDI_SHOW(name, expr)                                            \
static ssize_t name##_show(struct device *dev,                          \
                           struct device_attribute *attr, char *page)   \
{                                                                        \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
                                                                         \
        return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr);  \
}

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_min_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        char *end;
        unsigned int ratio;
        ssize_t ret = -EINVAL;

        ratio = simple_strtoul(buf, &end, 10);
        if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
                ret = bdi_set_max_ratio(bdi, ratio);
                if (!ret)
                        ret = count;
        }
        return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

static struct device_attribute bdi_dev_attrs[] = {
        __ATTR_RW(read_ahead_kb),
        __ATTR_RW(min_ratio),
        __ATTR_RW(max_ratio),
        __ATTR_NULL,
};

static __init int bdi_class_init(void)
{
        bdi_class = class_create(THIS_MODULE, "bdi");
        bdi_class->dev_attrs = bdi_dev_attrs;
        bdi_debug_init();
        return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
        int err;

        sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
        BUG_ON(IS_ERR(sync_supers_tsk));

        init_timer(&sync_supers_timer);
        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
        arm_supers_timer();

        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");

        return err;
}
subsys_initcall(default_bdi_init);

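/*
 * Initialize a per-bdi writeback structure: clear it, tie it to its bdi
 * and set up the dirty/io/more_io inode lists it will work from.
 */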
static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
{
        memset(wb, 0, sizeof(*wb));

        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
}

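/*
 * Setup run by each writeback thread on itself: hook the wb onto the
 * bdi's list and mark the current task as a freezable flusher running
 * at normal priority.
 */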
static void bdi_task_init(struct backing_dev_info *bdi,
                          struct bdi_writeback *wb)
{
        struct task_struct *tsk = current;

        spin_lock(&bdi->wb_lock);
        list_add_tail_rcu(&wb->list, &bdi->wb_list);
        spin_unlock(&bdi->wb_lock);

        tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
        set_freezable();

        /*
         * Our parent may run at a different priority, just set us to normal
         */
        set_user_nice(tsk, 0);
}

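/*
 * Main function of the flusher threads that bdi_forker_task() spawns for
 * bdis with dirty data: put the bdi back on the active list, run
 * bdi_writeback_task() until it exits, then unhook the wb again.
 */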
static int bdi_start_fn(void *ptr)
{
        struct bdi_writeback *wb = ptr;
        struct backing_dev_info *bdi = wb->bdi;
        int ret;

        /*
         * Add us to the active bdi_list
         */
        spin_lock(&bdi_lock);
        list_add(&bdi->bdi_list, &bdi_list);
        spin_unlock(&bdi_lock);

        bdi_task_init(bdi, wb);

        /*
         * Clear pending bit and wakeup anybody waiting to tear us down
         */
        clear_bit(BDI_pending, &bdi->state);
        smp_mb__after_clear_bit();
        wake_up_bit(&bdi->state, BDI_pending);

        ret = bdi_writeback_task(wb);

        /*
         * Remove us from the list
         */
        spin_lock(&bdi->wb_lock);
        list_del_rcu(&wb->list);
        spin_unlock(&bdi->wb_lock);

        /*
         * Flush any work that raced with us exiting. No new work
         * will be added, since this bdi isn't discoverable anymore.
         */
        if (!list_empty(&bdi->work_list))
                wb_do_writeback(wb, 1);

        wb->task = NULL;
        return ret;
}

int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        return wb_has_dirty_io(&bdi->wb);
}

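/*
 * Push some dirty data out of this bdi directly, without a dedicated
 * flusher thread. Used by the forker when it fails to spawn one.
 */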
static void bdi_flush_io(struct backing_dev_info *bdi)
{
        struct writeback_control wbc = {
                .bdi = bdi,
                .sync_mode = WB_SYNC_NONE,
                .older_than_this = NULL,
                .range_cyclic = 1,
                .nr_to_write = 1024,
        };

        writeback_inodes_wbc(&wbc);
}

/*
 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
 * or we risk deadlocking on ->s_umount. The longer term solution would be
 * to implement sync_supers_bdi() or similar and simply do it from the
 * bdi writeback tasks individually.
 */
static int bdi_sync_supers(void *unused)
{
        set_user_nice(current, 0);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();

                /*
                 * Do this periodically, like kupdated() did before.
                 */
                sync_supers();
        }

        return 0;
}

static void arm_supers_timer(void)
{
        unsigned long next;

        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
        mod_timer(&sync_supers_timer, round_jiffies_up(next));
}

static void sync_supers_timer_fn(unsigned long unused)
{
        wake_up_process(sync_supers_tsk);
        arm_supers_timer();
}

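/*
 * The forker thread runs on behalf of the default bdi. It watches for
 * bdis that have dirty data but no flusher thread yet, moves them to the
 * pending list and spawns a "flush-<name>" thread for each of them.
 */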
static int bdi_forker_task(void *ptr)
{
        struct bdi_writeback *me = ptr;

        bdi_task_init(me->bdi, me);

        for (;;) {
                struct backing_dev_info *bdi, *tmp;
                struct bdi_writeback *wb;

                /*
                 * Temporary measure, we want to make sure we don't see
                 * dirty data on the default backing_dev_info
                 */
                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
                        wb_do_writeback(me, 0);

                spin_lock(&bdi_lock);

                /*
                 * Check if any existing bdi's have dirty data without
                 * a thread registered. If so, set that up.
                 */
                list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
                        if (bdi->wb.task)
                                continue;
                        if (list_empty(&bdi->work_list) &&
                            !bdi_has_dirty_io(bdi))
                                continue;

                        bdi_add_default_flusher_task(bdi);
                }

                set_current_state(TASK_INTERRUPTIBLE);

                if (list_empty(&bdi_pending_list)) {
                        unsigned long wait;

                        spin_unlock(&bdi_lock);
                        wait = msecs_to_jiffies(dirty_writeback_interval * 10);
                        schedule_timeout(wait);
                        try_to_freeze();
                        continue;
                }

                __set_current_state(TASK_RUNNING);

                /*
                 * This is our real job - check for pending entries in
                 * bdi_pending_list, and create the tasks that got added
                 */
                bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
                                 bdi_list);
                list_del_init(&bdi->bdi_list);
                spin_unlock(&bdi_lock);

                wb = &bdi->wb;
                wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
                                       dev_name(bdi->dev));
                /*
                 * If task creation fails, then re-add the bdi to
                 * the pending list and force writeout of the bdi
                 * from this forker thread. That will free some memory
                 * and we can try again.
                 */
                if (IS_ERR(wb->task)) {
                        wb->task = NULL;

                        /*
                         * Add this 'bdi' to the back, so we get
                         * a chance to flush other bdi's to free
                         * memory.
                         */
                        spin_lock(&bdi_lock);
                        list_add_tail(&bdi->bdi_list, &bdi_pending_list);
                        spin_unlock(&bdi_lock);

                        bdi_flush_io(bdi);
                }
        }

        return 0;
}

/*
 * Add the default flusher task that gets created for any bdi
 * that has dirty data pending writeout
 */
static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
{
        if (!bdi_cap_writeback_dirty(bdi))
                return;

        /*
         * Check with the helper whether to proceed adding a task. Will only
         * abort if two or more simultaneous calls to
         * bdi_add_default_flusher_task() occurred; further additions will
         * block waiting for previous additions to finish.
         */
        if (!test_and_set_bit(BDI_pending, &bdi->state)) {
                list_move_tail(&bdi->bdi_list, &bdi_pending_list);

                /*
                 * We are now on the pending list, wake up bdi_forker_task()
                 * to finish the job and add us back to the active bdi_list
                 */
                wake_up_process(default_backing_dev_info.wb.task);
        }
}

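/*
 * Register a bdi: create its device, add it to the global bdi_list and,
 * for the default bdi, spawn the forker thread.
 */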
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 const char *fmt, ...)
{
        va_list args;
        int ret = 0;
        struct device *dev;

        if (bdi->dev)   /* The driver needs to use separate queues per device */
                goto exit;

        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
        if (IS_ERR(dev)) {
                ret = PTR_ERR(dev);
                goto exit;
        }

        spin_lock(&bdi_lock);
        list_add_tail(&bdi->bdi_list, &bdi_list);
        spin_unlock(&bdi_lock);

        bdi->dev = dev;

        /*
         * Just start the forker thread for our default backing_dev_info,
         * and add other bdi's to the list. They will get a thread created
         * on-demand when they need it.
         */
        if (bdi_cap_flush_forker(bdi)) {
                struct bdi_writeback *wb = &bdi->wb;

                wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
                                       dev_name(dev));
                if (IS_ERR(wb->task)) {
                        wb->task = NULL;
                        ret = -ENOMEM;

                        spin_lock(&bdi_lock);
                        list_del(&bdi->bdi_list);
                        spin_unlock(&bdi_lock);
                        goto exit;
                }
        }

        bdi_debug_register(bdi, dev_name(dev));
exit:
        return ret;
}
EXPORT_SYMBOL(bdi_register);

int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
        return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
        struct bdi_writeback *wb;

        if (!bdi_cap_writeback_dirty(bdi))
                return;

        /*
         * If setup is pending, wait for that to complete first
         */
        wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
                    TASK_UNINTERRUPTIBLE);

        /*
         * Make sure nobody finds us on the bdi_list anymore
         */
        spin_lock(&bdi_lock);
        list_del(&bdi->bdi_list);
        spin_unlock(&bdi_lock);

        /*
         * Finally, kill the kernel threads. We don't need to be RCU
         * safe anymore, since the bdi is gone from visibility.
         */
        list_for_each_entry(wb, &bdi->wb_list, list)
                kthread_stop(wb->task);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
        if (bdi->dev) {
                if (!bdi_cap_flush_forker(bdi))
                        bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

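/*
 * One-time initialization of a backing_dev_info: dirty ratio limits,
 * writeback lists, the embedded bdi_writeback and the per-cpu counters.
 */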
int bdi_init(struct backing_dev_info *bdi)
{
        int i, err;

        bdi->dev = NULL;

        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = PROP_FRAC_BASE;
        spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->wb_list);
        INIT_LIST_HEAD(&bdi->work_list);

        bdi_wb_init(&bdi->wb, bdi);

        /*
         * Just one thread support for now, hard code mask and count
         */
        bdi->wb_mask = 1;
        bdi->wb_cnt = 1;

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
                err = percpu_counter_init(&bdi->bdi_stat[i], 0);
                if (err)
                        goto err;
        }

        bdi->dirty_exceeded = 0;
        err = prop_local_init_percpu(&bdi->completions);

        if (err) {
err:
                while (i--)
                        percpu_counter_destroy(&bdi->bdi_stat[i]);
        }

        return err;
}
EXPORT_SYMBOL(bdi_init);

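/*
 * Tear down a bdi: unregister its device and release the per-cpu
 * counters and completion proportions allocated in bdi_init().
 */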
void bdi_destroy(struct backing_dev_info *bdi)
{
        int i;

        WARN_ON(bdi_has_dirty_io(bdi));

        bdi_unregister(bdi);

        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                percpu_counter_destroy(&bdi->bdi_stat[i]);

        prop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);

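/*
 * Wait queues for tasks throttled on a congested bdi, one for each
 * direction (sync and async).
 */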
static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };

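/*
 * Clear the congested bit for the given direction and wake anyone
 * sleeping in congestion_wait().
 */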
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        clear_bit(bit, &bdi->state);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
        enum bdi_state bit;

        bit = sync ? BDI_sync_congested : BDI_async_congested;
        set_bit(bit, &bdi->state);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion. If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
        long ret;
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = &congestion_wqh[sync];

        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
        finish_wait(wqh, &wait);
        return ret;
}
EXPORT_SYMBOL(congestion_wait);