NTFS: Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
/**
 * aops.c - NTFS kernel address space operations and page cache handling.
 * Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2005 Anton Altaparmakov
 * Copyright (c) 2002 Richard Russon
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

#include "aops.h"
#include "attrib.h"
#include "debug.h"
#include "inode.h"
#include "mft.h"
#include "runlist.h"
#include "types.h"
#include "ntfs.h"

/**
 * ntfs_end_buffer_async_read - async io completion for reading attributes
 * @bh: buffer head on which io is completed
 * @uptodate: whether @bh is now uptodate or not
 *
 * Asynchronous I/O completion handler for reading pages belonging to the
 * attribute address space of an inode. The inodes can either be files or
 * directories or they can be fake inodes describing some attribute.
 *
 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
 * page has been completed and mark the page uptodate or set the error bit on
 * the page. To determine the size of the records that need fixing up, we
 * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
 * record size, and index_block_size_bits, to the log(base 2) of the ntfs
 * record size.
 */
static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	static DEFINE_SPINLOCK(page_uptodate_lock);
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;
	ntfs_inode *ni;
	int page_uptodate = 1;

	page = bh->b_page;
	ni = NTFS_I(page->mapping->host);

	if (likely(uptodate)) {
		s64 file_ofs, initialized_size;

		set_buffer_uptodate(bh);

		file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
				bh_offset(bh);
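		/*
		 * Illustration with hypothetical sizes (4096-byte
		 * PAGE_CACHE_SIZE, 512-byte blocks): for page->index 2 and
		 * the buffer at offset 3 in the page, file_ofs =
		 * 2 * 4096 + 3 * 512 = 9728 bytes into the attribute.
		 */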
		read_lock_irqsave(&ni->size_lock, flags);
		initialized_size = ni->initialized_size;
		read_unlock_irqrestore(&ni->size_lock, flags);
		/* Check for the current buffer head overflowing. */
		if (file_ofs + bh->b_size > initialized_size) {
			char *addr;
			int ofs = 0;

			if (file_ofs < initialized_size)
				ofs = initialized_size - file_ofs;
			addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
			memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
			flush_dcache_page(page);
			kunmap_atomic(addr, KM_BIO_SRC_IRQ);
		}
	} else {
		clear_buffer_uptodate(bh);
		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
				(unsigned long long)bh->b_blocknr);
		SetPageError(page);
	}
	spin_lock_irqsave(&page_uptodate_lock, flags);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
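	/*
	 * All buffer heads attached to a page form a circular list through
	 * b_this_page, so one pass from bh back to bh visits every buffer of
	 * the page exactly once.
	 */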
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			if (likely(buffer_locked(tmp)))
				goto still_busy;
			/* Async buffers must be locked. */
			BUG();
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	/*
	 * If none of the buffers had errors then we can set the page uptodate,
	 * but we first have to perform the post read mst fixups, if the
	 * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
	 * Note we ignore fixup errors as those are detected when
	 * map_mft_record() is called which gives us per record granularity
	 * rather than per page granularity.
	 */
	if (!NInoMstProtected(ni)) {
		if (likely(page_uptodate && !PageError(page)))
			SetPageUptodate(page);
	} else {
		char *addr;
		unsigned int i, recs;
		u32 rec_size;

		rec_size = ni->itype.index.block_size;
		recs = PAGE_CACHE_SIZE / rec_size;
		/* Should have been verified before we got here... */
		BUG_ON(!recs);
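		/*
		 * Example with hypothetical sizes: 4096-byte pages and
		 * 1024-byte index records give recs = 4, so the loop below
		 * applies the mst fixups to each of the four records covered
		 * by this page.
		 */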
		addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
		for (i = 0; i < recs; i++)
			post_read_mst_fixup((NTFS_RECORD*)(addr +
					i * rec_size), rec_size);
		flush_dcache_page(page);
		kunmap_atomic(addr, KM_BIO_SRC_IRQ);
		if (likely(page_uptodate && !PageError(page)))
			SetPageUptodate(page);
	}
	unlock_page(page);
	return;
still_busy:
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	return;
}

/**
 * ntfs_read_block - fill a @page of an address space with data
 * @page: page cache page to fill with data
 *
 * Fill the page @page of the address space belonging to the @page->host inode.
 * We read each buffer asynchronously and when all buffers are read in, our io
 * completion handler ntfs_end_buffer_async_read(), if required, automatically
 * applies the mst fixups to the page before finally marking it uptodate and
 * unlocking it.
 *
 * We only enforce the allocated_size limit because i_size is checked for in
 * generic_file_read().
 *
 * Return 0 on success and -errno on error.
 *
 * Contains an adapted version of fs/buffer.c::block_read_full_page().
 */
static int ntfs_read_block(struct page *page)
{
	VCN vcn;
	LCN lcn;
	ntfs_inode *ni;
	ntfs_volume *vol;
	runlist_element *rl;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	sector_t iblock, lblock, zblock;
	unsigned long flags;
	unsigned int blocksize, vcn_ofs;
	int i, nr;
	unsigned char blocksize_bits;

	ni = NTFS_I(page->mapping->host);
	vol = ni->vol;

	/* $MFT/$DATA must have its complete runlist in memory at all times. */
	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));

	blocksize_bits = VFS_I(ni)->i_blkbits;
	blocksize = 1 << blocksize_bits;

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, blocksize, 0);
		if (unlikely(!page_has_buffers(page))) {
			unlock_page(page);
			return -ENOMEM;
		}
	}
	bh = head = page_buffers(page);
	BUG_ON(!bh);

	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
	read_lock_irqsave(&ni->size_lock, flags);
	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
	zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
	read_unlock_irqrestore(&ni->size_lock, flags);
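	/*
	 * Worked example with hypothetical sizes (512-byte blocks):
	 * allocated_size = 10240 and initialized_size = 9216 give
	 * lblock = (10240 + 511) >> 9 = 20 and zblock = (9216 + 511) >> 9 =
	 * 18, so blocks 18 and 19 exist but read as zeroes, and blocks 20
	 * and above are out of bounds.
	 */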

	/* Loop through all the buffers in the page. */
	rl = NULL;
	nr = i = 0;
	do {
		u8 *kaddr;
		int err;

		if (unlikely(buffer_uptodate(bh)))
			continue;
		if (unlikely(buffer_mapped(bh))) {
			arr[nr++] = bh;
			continue;
		}
		err = 0;
		bh->b_bdev = vol->sb->s_bdev;
		/* Is the block within the allowed limits? */
		if (iblock < lblock) {
			BOOL is_retry = FALSE;

			/* Convert iblock into corresponding vcn and offset. */
			vcn = (VCN)iblock << blocksize_bits >>
					vol->cluster_size_bits;
			vcn_ofs = ((VCN)iblock << blocksize_bits) &
					vol->cluster_size_mask;
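			/*
			 * Illustration with hypothetical sizes (512-byte
			 * blocks, 4096-byte clusters): iblock 11 is byte 5632
			 * into the attribute, so vcn = 5632 >> 12 = 1 and
			 * vcn_ofs = 5632 & 4095 = 1536.
			 */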
			if (!rl) {
lock_retry_remap:
				down_read(&ni->runlist.lock);
				rl = ni->runlist.rl;
			}
			if (likely(rl != NULL)) {
				/* Seek to element containing target vcn. */
				while (rl->length && rl[1].vcn <= vcn)
					rl++;
				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
			} else
				lcn = LCN_RL_NOT_MAPPED;
			/* Successful remap. */
			if (lcn >= 0) {
				/* Setup buffer head to correct block. */
				bh->b_blocknr = ((lcn << vol->cluster_size_bits)
						+ vcn_ofs) >> blocksize_bits;
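				/*
				 * Continuing the illustration above, lcn 100
				 * maps the buffer to device block
				 * ((100 << 12) + 1536) >> 9 = 803.
				 */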
				set_buffer_mapped(bh);
				/* Only read initialized data blocks. */
				if (iblock < zblock) {
					arr[nr++] = bh;
					continue;
				}
				/* Fully non-initialized data block, zero it. */
				goto handle_zblock;
			}
			/* It is a hole, need to zero it. */
			if (lcn == LCN_HOLE)
				goto handle_hole;
			/* If first try and runlist unmapped, map and retry. */
			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
				is_retry = TRUE;
				/*
				 * Attempt to map runlist, dropping lock for
				 * the duration.
				 */
				up_read(&ni->runlist.lock);
				err = ntfs_map_runlist(ni, vcn);
				if (likely(!err))
					goto lock_retry_remap;
				rl = NULL;
			} else if (!rl)
				up_read(&ni->runlist.lock);
			/*
			 * If buffer is outside the runlist, treat it as a
			 * hole. This can happen due to concurrent truncate
			 * for example.
			 */
			if (err == -ENOENT || lcn == LCN_ENOENT) {
				err = 0;
				goto handle_hole;
			}
			/* Hard error, zero out region. */
			if (!err)
				err = -EIO;
			bh->b_blocknr = -1;
			SetPageError(page);
			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
					"attribute type 0x%x, vcn 0x%llx, "
					"offset 0x%x because its location on "
					"disk could not be determined%s "
					"(error code %i).", ni->mft_no,
					ni->type, (unsigned long long)vcn,
					vcn_ofs, is_retry ? " even after "
					"retrying" : "", err);
		}
		/*
		 * Either iblock was outside lblock limits or
		 * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
		 * of the page and set the buffer uptodate.
		 */
handle_hole:
		bh->b_blocknr = -1UL;
		clear_buffer_mapped(bh);
handle_zblock:
		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr + i * blocksize, 0, blocksize);
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(page);
		if (likely(!err))
			set_buffer_uptodate(bh);
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	/* Release the lock if we took it. */
	if (rl)
		up_read(&ni->runlist.lock);

	/* Check we have at least one buffer ready for i/o. */
	if (nr) {
		struct buffer_head *tbh;

		/* Lock the buffers. */
		for (i = 0; i < nr; i++) {
			tbh = arr[i];
			lock_buffer(tbh);
			tbh->b_end_io = ntfs_end_buffer_async_read;
			set_buffer_async_read(tbh);
		}
		/* Finally, start i/o on the buffers. */
		for (i = 0; i < nr; i++) {
			tbh = arr[i];
			if (likely(!buffer_uptodate(tbh)))
				submit_bh(READ, tbh);
			else
				ntfs_end_buffer_async_read(tbh, 1);
		}
		return 0;
	}
	/* No i/o was scheduled on any of the buffers. */
	if (likely(!PageError(page)))
		SetPageUptodate(page);
	else /* Signal synchronous i/o error. */
		nr = -EIO;
	unlock_page(page);
	return nr;
}

/**
 * ntfs_readpage - fill a @page of a @file with data from the device
 * @file: open file to which the page @page belongs or NULL
 * @page: page cache page to fill with data
 *
 * For non-resident attributes, ntfs_readpage() fills the @page of the open
 * file @file by calling the ntfs version of the generic block_read_full_page()
 * function, ntfs_read_block(), which in turn creates and reads in the buffers
 * associated with the page asynchronously.
 *
 * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
 * data from the mft record (which at this stage is most likely in memory) and
 * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
 * even if the mft record is not cached at this point in time, we need to wait
 * for it to be read in before we can do the copy.
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_readpage(struct file *file, struct page *page)
{
	ntfs_inode *ni, *base_ni;
	u8 *kaddr;
	ntfs_attr_search_ctx *ctx;
	MFT_RECORD *mrec;
	unsigned long flags;
	u32 attr_len;
	int err = 0;

retry_readpage:
	BUG_ON(!PageLocked(page));
	/*
	 * This can potentially happen because we clear PageUptodate() during
	 * ntfs_writepage() of MstProtected() attributes.
	 */
	if (PageUptodate(page)) {
		unlock_page(page);
		return 0;
	}
	ni = NTFS_I(page->mapping->host);
	/*
	 * Only $DATA attributes can be encrypted and only unnamed $DATA
	 * attributes can be compressed. Index root can have the flags set but
	 * this means to create compressed/encrypted files, not that the
	 * attribute is compressed/encrypted.
	 */
	if (ni->type != AT_INDEX_ROOT) {
		/* If attribute is encrypted, deny access, just like NT4. */
		if (NInoEncrypted(ni)) {
			BUG_ON(ni->type != AT_DATA);
			err = -EACCES;
			goto err_out;
		}
		/* Compressed data streams are handled in compress.c. */
		if (NInoNonResident(ni) && NInoCompressed(ni)) {
			BUG_ON(ni->type != AT_DATA);
			BUG_ON(ni->name_len);
			return ntfs_read_compressed_block(page);
		}
	}
	/* NInoNonResident() == NInoIndexAllocPresent() */
	if (NInoNonResident(ni)) {
		/* Normal, non-resident data stream. */
		return ntfs_read_block(page);
	}
	/*
	 * Attribute is resident, implying it is not compressed or encrypted.
	 * This also means the attribute is smaller than an mft record and
	 * hence smaller than a page, so can simply zero out any pages with
	 * index above 0. Note the attribute can actually be marked compressed
	 * but if it is resident the actual data is not compressed so we are
	 * ok to ignore the compressed flag here.
	 */
	if (unlikely(page->index > 0)) {
		kaddr = kmap_atomic(page, KM_USER0);
		memset(kaddr, 0, PAGE_CACHE_SIZE);
		flush_dcache_page(page);
		kunmap_atomic(kaddr, KM_USER0);
		goto done;
	}
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Map, pin, and lock the mft record. */
	mrec = map_mft_record(base_ni);
	if (IS_ERR(mrec)) {
		err = PTR_ERR(mrec);
		goto err_out;
	}
	/*
	 * If a parallel write made the attribute non-resident, drop the mft
	 * record and retry the readpage.
	 */
	if (unlikely(NInoNonResident(ni))) {
		unmap_mft_record(base_ni);
		goto retry_readpage;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto unm_err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err))
		goto put_unm_err_out;
	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
	read_lock_irqsave(&ni->size_lock, flags);
	if (unlikely(attr_len > ni->initialized_size))
		attr_len = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
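	/*
	 * Illustration with hypothetical sizes: for a 300-byte resident
	 * attribute on a 4096-byte page, the memcpy() below fills bytes
	 * 0-299 from the mft record and the memset() zeroes bytes 300-4095.
	 */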
	kaddr = kmap_atomic(page, KM_USER0);
	/* Copy the data to the page. */
	memcpy(kaddr, (u8*)ctx->attr +
			le16_to_cpu(ctx->attr->data.resident.value_offset),
			attr_len);
	/* Zero the remainder of the page. */
	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);
put_unm_err_out:
	ntfs_attr_put_search_ctx(ctx);
unm_err_out:
	unmap_mft_record(base_ni);
done:
	SetPageUptodate(page);
err_out:
	unlock_page(page);
	return err;
}

#ifdef NTFS_RW

/**
 * ntfs_write_block - write a @page to the backing store
 * @page: page cache page to write out
 * @wbc: writeback control structure
 *
 * This function is for writing pages belonging to non-resident, non-mst
 * protected attributes to their backing store.
 *
 * For a page with buffers, map and write the dirty buffers asynchronously
 * under page writeback. For a page without buffers, create buffers for the
 * page, then proceed as above.
 *
 * If a page doesn't have buffers the page dirty state is definitive. If a page
 * does have buffers, the page dirty state is just a hint, and the buffer dirty
 * state is definitive. (A hint which has rules: dirty buffers against a clean
 * page are illegal. Other combinations are legal and need to be handled; in
 * particular, a dirty page may contain clean buffers.)
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_read_block() and __block_write_full_page().
 */
static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
{
	VCN vcn;
	LCN lcn;
	s64 initialized_size;
	loff_t i_size;
	sector_t block, dblock, iblock;
	struct inode *vi;
	ntfs_inode *ni;
	ntfs_volume *vol;
	runlist_element *rl;
	struct buffer_head *bh, *head;
	unsigned long flags;
	unsigned int blocksize, vcn_ofs;
	int err;
	BOOL need_end_writeback;
	unsigned char blocksize_bits;

	vi = page->mapping->host;
	ni = NTFS_I(vi);
	vol = ni->vol;

	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
			"0x%lx.", ni->mft_no, ni->type, page->index);

	BUG_ON(!NInoNonResident(ni));
	BUG_ON(NInoMstProtected(ni));

	blocksize_bits = vi->i_blkbits;
	blocksize = 1 << blocksize_bits;

	if (!page_has_buffers(page)) {
		BUG_ON(!PageUptodate(page));
		create_empty_buffers(page, blocksize,
				(1 << BH_Uptodate) | (1 << BH_Dirty));
		if (unlikely(!page_has_buffers(page))) {
			ntfs_warning(vol->sb, "Error allocating page "
					"buffers. Redirtying page so we try "
					"again later.");
			/*
			 * Put the page back on mapping->dirty_pages, but leave
			 * its buffers' dirty state as-is.
			 */
			redirty_page_for_writepage(wbc, page);
			unlock_page(page);
			return 0;
		}
	}
	bh = head = page_buffers(page);
	BUG_ON(!bh);

	/* NOTE: Different naming scheme to ntfs_read_block()! */

	/* The first block in the page. */
	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);

	read_lock_irqsave(&ni->size_lock, flags);
	i_size = i_size_read(vi);
	initialized_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);

	/* The first out of bounds block for the data size. */
	dblock = (i_size + blocksize - 1) >> blocksize_bits;

	/* The last (fully or partially) initialized block. */
	iblock = initialized_size >> blocksize_bits;

	/*
	 * Be very careful. We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time. If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	/*
	 * Loop through all the buffers in the page, mapping all the dirty
	 * buffers to disk addresses and handling any aliases from the
	 * underlying block device's mapping.
	 */
	rl = NULL;
	err = 0;
	do {
		BOOL is_retry = FALSE;

		if (unlikely(block >= dblock)) {
			/*
			 * Mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress. The contents of such buffers
			 * were zeroed by ntfs_writepage().
			 *
			 * FIXME: What about the small race window where
			 * ntfs_writepage() has not done any clearing because
			 * the page was within i_size but before we get here,
			 * vmtruncate() modifies i_size?
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
			continue;
		}

		/* Clean buffers are not written out, so no need to map them. */
		if (!buffer_dirty(bh))
			continue;

		/* Make sure we have enough initialized size. */
		if (unlikely((block >= iblock) &&
				(initialized_size < i_size))) {
			/*
			 * If this page is fully outside initialized size, zero
			 * out all pages between the current initialized size
			 * and the current page. Just use ntfs_readpage() to do
			 * the zeroing transparently.
			 */
			if (block > iblock) {
				// TODO:
				// For each page do:
				// - read_cache_page()
				// Again for each page do:
				// - wait_on_page_locked()
				// - Check (PageUptodate(page) &&
				//		!PageError(page))
				// Update initialized size in the attribute and
				// in the inode.
				// Again, for each page do:
				//	__set_page_dirty_buffers();
				//	page_cache_release()
				// We don't need to wait on the writes.
				// Update iblock.
			}
			/*
			 * The current page straddles initialized size. Zero
			 * all non-uptodate buffers and set them uptodate (and
			 * dirty?). Note, there aren't any non-uptodate buffers
			 * if the page is uptodate.
			 * FIXME: For an uptodate page, the buffers may need to
			 * be written out because they were not initialized on
			 * disk before.
			 */
			if (!PageUptodate(page)) {
				// TODO:
				// Zero any non-uptodate buffers up to i_size.
				// Set them uptodate and dirty.
			}
			// TODO:
			// Update initialized size in the attribute and in the
			// inode (up to i_size).
			// Update iblock.
			// FIXME: This is inefficient. Try to batch the two
			// size changes to happen in one go.
			ntfs_error(vol->sb, "Writing beyond initialized size "
					"is not supported yet. Sorry.");
			err = -EOPNOTSUPP;
			break;
			// Do NOT set_buffer_new() BUT DO clear buffer range
			// outside write request range.
			// set_buffer_uptodate() on complete buffers as well as
			// set_buffer_dirty().
		}

		/* No need to map buffers that are already mapped. */
		if (buffer_mapped(bh))
			continue;

		/* Unmapped, dirty buffer. Need to map it. */
		bh->b_bdev = vol->sb->s_bdev;

		/* Convert block into corresponding vcn and offset. */
		vcn = (VCN)block << blocksize_bits;
		vcn_ofs = vcn & vol->cluster_size_mask;
		vcn >>= vol->cluster_size_bits;
		if (!rl) {
lock_retry_remap:
			down_read(&ni->runlist.lock);
			rl = ni->runlist.rl;
		}
		if (likely(rl != NULL)) {
			/* Seek to element containing target vcn. */
			while (rl->length && rl[1].vcn <= vcn)
				rl++;
			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
		} else
			lcn = LCN_RL_NOT_MAPPED;
		/* Successful remap. */
		if (lcn >= 0) {
			/* Setup buffer head to point to correct block. */
			bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
					vcn_ofs) >> blocksize_bits;
			set_buffer_mapped(bh);
			continue;
		}
		/* It is a hole, need to instantiate it. */
		if (lcn == LCN_HOLE) {
			u8 *kaddr;
			unsigned long *bpos, *bend;

			/* Check if the buffer is zero. */
			kaddr = kmap_atomic(page, KM_USER0);
			bpos = (unsigned long *)(kaddr + bh_offset(bh));
			bend = (unsigned long *)((u8*)bpos + blocksize);
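			/*
			 * Scan the buffer one machine word at a time rather
			 * than byte by byte; e.g. a 512-byte buffer takes at
			 * most 64 loads on a 64-bit machine.
			 */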
			do {
				if (unlikely(*bpos))
					break;
			} while (likely(++bpos < bend));
			kunmap_atomic(kaddr, KM_USER0);
			if (bpos == bend) {
				/*
				 * Buffer is zero and sparse, no need to write
				 * it.
				 */
				bh->b_blocknr = -1;
				clear_buffer_dirty(bh);
				continue;
			}
			// TODO: Instantiate the hole.
			// clear_buffer_new(bh);
			// unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
			ntfs_error(vol->sb, "Writing into sparse regions is "
					"not supported yet. Sorry.");
			err = -EOPNOTSUPP;
			break;
		}
		/* If first try and runlist unmapped, map and retry. */
		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
			is_retry = TRUE;
			/*
			 * Attempt to map runlist, dropping lock for
			 * the duration.
			 */
			up_read(&ni->runlist.lock);
			err = ntfs_map_runlist(ni, vcn);
			if (likely(!err))
				goto lock_retry_remap;
			rl = NULL;
		} else if (!rl)
			up_read(&ni->runlist.lock);
		/*
		 * If buffer is outside the runlist, truncate has cut it out
		 * of the runlist. Just clean and clear the buffer and set it
		 * uptodate so it can get discarded by the VM.
		 */
		if (err == -ENOENT || lcn == LCN_ENOENT) {
			u8 *kaddr;

			bh->b_blocknr = -1;
			clear_buffer_dirty(bh);
			kaddr = kmap_atomic(page, KM_USER0);
			memset(kaddr + bh_offset(bh), 0, blocksize);
			kunmap_atomic(kaddr, KM_USER0);
			flush_dcache_page(page);
			set_buffer_uptodate(bh);
			err = 0;
			continue;
		}
		/* Failed to map the buffer, even after retrying. */
		if (!err)
			err = -EIO;
		bh->b_blocknr = -1;
		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
				"because its location on disk could not be "
				"determined%s (error code %i).", ni->mft_no,
				ni->type, (unsigned long long)vcn,
				vcn_ofs, is_retry ? " even after "
				"retrying" : "", err);
		break;
	} while (block++, (bh = bh->b_this_page) != head);

	/* Release the lock if we took it. */
	if (rl)
		up_read(&ni->runlist.lock);

	/* For the error case, need to reset bh to the beginning. */
	bh = head;

	/* Just an optimization, so ->readpage() is not called later. */
	if (unlikely(!PageUptodate(page))) {
		int uptodate = 1;
		do {
			if (!buffer_uptodate(bh)) {
				uptodate = 0;
				bh = head;
				break;
			}
		} while ((bh = bh->b_this_page) != head);
		if (uptodate)
			SetPageUptodate(page);
	}

	/* Setup all mapped, dirty buffers for async write i/o. */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh)) {
			lock_buffer(bh);
			if (test_clear_buffer_dirty(bh)) {
				BUG_ON(!buffer_uptodate(bh));
				mark_buffer_async_write(bh);
			} else
				unlock_buffer(bh);
		} else if (unlikely(err)) {
			/*
			 * For the error case. The buffer may have been set
			 * dirty during attachment to a dirty page.
			 */
			if (err != -ENOMEM)
				clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	if (unlikely(err)) {
		// TODO: Remove the -EOPNOTSUPP check later on...
		if (unlikely(err == -EOPNOTSUPP))
			err = 0;
		else if (err == -ENOMEM) {
			ntfs_warning(vol->sb, "Error allocating memory. "
					"Redirtying page so we try again "
					"later.");
			/*
			 * Put the page back on mapping->dirty_pages, but
			 * leave its buffers' dirty state as-is.
			 */
			redirty_page_for_writepage(wbc, page);
			err = 0;
		} else
			SetPageError(page);
	}

	BUG_ON(PageWriteback(page));
	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */

	/* Submit the prepared buffers for i/o. */
	need_end_writeback = TRUE;
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(WRITE, bh);
			need_end_writeback = FALSE;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	/* If no i/o was started, need to end_page_writeback(). */
	if (unlikely(need_end_writeback))
		end_page_writeback(page);

	ntfs_debug("Done.");
	return err;
}

/**
 * ntfs_write_mst_block - write a @page to the backing store
 * @page: page cache page to write out
 * @wbc: writeback control structure
 *
 * This function is for writing pages belonging to non-resident, mst protected
 * attributes to their backing store. The only supported attributes are index
 * allocation and $MFT/$DATA. Both directory inodes and index inodes are
 * supported for the index allocation case.
 *
 * The page must remain locked for the duration of the write because we apply
 * the mst fixups, write, and then undo the fixups, so if we were to unlock the
 * page before undoing the fixups, any other user of the page will see the
 * page contents as corrupt.
 *
 * We clear the page uptodate flag for the duration of the function to ensure
 * exclusion for the $MFT/$DATA case against someone mapping an mft record we
 * are about to apply the mst fixups to.
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_write_block(), ntfs_mft_writepage(), and
 * write_mft_record_nolock().
 */
static int ntfs_write_mst_block(struct page *page,
		struct writeback_control *wbc)
{
	sector_t block, dblock, rec_block;
	struct inode *vi = page->mapping->host;
	ntfs_inode *ni = NTFS_I(vi);
	ntfs_volume *vol = ni->vol;
	u8 *kaddr;
	unsigned int rec_size = ni->itype.index.block_size;
	ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
	runlist_element *rl;
	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
	unsigned bh_size, rec_size_bits;
	BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
	unsigned char bh_size_bits;

	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
			"0x%lx.", vi->i_ino, ni->type, page->index);
	BUG_ON(!NInoNonResident(ni));
	BUG_ON(!NInoMstProtected(ni));
	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
	/*
	 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
	 * in its page cache were to be marked dirty. However this should
	 * never happen with the current driver and considering we do not
	 * handle this case here we do want to BUG(), at least for now.
	 */
	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
	bh_size_bits = vi->i_blkbits;
	bh_size = 1 << bh_size_bits;
	max_bhs = PAGE_CACHE_SIZE / bh_size;
	BUG_ON(!max_bhs);
	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);

	/* Were we called for sync purposes? */
	sync = (wbc->sync_mode == WB_SYNC_ALL);

	/* Make sure we have mapped buffers. */
	bh = head = page_buffers(page);
	BUG_ON(!bh);

	rec_size_bits = ni->itype.index.block_size_bits;
	BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
	bhs_per_rec = rec_size >> bh_size_bits;
	BUG_ON(!bhs_per_rec);
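	/*
	 * Example with hypothetical sizes: 1024-byte ntfs records on a
	 * device with 512-byte blocks give bhs_per_rec = 2, i.e. every
	 * record spans two buffer heads and only the first one marks the
	 * start of a record.
	 */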

	/* The first block in the page. */
	rec_block = block = (sector_t)page->index <<
			(PAGE_CACHE_SHIFT - bh_size_bits);

	/* The first out of bounds block for the data size. */
	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;

	rl = NULL;
	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
	page_is_dirty = rec_is_dirty = FALSE;
	rec_start_bh = NULL;
	do {
		BOOL is_retry = FALSE;

		if (likely(block < rec_block)) {
			if (unlikely(block >= dblock)) {
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * This block is not the first one in the record. We
			 * ignore the buffer's dirty state because we could
			 * have raced with a parallel mark_ntfs_record_dirty().
			 */
			if (!rec_is_dirty)
				continue;
			if (unlikely(err2)) {
				if (err2 != -ENOMEM)
					clear_buffer_dirty(bh);
				continue;
			}
		} else /* if (block == rec_block) */ {
			BUG_ON(block > rec_block);
			/* This block is the first one in the record. */
			rec_block += bhs_per_rec;
			err2 = 0;
			if (unlikely(block >= dblock)) {
				clear_buffer_dirty(bh);
				continue;
			}
			if (!buffer_dirty(bh)) {
				/* Clean records are not written out. */
				rec_is_dirty = FALSE;
				continue;
			}
			rec_is_dirty = TRUE;
			rec_start_bh = bh;
		}
		/* Need to map the buffer if it is not mapped already. */
		if (unlikely(!buffer_mapped(bh))) {
			VCN vcn;
			LCN lcn;
			unsigned int vcn_ofs;

			bh->b_bdev = vol->sb->s_bdev;
			/* Obtain the vcn and offset of the current block. */
			vcn = (VCN)block << bh_size_bits;
			vcn_ofs = vcn & vol->cluster_size_mask;
			vcn >>= vol->cluster_size_bits;
			if (!rl) {
lock_retry_remap:
				down_read(&ni->runlist.lock);
				rl = ni->runlist.rl;
			}
			if (likely(rl != NULL)) {
				/* Seek to element containing target vcn. */
				while (rl->length && rl[1].vcn <= vcn)
					rl++;
				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
			} else
				lcn = LCN_RL_NOT_MAPPED;
			/* Successful remap. */
			if (likely(lcn >= 0)) {
				/* Setup buffer head to correct block. */
				bh->b_blocknr = ((lcn <<
						vol->cluster_size_bits) +
						vcn_ofs) >> bh_size_bits;
				set_buffer_mapped(bh);
			} else {
				/*
				 * Remap failed. Retry to map the runlist once
				 * unless we are working on $MFT which always
				 * has the whole of its runlist in memory.
				 */
				if (!is_mft && !is_retry &&
						lcn == LCN_RL_NOT_MAPPED) {
					is_retry = TRUE;
					/*
					 * Attempt to map runlist, dropping
					 * lock for the duration.
					 */
					up_read(&ni->runlist.lock);
					err2 = ntfs_map_runlist(ni, vcn);
					if (likely(!err2))
						goto lock_retry_remap;
					if (err2 == -ENOMEM)
						page_is_dirty = TRUE;
					lcn = err2;
				} else {
					err2 = -EIO;
					if (!rl)
						up_read(&ni->runlist.lock);
				}
				/* Hard error. Abort writing this record. */
				if (!err || err == -ENOMEM)
					err = err2;
				bh->b_blocknr = -1;
				ntfs_error(vol->sb, "Cannot write ntfs record "
						"0x%llx (inode 0x%lx, "
						"attribute type 0x%x) because "
						"its location on disk could "
						"not be determined (error "
						"code %lli).",
						(long long)block <<
						bh_size_bits >>
						vol->mft_record_size_bits,
						ni->mft_no, ni->type,
						(long long)lcn);
				/*
				 * If this is not the first buffer, remove the
				 * buffers in this record from the list of
				 * buffers to write and clear their dirty bit
				 * if not error -ENOMEM.
				 */
				if (rec_start_bh != bh) {
					while (bhs[--nr_bhs] != rec_start_bh)
						;
					if (err2 != -ENOMEM) {
						do {
							clear_buffer_dirty(
								rec_start_bh);
						} while ((rec_start_bh =
								rec_start_bh->
								b_this_page) !=
								bh);
					}
				}
				continue;
			}
		}
		BUG_ON(!buffer_uptodate(bh));
		BUG_ON(nr_bhs >= max_bhs);
		bhs[nr_bhs++] = bh;
	} while (block++, (bh = bh->b_this_page) != head);
	if (unlikely(rl))
		up_read(&ni->runlist.lock);
	/* If there were no dirty buffers, we are done. */
	if (!nr_bhs)
		goto done;
	/* Map the page so we can access its contents. */
	kaddr = kmap(page);
	/* Clear the page uptodate flag whilst the mst fixups are applied. */
	BUG_ON(!PageUptodate(page));
	ClearPageUptodate(page);
	for (i = 0; i < nr_bhs; i++) {
		unsigned int ofs;

		/* Skip buffers which are not at the beginning of records. */
		if (i % bhs_per_rec)
			continue;
		tbh = bhs[i];
		ofs = bh_offset(tbh);
		if (is_mft) {
			ntfs_inode *tni;
			unsigned long mft_no;

			/* Get the mft record number. */
			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
					>> rec_size_bits;
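			/*
			 * E.g. (hypothetical sizes) with 4096-byte pages and
			 * 1024-byte mft records, page->index 3 and ofs 2048
			 * give mft_no = (3 * 4096 + 2048) >> 10 = 14.
			 */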
			/* Check whether to write this mft record. */
			tni = NULL;
			if (!ntfs_may_write_mft_record(vol, mft_no,
					(MFT_RECORD*)(kaddr + ofs), &tni)) {
				/*
				 * The record should not be written. This
				 * means we need to redirty the page before
				 * returning.
				 */
				page_is_dirty = TRUE;
				/*
				 * Remove the buffers in this mft record from
				 * the list of buffers to write.
				 */
				do {
					bhs[i] = NULL;
				} while (++i % bhs_per_rec);
				continue;
			}
			/*
			 * The record should be written. If a locked ntfs
			 * inode was returned, add it to the array of locked
			 * ntfs inodes.
			 */
			if (tni)
				locked_nis[nr_locked_nis++] = tni;
		}
		/* Apply the mst protection fixups. */
		err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
				rec_size);
		if (unlikely(err2)) {
			if (!err || err == -ENOMEM)
				err = -EIO;
			ntfs_error(vol->sb, "Failed to apply mst fixups "
					"(inode 0x%lx, attribute type 0x%x, "
					"page index 0x%lx, page offset 0x%x)!"
					" Unmount and run chkdsk.", vi->i_ino,
					ni->type, page->index, ofs);
			/*
			 * Mark all the buffers in this record clean as we do
			 * not want to write corrupt data to disk.
			 */
			do {
				clear_buffer_dirty(bhs[i]);
				bhs[i] = NULL;
			} while (++i % bhs_per_rec);
			continue;
		}
		nr_recs++;
	}
	/* If no records are to be written out, we are done. */
	if (!nr_recs)
		goto unm_done;
	flush_dcache_page(page);
	/* Lock buffers and start synchronous write i/o on them. */
	for (i = 0; i < nr_bhs; i++) {
		tbh = bhs[i];
		if (!tbh)
			continue;
		if (unlikely(test_set_buffer_locked(tbh)))
			BUG();
		/* The buffer dirty state is now irrelevant, just clean it. */
		clear_buffer_dirty(tbh);
		BUG_ON(!buffer_uptodate(tbh));
		BUG_ON(!buffer_mapped(tbh));
		get_bh(tbh);
		tbh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, tbh);
	}
	/* Synchronize the mft mirror now if not @sync. */
	if (is_mft && !sync)
		goto do_mirror;
do_wait:
	/* Wait on i/o completion of buffers. */
	for (i = 0; i < nr_bhs; i++) {
		tbh = bhs[i];
		if (!tbh)
			continue;
		wait_on_buffer(tbh);
		if (unlikely(!buffer_uptodate(tbh))) {
			ntfs_error(vol->sb, "I/O error while writing ntfs "
					"record buffer (inode 0x%lx, "
					"attribute type 0x%x, page index "
					"0x%lx, page offset 0x%lx)! Unmount "
					"and run chkdsk.", vi->i_ino, ni->type,
					page->index, bh_offset(tbh));
			if (!err || err == -ENOMEM)
				err = -EIO;
			/*
			 * Set the buffer uptodate so the page and buffer
			 * states do not become out of sync.
			 */
			set_buffer_uptodate(tbh);
		}
	}
	/* If @sync, now synchronize the mft mirror. */
	if (is_mft && sync) {
do_mirror:
		for (i = 0; i < nr_bhs; i++) {
			unsigned long mft_no;
			unsigned int ofs;

			/*
			 * Skip buffers which are not at the beginning of
			 * records.
			 */
			if (i % bhs_per_rec)
				continue;
			tbh = bhs[i];
			/* Skip removed buffers (and hence records). */
			if (!tbh)
				continue;
			ofs = bh_offset(tbh);
			/* Get the mft record number. */
			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
					>> rec_size_bits;
			if (mft_no < vol->mftmirr_size)
				ntfs_sync_mft_mirror(vol, mft_no,
						(MFT_RECORD*)(kaddr + ofs),
						sync);
		}
		if (!sync)
			goto do_wait;
	}
	/* Remove the mst protection fixups again. */
	for (i = 0; i < nr_bhs; i++) {
		if (!(i % bhs_per_rec)) {
			tbh = bhs[i];
			if (!tbh)
				continue;
			post_write_mst_fixup((NTFS_RECORD*)(kaddr +
					bh_offset(tbh)));
		}
	}
	flush_dcache_page(page);
unm_done:
	/* Unlock any locked inodes. */
	while (nr_locked_nis-- > 0) {
		ntfs_inode *tni, *base_tni;

		tni = locked_nis[nr_locked_nis];
		/* Get the base inode. */
		down(&tni->extent_lock);
		if (tni->nr_extents >= 0)
			base_tni = tni;
		else {
			base_tni = tni->ext.base_ntfs_ino;
			BUG_ON(!base_tni);
		}
		up(&tni->extent_lock);
		ntfs_debug("Unlocking %s inode 0x%lx.",
				tni == base_tni ? "base" : "extent",
				tni->mft_no);
		up(&tni->mrec_lock);
		atomic_dec(&tni->count);
		iput(VFS_I(base_tni));
	}
	SetPageUptodate(page);
	kunmap(page);
done:
	if (unlikely(err && err != -ENOMEM)) {
		/*
		 * Set page error if there is only one ntfs record in the page.
		 * Otherwise we would lose per-record granularity.
		 */
		if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
			SetPageError(page);
		NVolSetErrors(vol);
	}
	if (page_is_dirty) {
		ntfs_debug("Page still contains one or more dirty ntfs "
				"records. Redirtying the page starting at "
				"record 0x%lx.", page->index <<
				(PAGE_CACHE_SHIFT - rec_size_bits));
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
	} else {
		/*
		 * Keep the VM happy. This must be done otherwise the
		 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
		 * the page is clean.
		 */
		BUG_ON(PageWriteback(page));
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
	}
	if (likely(!err))
		ntfs_debug("Done.");
	return err;
}

/**
 * ntfs_writepage - write a @page to the backing store
 * @page: page cache page to write out
 * @wbc: writeback control structure
 *
 * This is called from the VM when it wants to have a dirty ntfs page cache
 * page cleaned. The VM has already locked the page and marked it clean.
 *
 * For non-resident attributes, ntfs_writepage() writes the @page by calling
 * the ntfs version of the generic block_write_full_page() function,
 * ntfs_write_block(), which in turn if necessary creates and writes the
 * buffers associated with the page asynchronously.
 *
 * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
 * the data to the mft record (which at this stage is most likely in memory).
 * The mft record is then marked dirty and written out asynchronously via the
 * vfs inode dirty code path for the inode the mft record belongs to or via the
 * vm page dirty code path for the page the mft record is in.
 *
 * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
{
	loff_t i_size;
	struct inode *vi = page->mapping->host;
	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
	char *kaddr;
	ntfs_attr_search_ctx *ctx = NULL;
	MFT_RECORD *m = NULL;
	u32 attr_len;
	int err;

retry_writepage:
	BUG_ON(!PageLocked(page));
	i_size = i_size_read(vi);
	/* Is the page fully outside i_size? (truncate in progress) */
	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
			PAGE_CACHE_SHIFT)) {
		/*
		 * The page may have dirty, unmapped buffers. Make them
		 * freeable here, so the page does not leak.
		 */
		block_invalidatepage(page, 0);
		unlock_page(page);
		ntfs_debug("Write outside i_size - truncated?");
		return 0;
	}
	/*
	 * Only $DATA attributes can be encrypted and only unnamed $DATA
	 * attributes can be compressed. Index root can have the flags set but
	 * this means to create compressed/encrypted files, not that the
	 * attribute is compressed/encrypted.
	 */
	if (ni->type != AT_INDEX_ROOT) {
		/* If file is encrypted, deny access, just like NT4. */
		if (NInoEncrypted(ni)) {
			unlock_page(page);
			BUG_ON(ni->type != AT_DATA);
			ntfs_debug("Denying write access to encrypted "
					"file.");
			return -EACCES;
		}
		/* Compressed data streams are handled in compress.c. */
		if (NInoNonResident(ni) && NInoCompressed(ni)) {
			BUG_ON(ni->type != AT_DATA);
			BUG_ON(ni->name_len);
			// TODO: Implement and replace this with
			// return ntfs_write_compressed_block(page);
			unlock_page(page);
			ntfs_error(vi->i_sb, "Writing to compressed files is "
					"not supported yet. Sorry.");
			return -EOPNOTSUPP;
		}
		// TODO: Implement and remove this check.
		if (NInoNonResident(ni) && NInoSparse(ni)) {
			unlock_page(page);
			ntfs_error(vi->i_sb, "Writing to sparse files is not "
					"supported yet. Sorry.");
			return -EOPNOTSUPP;
		}
	}
	/* NInoNonResident() == NInoIndexAllocPresent() */
	if (NInoNonResident(ni)) {
		/* We have to zero every time due to mmap-at-end-of-file. */
		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
			/* The page straddles i_size. */
			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
			kaddr = kmap_atomic(page, KM_USER0);
			memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
		}
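		/*
		 * Illustration with hypothetical sizes: i_size = 5000 and
		 * 4096-byte pages means page 1 straddles i_size, so ofs =
		 * 5000 & 4095 = 904 and bytes 904-4095 of the page are
		 * zeroed before writeout.
		 */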
		/* Handle mst protected attributes. */
		if (NInoMstProtected(ni))
			return ntfs_write_mst_block(page, wbc);
		/* Normal, non-resident data stream. */
		return ntfs_write_block(page, wbc);
	}
	/*
	 * Attribute is resident, implying it is not compressed, encrypted, or
	 * mst protected. This also means the attribute is smaller than an mft
	 * record and hence smaller than a page, so can simply return error on
	 * any pages with index above 0. Note the attribute can actually be
	 * marked compressed but if it is resident the actual data is not
	 * compressed so we are ok to ignore the compressed flag here.
	 */
	BUG_ON(page_has_buffers(page));
	BUG_ON(!PageUptodate(page));
	if (unlikely(page->index > 0)) {
		ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
				"Aborting write.", page->index);
		BUG_ON(PageWriteback(page));
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		return -EIO;
	}
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Map, pin, and lock the mft record. */
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		ctx = NULL;
		goto err_out;
	}
	/*
	 * If a parallel write made the attribute non-resident, drop the mft
	 * record and retry the writepage.
	 */
	if (unlikely(NInoNonResident(ni))) {
		unmap_mft_record(base_ni);
		goto retry_writepage;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err))
		goto err_out;
	/*
	 * Keep the VM happy. This must be done otherwise the radix-tree tag
	 * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	/*
	 * Here, we do not need to zero the out of bounds area every time
	 * because the below memcpy() already takes care of the
	 * mmap-at-end-of-file requirements. If the file is converted to a
	 * non-resident one, then the code path used is switched to the
	 * non-resident one where the zeroing happens on each ntfs_writepage()
	 * invocation.
	 */
	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
	i_size = i_size_read(vi);
	if (unlikely(attr_len > i_size)) {
		attr_len = i_size;
		ctx->attr->data.resident.value_length = cpu_to_le32(attr_len);
	}
	kaddr = kmap_atomic(page, KM_USER0);
	/* Copy the data from the page to the mft record. */
	memcpy((u8*)ctx->attr +
			le16_to_cpu(ctx->attr->data.resident.value_offset),
			kaddr, attr_len);
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	/* Zero out of bounds area in the page cache page. */
	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

	end_page_writeback(page);

	/* Mark the mft record dirty, so it gets written back. */
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(base_ni);
	return 0;
err_out:
	if (err == -ENOMEM) {
		ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
				"page so we try again later.");
		/*
		 * Put the page back on mapping->dirty_pages, but leave its
		 * buffers' dirty state as-is.
		 */
		redirty_page_for_writepage(wbc, page);
		err = 0;
	} else {
		ntfs_error(vi->i_sb, "Resident attribute write failed with "
				"error %i.", err);
		SetPageError(page);
		NVolSetErrors(ni->vol);
		make_bad_inode(vi);
	}
	unlock_page(page);
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	return err;
}

/**
 * ntfs_prepare_nonresident_write -
 *
 */
static int ntfs_prepare_nonresident_write(struct page *page,
		unsigned from, unsigned to)
{
	VCN vcn;
	LCN lcn;
	s64 initialized_size;
	loff_t i_size;
	sector_t block, ablock, iblock;
	struct inode *vi;
	ntfs_inode *ni;
	ntfs_volume *vol;
	runlist_element *rl;
	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
	unsigned long flags;
	unsigned int vcn_ofs, block_start, block_end, blocksize;
	int err;
	BOOL is_retry;
	unsigned char blocksize_bits;

	vi = page->mapping->host;
	ni = NTFS_I(vi);
	vol = ni->vol;

	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
			"0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
			page->index, from, to);

	BUG_ON(!NInoNonResident(ni));

	blocksize_bits = vi->i_blkbits;
	blocksize = 1 << blocksize_bits;

	/*
	 * create_empty_buffers() will create uptodate/dirty buffers if the
	 * page is uptodate/dirty.
	 */
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	bh = head = page_buffers(page);
	if (unlikely(!bh))
		return -ENOMEM;

	/* The first block in the page. */
	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);

	read_lock_irqsave(&ni->size_lock, flags);
	/*
	 * The first out of bounds block for the allocated size. No need to
	 * round up as allocated_size is in multiples of cluster size and the
	 * minimum cluster size is 512 bytes, which is equal to the smallest
	 * blocksize.
	 */
	ablock = ni->allocated_size >> blocksize_bits;
	i_size = i_size_read(vi);
	initialized_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);

	/* The last (fully or partially) initialized block. */
	iblock = initialized_size >> blocksize_bits;

	/* Loop through all the buffers in the page. */
	block_start = 0;
	rl = NULL;
	err = 0;
	do {
		block_end = block_start + blocksize;
		/*
		 * If buffer @bh is outside the write, just mark it uptodate
		 * if the page is uptodate and continue with the next buffer.
		 */
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		/*
		 * @bh is at least partially being written to.
		 * Make sure it is not marked as new.
		 */
		//if (buffer_new(bh))
		//	clear_buffer_new(bh);

		if (block >= ablock) {
			// TODO: block is above allocated_size, need to
			// allocate it. Best done in one go to accommodate not
			// only block but all above blocks up to and including:
			// ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
			// - 1) >> blocksize_bits. Obviously will need to round
			// up to next cluster boundary, too. This should be
			// done with a helper function, so it can be reused.
			ntfs_error(vol->sb, "Writing beyond allocated size "
					"is not supported yet. Sorry.");
			err = -EOPNOTSUPP;
			goto err_out;
			// Need to update ablock.
			// Need to set_buffer_new() on all block bhs that are
			// newly allocated.
		}
		/*
		 * Now we have enough allocated size to fulfill the whole
		 * request, i.e. block < ablock is true.
		 */
		if (unlikely((block >= iblock) &&
				(initialized_size < i_size))) {
			/*
			 * If this page is fully outside initialized size, zero
			 * out all pages between the current initialized size
			 * and the current page. Just use ntfs_readpage() to do
			 * the zeroing transparently.
			 */
			if (block > iblock) {
				// TODO:
				// For each page do:
				// - read_cache_page()
				// Again for each page do:
				// - wait_on_page_locked()
				// - Check (PageUptodate(page) &&
				//		!PageError(page))
				// Update initialized size in the attribute and
				// in the inode.
				// Again, for each page do:
				//	__set_page_dirty_buffers();
				//	page_cache_release()
				// We don't need to wait on the writes.
				// Update iblock.
			}
			/*
			 * The current page straddles initialized size. Zero
			 * all non-uptodate buffers and set them uptodate (and
			 * dirty?). Note, there aren't any non-uptodate buffers
			 * if the page is uptodate.
			 * FIXME: For an uptodate page, the buffers may need to
			 * be written out because they were not initialized on
			 * disk before.
			 */
			if (!PageUptodate(page)) {
				// TODO:
				// Zero any non-uptodate buffers up to i_size.
				// Set them uptodate and dirty.
			}
			// TODO:
			// Update initialized size in the attribute and in the
			// inode (up to i_size).
			// Update iblock.
			// FIXME: This is inefficient. Try to batch the two
			// size changes to happen in one go.
			ntfs_error(vol->sb, "Writing beyond initialized size "
					"is not supported yet. Sorry.");
			err = -EOPNOTSUPP;
			goto err_out;
			// Do NOT set_buffer_new() BUT DO clear buffer range
			// outside write request range.
			// set_buffer_uptodate() on complete buffers as well as
			// set_buffer_dirty().
		}
1660
1661 /* Need to map unmapped buffers. */
1662 if (!buffer_mapped(bh)) {
1663 /* Unmapped buffer. Need to map it. */
1664 bh->b_bdev = vol->sb->s_bdev;
1665
1666 /* Convert block into corresponding vcn and offset. */
1667 vcn = (VCN)block << blocksize_bits >>
1668 vol->cluster_size_bits;
1669 vcn_ofs = ((VCN)block << blocksize_bits) &
1670 vol->cluster_size_mask;
1671
1672 is_retry = FALSE;
1673 if (!rl) {
1674lock_retry_remap:
1675 down_read(&ni->runlist.lock);
1676 rl = ni->runlist.rl;
1677 }
1678 if (likely(rl != NULL)) {
1679 /* Seek to element containing target vcn. */
1680 while (rl->length && rl[1].vcn <= vcn)
1681 rl++;
1682 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1683 } else
1684 lcn = LCN_RL_NOT_MAPPED;
1685 if (unlikely(lcn < 0)) {
1686 /*
1687 * We extended the attribute allocation above.
1688 * If we hit an ENOENT here it means that the
1689 * allocation was insufficient which is a bug.
1690 */
1691 BUG_ON(lcn == LCN_ENOENT);
1692
1693 /* It is a hole, need to instantiate it. */
1694 if (lcn == LCN_HOLE) {
1695 // TODO: Instantiate the hole.
1696 // clear_buffer_new(bh);
1697 // unmap_underlying_metadata(bh->b_bdev,
1698 // bh->b_blocknr);
1699 // For non-uptodate buffers, need to
1700 // zero out the region outside the
1701 // request in this bh or all bhs,
1702 // depending on what we implemented
1703 // above.
1704 // Need to flush_dcache_page().
1705 // Or could use set_buffer_new()
1706 // instead?
1707 ntfs_error(vol->sb, "Writing into "
1708 "sparse regions is "
1709 "not supported yet. "
1710 "Sorry.");
1711 err = -EOPNOTSUPP;
9f993fe4
AA
1712 if (!rl)
1713 up_read(&ni->runlist.lock);
1da177e4
LT
1714 goto err_out;
1715 } else if (!is_retry &&
1716 lcn == LCN_RL_NOT_MAPPED) {
1717 is_retry = TRUE;
1718 /*
1719 * Attempt to map runlist, dropping
1720 * lock for the duration.
1721 */
1722 up_read(&ni->runlist.lock);
1723 err = ntfs_map_runlist(ni, vcn);
1724 if (likely(!err))
1725 goto lock_retry_remap;
1726 rl = NULL;
1727 lcn = err;
9f993fe4
AA
1728 } else if (!rl)
1729 up_read(&ni->runlist.lock);
1da177e4
LT
1730 /*
1731 * Failed to map the buffer, even after
1732 * retrying.
1733 */
1734 bh->b_blocknr = -1;
1735 ntfs_error(vol->sb, "Failed to write to inode "
1736 "0x%lx, attribute type 0x%x, "
1737 "vcn 0x%llx, offset 0x%x "
1738 "because its location on disk "
1739 "could not be determined%s "
1740 "(error code %lli).",
1741 ni->mft_no, ni->type,
1742 (unsigned long long)vcn,
1743 vcn_ofs, is_retry ? " even "
1744 "after retrying" : "",
1745 (long long)lcn);
1746 if (!err)
1747 err = -EIO;
1748 goto err_out;
1749 }
1750 /* We now have a successful remap, i.e. lcn >= 0. */
1751
1752 /* Setup buffer head to correct block. */
1753 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
1754 + vcn_ofs) >> blocksize_bits;
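 /*
  * Example (continuing the illustrative values above): lcn = 100
  * maps the buffer to b_blocknr = ((100 << 12) + 1024) >> 9 = 802.
  */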
1755 set_buffer_mapped(bh);
1756
1757 // FIXME: Something analogous to this is needed for
1758 // each newly allocated block, i.e. BH_New.
1759 // FIXME: Might need to take this out of the
1760 // if (!buffer_mapped(bh)) {}, depending on how we
1761 // implement things during the allocated_size and
1762 // initialized_size extension code above.
1763 if (buffer_new(bh)) {
1764 clear_buffer_new(bh);
1765 unmap_underlying_metadata(bh->b_bdev,
1766 bh->b_blocknr);
1767 if (PageUptodate(page)) {
1768 set_buffer_uptodate(bh);
1769 continue;
1770 }
1771 /*
1772 * Page is _not_ uptodate, zero surrounding
1773 * region. NOTE: This is how we decide if to
1774 * zero or not!
1775 */
1776 if (block_end > to || block_start < from) {
1777 void *kaddr;
1778
1779 kaddr = kmap_atomic(page, KM_USER0);
1780 if (block_end > to)
1781 memset(kaddr + to, 0,
1782 block_end - to);
1783 if (block_start < from)
1784 memset(kaddr + block_start, 0,
1785 from -
1786 block_start);
1787 flush_dcache_page(page);
1788 kunmap_atomic(kaddr, KM_USER0);
1789 }
1790 continue;
1791 }
1792 }
1793 /* @bh is mapped, set it uptodate if the page is uptodate. */
1794 if (PageUptodate(page)) {
1795 if (!buffer_uptodate(bh))
1796 set_buffer_uptodate(bh);
1797 continue;
1798 }
1799 /*
1800 * The page is not uptodate. The buffer is mapped. If it is not
1801 * uptodate, and it is only partially being written to, we need
1802 * to read the buffer in before the write, i.e. right now.
1803 */
1804 if (!buffer_uptodate(bh) &&
1805 (block_start < from || block_end > to)) {
1806 ll_rw_block(READ, 1, &bh);
1807 *wait_bh++ = bh;
1808 }
1809 } while (block++, block_start = block_end,
1810 (bh = bh->b_this_page) != head);
1811
1812 /* Release the lock if we took it. */
1813 if (rl) {
1814 up_read(&ni->runlist.lock);
1815 rl = NULL;
1816 }
1817
1818 /* If we issued read requests, let them complete. */
1819 while (wait_bh > wait) {
1820 wait_on_buffer(*--wait_bh);
1821 if (!buffer_uptodate(*wait_bh))
1822 return -EIO;
1823 }
1824
1825 ntfs_debug("Done.");
1826 return 0;
1827err_out:
1828 /*
1829 * Zero out any newly allocated blocks to avoid exposing stale data.
1830 * If BH_New is set, we know that the block was newly allocated in the
1831 * above loop.
1832 * FIXME: What about initialized_size increments? Have we done all the
 1833 * required zeroing above? If not, this error handling is broken, and
1834 * in particular the if (block_end <= from) check is completely bogus.
1835 */
1836 bh = head;
1837 block_start = 0;
1838 is_retry = FALSE;
1839 do {
1840 block_end = block_start + blocksize;
1841 if (block_end <= from)
1842 continue;
1843 if (block_start >= to)
1844 break;
1845 if (buffer_new(bh)) {
1846 void *kaddr;
1847
1848 clear_buffer_new(bh);
1849 kaddr = kmap_atomic(page, KM_USER0);
1850 memset(kaddr + block_start, 0, bh->b_size);
1851 kunmap_atomic(kaddr, KM_USER0);
1852 set_buffer_uptodate(bh);
1853 mark_buffer_dirty(bh);
1854 is_retry = TRUE;
1855 }
1856 } while (block_start = block_end, (bh = bh->b_this_page) != head);
1857 if (is_retry)
1858 flush_dcache_page(page);
1859 if (rl)
1860 up_read(&ni->runlist.lock);
1861 return err;
1862}
1863
1864/**
1865 * ntfs_prepare_write - prepare a page for receiving data
1866 *
1867 * This is called from generic_file_write() with i_sem held on the inode
1868 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
1869 * data has not yet been copied into the @page.
1870 *
1871 * Need to extend the attribute/fill in holes if necessary, create blocks and
 1872 * make partially overwritten blocks uptodate.
1873 *
1874 * i_size is not to be modified yet.
1875 *
1876 * Return 0 on success or -errno on error.
1877 *
1878 * Should be using block_prepare_write() [support for sparse files] or
1879 * cont_prepare_write() [no support for sparse files]. Cannot do that due to
1880 * ntfs specifics but can look at them for implementation guidance.
1881 *
1882 * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
1883 * the first byte in the page that will be written to and @to is the first byte
1884 * after the last byte that will be written to.
1885 */
1886static int ntfs_prepare_write(struct file *file, struct page *page,
1887 unsigned from, unsigned to)
1888{
1889 s64 new_size;
 1890 loff_t i_size;
1891 struct inode *vi = page->mapping->host;
1892 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1893 ntfs_volume *vol = ni->vol;
1894 ntfs_attr_search_ctx *ctx = NULL;
1895 MFT_RECORD *m = NULL;
1896 ATTR_RECORD *a;
1897 u8 *kaddr;
1898 u32 attr_len;
1899 int err;
1900
1901 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1902 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1903 page->index, from, to);
1904 BUG_ON(!PageLocked(page));
1905 BUG_ON(from > PAGE_CACHE_SIZE);
1906 BUG_ON(to > PAGE_CACHE_SIZE);
1907 BUG_ON(from > to);
1908 BUG_ON(NInoMstProtected(ni));
1909 /*
1910 * If a previous ntfs_truncate() failed, repeat it and abort if it
1911 * fails again.
1912 */
1913 if (unlikely(NInoTruncateFailed(ni))) {
1914 down_write(&vi->i_alloc_sem);
1915 err = ntfs_truncate(vi);
1916 up_write(&vi->i_alloc_sem);
1917 if (err || NInoTruncateFailed(ni)) {
1918 if (!err)
1919 err = -EIO;
1920 goto err_out;
1921 }
1922 }
1923 /* If the attribute is not resident, deal with it elsewhere. */
1924 if (NInoNonResident(ni)) {
1925 /*
1926 * Only unnamed $DATA attributes can be compressed, encrypted,
1927 * and/or sparse.
1928 */
1929 if (ni->type == AT_DATA && !ni->name_len) {
1930 /* If file is encrypted, deny access, just like NT4. */
1931 if (NInoEncrypted(ni)) {
1932 ntfs_debug("Denying write access to encrypted "
1933 "file.");
1934 return -EACCES;
1935 }
1936 /* Compressed data streams are handled in compress.c. */
1937 if (NInoCompressed(ni)) {
1938 // TODO: Implement and replace this check with
1939 // return ntfs_write_compressed_block(page);
1940 ntfs_error(vi->i_sb, "Writing to compressed "
1941 "files is not supported yet. "
1942 "Sorry.");
1943 return -EOPNOTSUPP;
1944 }
1945 // TODO: Implement and remove this check.
1946 if (NInoSparse(ni)) {
1947 ntfs_error(vi->i_sb, "Writing to sparse files "
1948 "is not supported yet. Sorry.");
1949 return -EOPNOTSUPP;
1950 }
1951 }
1952 /* Normal data stream. */
1953 return ntfs_prepare_nonresident_write(page, from, to);
1954 }
1955 /*
1956 * Attribute is resident, implying it is not compressed, encrypted, or
1957 * sparse.
1958 */
1959 BUG_ON(page_has_buffers(page));
1960 new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
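 /*
  * Example (assuming 4096-byte pages, i.e. PAGE_CACHE_SHIFT = 12): a
  * write ending at byte 700 of page 0 gives new_size = 700. For
  * resident attributes the page index is always 0, see the BUG_ON()
  * further below.
  */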
1961 /* If we do not need to resize the attribute allocation we are done. */
 1962 if (new_size <= i_size_read(vi))
 1963 goto done;
1964 /* Map, pin, and lock the (base) mft record. */
1965 if (!NInoAttr(ni))
1966 base_ni = ni;
1967 else
1968 base_ni = ni->ext.base_ntfs_ino;
1969 m = map_mft_record(base_ni);
1970 if (IS_ERR(m)) {
1971 err = PTR_ERR(m);
1972 m = NULL;
1973 ctx = NULL;
1974 goto err_out;
1975 }
1976 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1977 if (unlikely(!ctx)) {
1978 err = -ENOMEM;
1979 goto err_out;
1980 }
1981 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1982 CASE_SENSITIVE, 0, NULL, 0, ctx);
1983 if (unlikely(err)) {
1984 if (err == -ENOENT)
1985 err = -EIO;
1986 goto err_out;
1987 }
1988 m = ctx->mrec;
1989 a = ctx->attr;
1990 /* The total length of the attribute value. */
1991 attr_len = le32_to_cpu(a->data.resident.value_length);
 1992 /* Fix up a possible earlier failure of ntfs_commit_write(). */
1993 i_size = i_size_read(vi);
1994 if (unlikely(attr_len > i_size)) {
1995 attr_len = i_size;
 1996 a->data.resident.value_length = cpu_to_le32(attr_len);
 1997 }
1998 /* If we do not need to resize the attribute allocation we are done. */
1999 if (new_size <= attr_len)
2000 goto done_unm;
2001 /* Check if new size is allowed in $AttrDef. */
2002 err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
2003 if (unlikely(err)) {
2004 if (err == -ERANGE) {
2005 ntfs_error(vol->sb, "Write would cause the inode "
2006 "0x%lx to exceed the maximum size for "
2007 "its attribute type (0x%x). Aborting "
2008 "write.", vi->i_ino,
2009 le32_to_cpu(ni->type));
2010 } else {
2011 ntfs_error(vol->sb, "Inode 0x%lx has unknown "
2012 "attribute type 0x%x. Aborting "
2013 "write.", vi->i_ino,
2014 le32_to_cpu(ni->type));
2015 err = -EIO;
2016 }
2017 goto err_out2;
2018 }
2019 /*
2020 * Extend the attribute record to be able to store the new attribute
2021 * size.
2022 */
2023 if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
2024 le16_to_cpu(a->data.resident.value_offset) +
2025 new_size)) {
2026 /* Not enough space in the mft record. */
2027 ntfs_error(vol->sb, "Not enough space in the mft record for "
2028 "the resized attribute value. This is not "
2029 "supported yet. Aborting write.");
2030 err = -EOPNOTSUPP;
2031 goto err_out2;
2032 }
2033 /*
2034 * We have enough space in the mft record to fit the write. This
2035 * implies the attribute is smaller than the mft record and hence the
2036 * attribute must be in a single page and hence page->index must be 0.
2037 */
2038 BUG_ON(page->index);
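 /*
  * For example, with the common 1024-byte mft records and 4096-byte
  * pages, a resident attribute value is always smaller than 1024
  * bytes and thus always fits inside page 0.
  */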
2039 /*
2040 * If the beginning of the write is past the old size, enlarge the
2041 * attribute value up to the beginning of the write and fill it with
2042 * zeroes.
2043 */
2044 if (from > attr_len) {
2045 memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
2046 attr_len, 0, from - attr_len);
2047 a->data.resident.value_length = cpu_to_le32(from);
2048 /* Zero the corresponding area in the page as well. */
2049 if (PageUptodate(page)) {
2050 kaddr = kmap_atomic(page, KM_USER0);
2051 memset(kaddr + attr_len, 0, from - attr_len);
2052 kunmap_atomic(kaddr, KM_USER0);
2053 flush_dcache_page(page);
2054 }
2055 }
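 /*
  * Example (illustrative values only): attr_len = 200 and from = 500
  * zeroes the 300 bytes at offsets 200-499 both in the attribute
  * value and, if the page is uptodate, in the page itself.
  */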
2056 flush_dcache_mft_record_page(ctx->ntfs_ino);
2057 mark_mft_record_dirty(ctx->ntfs_ino);
 2058done_unm:
2059 ntfs_attr_put_search_ctx(ctx);
2060 unmap_mft_record(base_ni);
2061 /*
2062 * Because resident attributes are handled by memcpy() to/from the
2063 * corresponding MFT record, and because this form of i/o is byte
2064 * aligned rather than block aligned, there is no need to bring the
2065 * page uptodate here as in the non-resident case where we need to
2066 * bring the buffers straddled by the write uptodate before
2067 * generic_file_write() does the copying from userspace.
2068 *
 2069 * We thus defer bringing the page region outside the written region
 2070 * uptodate to ntfs_commit_write(), which makes the code simpler and
 2071 * saves one atomic kmap.
2072 */
2073done:
2074 ntfs_debug("Done.");
2075 return 0;
2076err_out:
2077 if (err == -ENOMEM)
2078 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2079 "prepare the write.");
2080 else {
2081 ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
2082 "with error %i.", err);
2083 NVolSetErrors(vol);
2084 make_bad_inode(vi);
2085 }
2086err_out2:
2087 if (ctx)
2088 ntfs_attr_put_search_ctx(ctx);
2089 if (m)
2090 unmap_mft_record(base_ni);
2091 return err;
2092}
2093
2094/**
 2095 * ntfs_commit_nonresident_write - commit a non-resident write to an inode
2096 *
2097 */
2098static int ntfs_commit_nonresident_write(struct page *page,
2099 unsigned from, unsigned to)
2100{
2101 s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
2102 struct inode *vi = page->mapping->host;
2103 struct buffer_head *bh, *head;
2104 unsigned int block_start, block_end, blocksize;
2105 BOOL partial;
2106
2107 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2108 "0x%lx, from = %u, to = %u.", vi->i_ino,
2109 NTFS_I(vi)->type, page->index, from, to);
2110 blocksize = 1 << vi->i_blkbits;
2111
2112 // FIXME: We need a whole slew of special cases in here for compressed
2113 // files for example...
 2114 // For now, we know ntfs_prepare_write() would have failed in all the
 2115 // cases needing special treatment, so we cannot get here for any of
 2116 // them. Thus this is just an unrolled copy of generic_commit_write().
2117
2118 bh = head = page_buffers(page);
2119 block_start = 0;
2120 partial = FALSE;
2121 do {
2122 block_end = block_start + blocksize;
2123 if (block_end <= from || block_start >= to) {
2124 if (!buffer_uptodate(bh))
2125 partial = TRUE;
2126 } else {
2127 set_buffer_uptodate(bh);
2128 mark_buffer_dirty(bh);
2129 }
2130 } while (block_start = block_end, (bh = bh->b_this_page) != head);
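 /*
  * Example (illustrative values only): with 512-byte buffers,
  * from = 100, and to = 400, only the first buffer (block_start = 0,
  * block_end = 512) overlaps the write and is marked uptodate and
  * dirty; the other buffers merely contribute to @partial if they
  * are not uptodate.
  */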
2131 /*
2132 * If this is a partial write which happened to make all buffers
2133 * uptodate then we can optimize away a bogus ->readpage() for the next
2134 * read(). Here we 'discover' whether the page went uptodate as a
2135 * result of this (potentially partial) write.
2136 */
2137 if (!partial)
2138 SetPageUptodate(page);
2139 /*
 2140 * Not convinced about this at all. See the FIXME comment above. For
2141 * now we know ntfs_prepare_write() would have failed in the write
2142 * exceeds i_size case, so this will never trigger which is fine.
2143 */
 2144 if (pos > i_size_read(vi)) {
2145 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
2146 "not supported yet. Sorry.");
2147 return -EOPNOTSUPP;
2148 // vi->i_size = pos;
2149 // mark_inode_dirty(vi);
2150 }
2151 ntfs_debug("Done.");
2152 return 0;
2153}
2154
2155/**
2156 * ntfs_commit_write - commit the received data
2157 *
2158 * This is called from generic_file_write() with i_sem held on the inode
2159 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
2160 * data has already been copied into the @page. ntfs_prepare_write() has been
 2161 * called before the data was copied and it returned success, so we can take
2162 * results of various BUG checks and some error handling for granted.
2163 *
2164 * Need to mark modified blocks dirty so they get written out later when
2165 * ntfs_writepage() is invoked by the VM.
2166 *
2167 * Return 0 on success or -errno on error.
2168 *
2169 * Should be using generic_commit_write(). This marks buffers uptodate and
2170 * dirty, sets the page uptodate if all buffers in the page are uptodate, and
2171 * updates i_size if the end of io is beyond i_size. In that case, it also
2172 * marks the inode dirty.
2173 *
2174 * Cannot use generic_commit_write() due to ntfs specialities but can look at
2175 * it for implementation guidance.
2176 *
2177 * If things have gone as outlined in ntfs_prepare_write(), then we do not
2178 * need to do any page content modifications here at all, except in the write
 2179 * to resident attribute case, where we bring the page uptodate here and
 2180 * combine that with the copying into the mft record, which saves one
 2181 * atomic kmap.
2182 */
2183static int ntfs_commit_write(struct file *file, struct page *page,
2184 unsigned from, unsigned to)
2185{
2186 struct inode *vi = page->mapping->host;
2187 ntfs_inode *base_ni, *ni = NTFS_I(vi);
2188 char *kaddr, *kattr;
2189 ntfs_attr_search_ctx *ctx;
2190 MFT_RECORD *m;
2191 ATTR_RECORD *a;
2192 u32 attr_len;
2193 int err;
2194
2195 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2196 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
2197 page->index, from, to);
2198 /* If the attribute is not resident, deal with it elsewhere. */
2199 if (NInoNonResident(ni)) {
2200 /* Only unnamed $DATA attributes can be compressed/encrypted. */
2201 if (ni->type == AT_DATA && !ni->name_len) {
2202 /* Encrypted files need separate handling. */
2203 if (NInoEncrypted(ni)) {
2204 // We never get here at present!
2205 BUG();
2206 }
2207 /* Compressed data streams are handled in compress.c. */
2208 if (NInoCompressed(ni)) {
2209 // TODO: Implement this!
2210 // return ntfs_write_compressed_block(page);
2211 // We never get here at present!
2212 BUG();
2213 }
2214 }
2215 /* Normal data stream. */
2216 return ntfs_commit_nonresident_write(page, from, to);
2217 }
2218 /*
2219 * Attribute is resident, implying it is not compressed, encrypted, or
2220 * sparse.
2221 */
2222 if (!NInoAttr(ni))
2223 base_ni = ni;
2224 else
2225 base_ni = ni->ext.base_ntfs_ino;
2226 /* Map, pin, and lock the mft record. */
2227 m = map_mft_record(base_ni);
2228 if (IS_ERR(m)) {
2229 err = PTR_ERR(m);
2230 m = NULL;
2231 ctx = NULL;
2232 goto err_out;
2233 }
2234 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2235 if (unlikely(!ctx)) {
2236 err = -ENOMEM;
2237 goto err_out;
2238 }
2239 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2240 CASE_SENSITIVE, 0, NULL, 0, ctx);
2241 if (unlikely(err)) {
2242 if (err == -ENOENT)
2243 err = -EIO;
2244 goto err_out;
2245 }
2246 a = ctx->attr;
2247 /* The total length of the attribute value. */
2248 attr_len = le32_to_cpu(a->data.resident.value_length);
2249 BUG_ON(from > attr_len);
2250 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
2251 kaddr = kmap_atomic(page, KM_USER0);
2252 /* Copy the received data from the page to the mft record. */
2253 memcpy(kattr + from, kaddr + from, to - from);
2254 /* Update the attribute length if necessary. */
2255 if (to > attr_len) {
2256 attr_len = to;
2257 a->data.resident.value_length = cpu_to_le32(attr_len);
2258 }
2259 /*
2260 * If the page is not uptodate, bring the out of bounds area(s)
2261 * uptodate by copying data from the mft record to the page.
2262 */
2263 if (!PageUptodate(page)) {
2264 if (from > 0)
2265 memcpy(kaddr, kattr, from);
2266 if (to < attr_len)
2267 memcpy(kaddr + to, kattr + to, attr_len - to);
2268 /* Zero the region outside the end of the attribute value. */
2269 if (attr_len < PAGE_CACHE_SIZE)
2270 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
2271 /*
2272 * The probability of not having done any of the above is
2273 * extremely small, so we just flush unconditionally.
2274 */
2275 flush_dcache_page(page);
2276 SetPageUptodate(page);
2277 }
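 /*
  * Example (illustrative values only): from = 100, to = 400, and
  * attr_len = 600 copy bytes 0-99 and 400-599 from the mft record
  * into the page and zero bytes 600 to PAGE_CACHE_SIZE - 1.
  */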
2278 kunmap_atomic(kaddr, KM_USER0);
2279 /* Update i_size if necessary. */
2280 if (i_size_read(vi) < attr_len) {
2281 unsigned long flags;
2282
2283 write_lock_irqsave(&ni->size_lock, flags);
2284 ni->allocated_size = ni->initialized_size = attr_len;
2285 i_size_write(vi, attr_len);
 2286 write_unlock_irqrestore(&ni->size_lock, flags);
2287 }
2288 /* Mark the mft record dirty, so it gets written back. */
2289 flush_dcache_mft_record_page(ctx->ntfs_ino);
2290 mark_mft_record_dirty(ctx->ntfs_ino);
2291 ntfs_attr_put_search_ctx(ctx);
2292 unmap_mft_record(base_ni);
2293 ntfs_debug("Done.");
2294 return 0;
2295err_out:
2296 if (err == -ENOMEM) {
2297 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2298 "commit the write.");
2299 if (PageUptodate(page)) {
2300 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
2301 "dirty so the write will be retried "
2302 "later on by the VM.");
2303 /*
2304 * Put the page on mapping->dirty_pages, but leave its
2305 * buffers' dirty state as-is.
2306 */
2307 __set_page_dirty_nobuffers(page);
2308 err = 0;
2309 } else
2310 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
2311 "data has been lost.");
2312 } else {
2313 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
2314 "with error %i.", err);
2315 NVolSetErrors(ni->vol);
2316 make_bad_inode(vi);
2317 }
2318 if (ctx)
2319 ntfs_attr_put_search_ctx(ctx);
2320 if (m)
2321 unmap_mft_record(base_ni);
2322 return err;
2323}
2324
2325#endif /* NTFS_RW */
2326
2327/**
2328 * ntfs_aops - general address space operations for inodes and attributes
2329 */
2330struct address_space_operations ntfs_aops = {
2331 .readpage = ntfs_readpage, /* Fill page with data. */
2332 .sync_page = block_sync_page, /* Currently, just unplugs the
2333 disk request queue. */
2334#ifdef NTFS_RW
2335 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2336 .prepare_write = ntfs_prepare_write, /* Prepare page and buffers
2337 ready to receive data. */
2338 .commit_write = ntfs_commit_write, /* Commit received data. */
2339#endif /* NTFS_RW */
2340};
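
/*
 * For reference, generic_file_write() drives the two NTFS_RW entry points
 * above roughly as follows for each chunk of a write (simplified sketch,
 * error handling omitted; the copy helper name is as in 2.6-era
 * mm/filemap.c):
 *
 *	status = a_ops->prepare_write(file, page, from, to);
 *	copied = filemap_copy_from_user(page, from, buf, bytes);
 *	status = a_ops->commit_write(file, page, from, to);
 */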
2341
2342/**
 2343 * ntfs_mst_aops - general address space operations for mst protected inodes
2344 * and attributes
2345 */
2346struct address_space_operations ntfs_mst_aops = {
2347 .readpage = ntfs_readpage, /* Fill page with data. */
2348 .sync_page = block_sync_page, /* Currently, just unplugs the
2349 disk request queue. */
2350#ifdef NTFS_RW
2351 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2352 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
2353 without touching the buffers
2354 belonging to the page. */
2355#endif /* NTFS_RW */
2356};
2357
2358#ifdef NTFS_RW
2359
2360/**
2361 * mark_ntfs_record_dirty - mark an ntfs record dirty
2362 * @page: page containing the ntfs record to mark dirty
2363 * @ofs: byte offset within @page at which the ntfs record begins
2364 *
2365 * Set the buffers and the page in which the ntfs record is located dirty.
2366 *
2367 * The latter also marks the vfs inode the ntfs record belongs to dirty
2368 * (I_DIRTY_PAGES only).
2369 *
2370 * If the page does not have buffers, we create them and set them uptodate.
 2371 * The page may not be locked, which is why we need to handle the buffers under
2372 * the mapping->private_lock. Once the buffers are marked dirty we no longer
2373 * need the lock since try_to_free_buffers() does not free dirty buffers.
2374 */
 2375void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs)
 {
2376 struct address_space *mapping = page->mapping;
2377 ntfs_inode *ni = NTFS_I(mapping->host);
2378 struct buffer_head *bh, *head, *buffers_to_free = NULL;
2379 unsigned int end, bh_size, bh_ofs;
2380
2381 BUG_ON(!PageUptodate(page));
2382 end = ofs + ni->itype.index.block_size;
2383 bh_size = 1 << VFS_I(ni)->i_blkbits;
2384 spin_lock(&mapping->private_lock);
2385 if (unlikely(!page_has_buffers(page))) {
2386 spin_unlock(&mapping->private_lock);
2387 bh = head = alloc_page_buffers(page, bh_size, 1);
2388 spin_lock(&mapping->private_lock);
2389 if (likely(!page_has_buffers(page))) {
2390 struct buffer_head *tail;
2391
2392 do {
2393 set_buffer_uptodate(bh);
2394 tail = bh;
2395 bh = bh->b_this_page;
2396 } while (bh);
2397 tail->b_this_page = head;
2398 attach_page_buffers(page, head);
2399 } else
2400 buffers_to_free = bh;
2401 }
2402 bh = head = page_buffers(page);
 2403 BUG_ON(!bh);
2404 do {
2405 bh_ofs = bh_offset(bh);
2406 if (bh_ofs + bh_size <= ofs)
2407 continue;
2408 if (unlikely(bh_ofs >= end))
2409 break;
2410 set_buffer_dirty(bh);
2411 } while ((bh = bh->b_this_page) != head);
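 /*
  * Example (illustrative values only): a 2048-byte index record at
  * ofs = 2048 in a 4096-byte page with 512-byte buffers dirties the
  * four buffers at offsets 2048, 2560, 3072, and 3584; the buffers
  * below ofs fail the first check and are skipped.
  */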
2412 spin_unlock(&mapping->private_lock);
2413 __set_page_dirty_nobuffers(page);
2414 if (unlikely(buffers_to_free)) {
2415 do {
2416 bh = buffers_to_free->b_this_page;
2417 free_buffer_head(buffers_to_free);
2418 buffers_to_free = bh;
2419 } while (buffers_to_free);
2420 }
2421}
2422
2423#endif /* NTFS_RW */