1 | /* |
2 | * fs/dax.c - Direct Access filesystem code | |
3 | * Copyright (c) 2013-2014 Intel Corporation | |
4 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> | |
5 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify it | |
8 | * under the terms and conditions of the GNU General Public License, | |
9 | * version 2, as published by the Free Software Foundation. | |
10 | * | |
11 | * This program is distributed in the hope it will be useful, but WITHOUT | |
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
14 | * more details. | |
15 | */ | |
16 | ||
17 | #include <linux/atomic.h> | |
18 | #include <linux/blkdev.h> | |
19 | #include <linux/buffer_head.h> | |
20 | #include <linux/fs.h> | |
21 | #include <linux/genhd.h> | |
22 | #include <linux/mutex.h> | |
#include <linux/sched.h>
#include <linux/uio.h>

289c6aed MW |
26 | int dax_clear_blocks(struct inode *inode, sector_t block, long size) |
27 | { | |
28 | struct block_device *bdev = inode->i_sb->s_bdev; | |
29 | sector_t sector = block << (inode->i_blkbits - 9); | |
30 | ||
31 | might_sleep(); | |
32 | do { | |
33 | void *addr; | |
34 | unsigned long pfn; | |
35 | long count; | |
36 | ||
37 | count = bdev_direct_access(bdev, sector, &addr, &pfn, size); | |
38 | if (count < 0) | |
39 | return count; | |
40 | BUG_ON(size < count); | |
41 | while (count > 0) { | |
42 | unsigned pgsz = PAGE_SIZE - offset_in_page(addr); | |
43 | if (pgsz > count) | |
44 | pgsz = count; | |
45 | if (pgsz < PAGE_SIZE) | |
46 | memset(addr, 0, pgsz); | |
47 | else | |
48 | clear_page(addr); | |
49 | addr += pgsz; | |
50 | size -= pgsz; | |
51 | count -= pgsz; | |
52 | BUG_ON(pgsz & 511); | |
53 | sector += pgsz / 512; | |
54 | cond_resched(); | |
55 | } | |
56 | } while (size); | |
57 | ||
58 | return 0; | |
59 | } | |
60 | EXPORT_SYMBOL_GPL(dax_clear_blocks); | |
61 | ||
d475c634 MW |
62 | static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) |
63 | { | |
64 | unsigned long pfn; | |
65 | sector_t sector = bh->b_blocknr << (blkbits - 9); | |
66 | return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); | |
67 | } | |
68 | ||
69 | static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, | |
70 | loff_t end) | |
71 | { | |
72 | loff_t final = end - pos + first; /* The final byte of the buffer */ | |
73 | ||
74 | if (first > 0) | |
75 | memset(addr, 0, first); | |
76 | if (final < size) | |
77 | memset(addr + final, 0, size - final); | |
78 | } | |
79 | ||
80 | static bool buffer_written(struct buffer_head *bh) | |
81 | { | |
82 | return buffer_mapped(bh) && !buffer_unwritten(bh); | |
83 | } | |
84 | ||
85 | /* | |
86 | * When ext4 encounters a hole, it returns without modifying the buffer_head | |
87 | * which means that we can't trust b_size. To cope with this, we set b_state | |
88 | * to 0 before calling get_block and, if any bit is set, we know we can trust | |
89 | * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is | |
90 | * and would save us time calling get_block repeatedly. | |
91 | */ | |
92 | static bool buffer_size_valid(struct buffer_head *bh) | |
93 | { | |
94 | return bh->b_state != 0; | |
95 | } | |
96 | ||
/*
 * Core DAX I/O loop: walk the file range [start, end), mapping it to
 * device memory one extent at a time via @get_block and copying bytes
 * between @iter and the device.  Reads of holes feed zeroes to the
 * iterator instead of touching the device.
 *
 * @max is the end of the currently usable mapped extent; @bh_max is the
 * end of the (possibly larger) range described by *@bh.  Returns the
 * number of bytes transferred, or a negative errno if none were.
 */
static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
		loff_t start, loff_t end, get_block_t get_block,
		struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;	/* end of the current mapped extent */
	loff_t bh_max = start;	/* end of the range *bh describes */
	void *addr;
	bool hole = false;

	/* Reads must not run past EOF */
	if (rw != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		unsigned len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			sector_t block = pos >> blkbits;
			/* offset of pos within its block */
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				/* Exhausted *bh: ask the fs for a new mapping */
				bh->b_size = PAGE_ALIGN(end - pos);
				/* zeroed b_state lets buffer_size_valid() work */
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
						   rw == WRITE);
				if (retval)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				/*
				 * Still inside *bh: advance its block number
				 * and shrink its size by the amount consumed.
				 */
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			/* Only reads see holes; writes must have allocated */
			hole = (rw != WRITE) && !buffer_written(bh);
			if (hole) {
				addr = NULL;
				size = bh->b_size - first;
			} else {
				retval = dax_get_addr(bh, &addr, blkbits);
				if (retval < 0)
					break;
				/* Zero the untouched edges of a new block */
				if (buffer_unwritten(bh) || buffer_new(bh))
					dax_new_buf(addr, retval, first, pos,
									end);
				addr += first;
				size = retval - first;
			}
			max = min(pos + size, end);
		}

		if (rw == WRITE)
			len = copy_from_iter(addr, max - pos, iter);
		else if (!hole)
			len = copy_to_iter(addr, max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		/* Iterator exhausted (or faulted): stop */
		if (!len)
			break;

		pos += len;
		addr += len;
	}

	/* Nothing transferred: report the last error (or 0) instead */
	return (pos == start) ? retval : pos - start;
}
169 | ||
/**
 * dax_do_io - Perform I/O to a DAX file
 * @rw: READ to read or WRITE to write
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
		struct iov_iter *iter, loff_t pos,
		get_block_t get_block, dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));

	if ((flags & DIO_LOCKING) && (rw == READ)) {
		struct address_space *mapping = inode->i_mapping;
		mutex_lock(&inode->i_mutex);
		/* Flush dirty page cache so the read sees current data */
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	/* Protects against truncate */
	atomic_inc(&inode->i_dio_count);

	retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && (rw == READ))
		mutex_unlock(&inode->i_mutex);

	/* Tell the filesystem how much was actually transferred */
	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	inode_dio_done(inode);
out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);