1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
 30 * Copyright (c) 2011, 2015, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include "../include/lustre_dlm.h"
45#include "../include/lustre_lite.h"
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
 49#include "../include/lustre/ll_fiemap.h"
 50
 51#include "../include/cl_object.h"
 52
53static int
54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57 bool *lease_broken);
58
59static enum llioc_iter
60ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
62
63static struct ll_file_data *ll_file_data_get(void)
64{
65 struct ll_file_data *fd;
66
 67 fd = kmem_cache_alloc(ll_file_data_slab, GFP_NOFS | __GFP_ZERO);
68 if (fd == NULL)
69 return NULL;
70 fd->fd_write_failed = false;
71 return fd;
72}
73
74static void ll_file_data_put(struct ll_file_data *fd)
75{
76 if (fd != NULL)
 77 kmem_cache_free(ll_file_data_slab, fd);
78}
79
80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
82{
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93 if (fh)
94 op_data->op_handle = *fh;
 95
 96 if (ll_i2info(inode)->lli_flags & LLIF_DATA_MODIFIED)
97 op_data->op_bias |= MDS_DATA_MODIFIED;
98}
99
100/**
101 * Closes the IO epoch and packs all the attributes into @op_data for
102 * the CLOSE rpc.
103 */
104static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
105 struct obd_client_handle *och)
106{
107 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
108 ATTR_MTIME | ATTR_MTIME_SET |
109 ATTR_CTIME | ATTR_CTIME_SET;
110
111 if (!(och->och_flags & FMODE_WRITE))
112 goto out;
113
114 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
115 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
116 else
117 ll_ioepoch_close(inode, op_data, &och, 0);
118
119out:
120 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
121 ll_prep_md_op_data(op_data, inode, NULL, NULL,
122 0, 0, LUSTRE_OPC_ANY, NULL);
123}
124
125static int ll_close_inode_openhandle(struct obd_export *md_exp,
126 struct inode *inode,
127 struct obd_client_handle *och,
128 const __u64 *data_version)
129{
130 struct obd_export *exp = ll_i2mdexp(inode);
131 struct md_op_data *op_data;
132 struct ptlrpc_request *req = NULL;
133 struct obd_device *obd = class_exp2obd(exp);
134 int epoch_close = 1;
135 int rc;
136
137 if (obd == NULL) {
138 /*
139 * XXX: in case of LMV, is this correct to access
140 * ->exp_handle?
141 */
 142 CERROR("Invalid MDC connection handle %#llx\n",
 143 ll_i2mdexp(inode)->exp_handle.h_cookie);
144 rc = 0;
145 goto out;
146 }
147
148 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
149 if (!op_data) {
150 /* XXX We leak openhandle and request here. */
151 rc = -ENOMEM;
152 goto out;
153 }
154
155 ll_prepare_close(inode, op_data, och);
156 if (data_version != NULL) {
157 /* Pass in data_version implies release. */
158 op_data->op_bias |= MDS_HSM_RELEASE;
159 op_data->op_data_version = *data_version;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
162 }
 163 epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
164 rc = md_close(md_exp, op_data, och->och_mod, &req);
165 if (rc == -EAGAIN) {
166 /* This close must have the epoch closed. */
167 LASSERT(epoch_close);
168 /* MDS has instructed us to obtain Size-on-MDS attribute from
 169 * OSTs and send setattr back to MDS. */
170 rc = ll_som_update(inode, op_data);
171 if (rc) {
172 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
173 inode->i_ino, rc);
174 rc = 0;
175 }
176 } else if (rc) {
177 CERROR("inode %lu mdc close failed: rc = %d\n",
178 inode->i_ino, rc);
179 }
180
181 /* DATA_MODIFIED flag was successfully sent on close, cancel data
182 * modification flag. */
183 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
184 struct ll_inode_info *lli = ll_i2info(inode);
185
186 spin_lock(&lli->lli_lock);
187 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
188 spin_unlock(&lli->lli_lock);
189 }
190
191 if (rc == 0) {
192 rc = ll_objects_destroy(req, inode);
193 if (rc)
194 CERROR("inode %lu ll_objects destroy: rc = %d\n",
195 inode->i_ino, rc);
196 }
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
 199
200 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
201 if (!(body->valid & OBD_MD_FLRELEASED))
202 rc = -EBUSY;
203 }
204
205 ll_finish_md_op_data(op_data);
 206
 207out:
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
211 } else {
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 215 kfree(och);
216 }
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
219 return rc;
220}
221
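/*
 * Close the cached MDS open handle that matches @fmode (read, write or
 * exec). The handle is only released once its per-mode use count has
 * dropped to zero; otherwise it stays cached for later opens.
 */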
 222int ll_md_real_close(struct inode *inode, fmode_t fmode)
223{
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
227 __u64 *och_usecount;
228 int rc = 0;
 229
 230 if (fmode & FMODE_WRITE) {
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
 233 } else if (fmode & FMODE_EXEC) {
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
236 } else {
 237 LASSERT(fmode & FMODE_READ);
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
240 }
241
242 mutex_lock(&lli->lli_och_mutex);
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
245 * freeing it. */
 246 mutex_unlock(&lli->lli_och_mutex);
 247 return 0;
 248 }
 249
 250 och = *och_p;
251 *och_p = NULL;
252 mutex_unlock(&lli->lli_och_mutex);
253
254 if (och != NULL) {
255 /* There might be a race and this handle may already
256 be closed. */
 257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 258 inode, och, NULL);
259 }
260
 261 return rc;
262}
263
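/*
 * Per-file-descriptor close: drop any group lock or lease held through
 * this descriptor, decrement the per-mode open count, and only talk to
 * the MDS if no matching cached OPEN DLM lock is found locally.
 */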
264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
265 struct file *file)
266{
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
269 int lockmode;
270 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
271 struct lustre_handle lockh;
 272 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN} };
 273 int rc = 0;
274
275 /* clear group lock, if present */
276 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
277 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
278
279 if (fd->fd_lease_och != NULL) {
280 bool lease_broken;
281
 282 /* Usually the lease is not released when the
 283 * application crashes, so we need to release it here. */
284 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
285 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
286 PFID(&lli->lli_fid), rc, lease_broken);
287
288 fd->fd_lease_och = NULL;
289 }
290
291 if (fd->fd_och != NULL) {
 292 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
 293 fd->fd_och = NULL;
 294 goto out;
295 }
296
297 /* Let's see if we have good enough OPEN lock on the file and if
298 we can skip talking to MDS */
 299
300 mutex_lock(&lli->lli_och_mutex);
301 if (fd->fd_omode & FMODE_WRITE) {
302 lockmode = LCK_CW;
303 LASSERT(lli->lli_open_fd_write_count);
304 lli->lli_open_fd_write_count--;
305 } else if (fd->fd_omode & FMODE_EXEC) {
306 lockmode = LCK_PR;
307 LASSERT(lli->lli_open_fd_exec_count);
308 lli->lli_open_fd_exec_count--;
 309 } else {
310 lockmode = LCK_CR;
311 LASSERT(lli->lli_open_fd_read_count);
312 lli->lli_open_fd_read_count--;
 313 }
314 mutex_unlock(&lli->lli_och_mutex);
315
316 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
317 LDLM_IBITS, &policy, lockmode, &lockh))
318 rc = ll_md_real_close(inode, fd->fd_omode);
 319
 320out:
321 LUSTRE_FPRIVATE(file) = NULL;
322 ll_file_data_put(fd);
 323
 324 return rc;
325}
326
327/* While this returns an error code, fput() the caller does not, so we need
328 * to make every effort to clean up all of our state here. Also, applications
329 * rarely check close errors and even if an error is returned they will not
330 * re-try the close call.
331 */
332int ll_file_release(struct inode *inode, struct file *file)
333{
334 struct ll_file_data *fd;
335 struct ll_sb_info *sbi = ll_i2sbi(inode);
336 struct ll_inode_info *lli = ll_i2info(inode);
337 int rc;
338
339 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
340 inode->i_generation, inode);
341
342#ifdef CONFIG_FS_POSIX_ACL
 343 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
345
346 LASSERT(fd != NULL);
347 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
348 fd->fd_flags &= ~LL_FILE_RMTACL;
349 rct_del(&sbi->ll_rct, current_pid());
350 et_search_free(&sbi->ll_et, current_pid());
351 }
352 }
353#endif
354
 355 if (!is_root_inode(inode))
356 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
357 fd = LUSTRE_FPRIVATE(file);
358 LASSERT(fd != NULL);
359
 360 /* The last ref on @file, maybe not the owner pid of statahead.
361 * Different processes can open the same dir, "ll_opendir_key" means:
362 * it is me that should stop the statahead thread. */
363 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
364 lli->lli_opendir_pid != 0)
365 ll_stop_statahead(inode, lli->lli_opendir_key);
366
 367 if (is_root_inode(inode)) {
368 LUSTRE_FPRIVATE(file) = NULL;
369 ll_file_data_put(fd);
 370 return 0;
371 }
372
373 if (!S_ISDIR(inode->i_mode)) {
374 lov_read_and_clear_async_rc(lli->lli_clob);
375 lli->lli_async_rc = 0;
376 }
377
378 rc = ll_md_close(sbi->ll_md_exp, inode, file);
379
380 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
381 libcfs_debug_dumplog();
382
 383 return rc;
384}
385
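/*
 * Resolve an open intent against the MDS for @dentry, requesting an OPEN
 * lock unless stripe information (@lmm/@lmmsize) is being set, and set up
 * the resulting inode and lock data on success.
 */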
 386static int ll_intent_file_open(struct dentry *dentry, void *lmm,
387 int lmmsize, struct lookup_intent *itp)
388{
 389 struct inode *inode = d_inode(dentry);
390 struct ll_sb_info *sbi = ll_i2sbi(inode);
391 struct dentry *parent = dentry->d_parent;
392 const char *name = dentry->d_name.name;
393 const int len = dentry->d_name.len;
394 struct md_op_data *op_data;
395 struct ptlrpc_request *req;
396 __u32 opc = LUSTRE_OPC_ANY;
397 int rc;
 398
399 /* Usually we come here only for NFSD, and we want open lock.
400 But we can also get here with pre 2.6.15 patchless kernels, and in
401 that case that lock is also ok */
402 /* We can also get here if there was cached open handle in revalidate_it
403 * but it disappeared while we were getting from there to ll_file_open.
 404 * But this means this file was closed and immediately opened which
405 * makes a good candidate for using OPEN lock */
406 /* If lmmsize & lmm are not 0, we are just setting stripe info
407 * parameters. No need for the open lock */
408 if (lmm == NULL && lmmsize == 0) {
409 itp->it_flags |= MDS_OPEN_LOCK;
410 if (itp->it_flags & FMODE_WRITE)
411 opc = LUSTRE_OPC_CREATE;
412 }
413
 414 op_data = ll_prep_md_op_data(NULL, d_inode(parent),
 415 inode, name, len,
416 O_RDWR, opc, NULL);
417 if (IS_ERR(op_data))
 418 return PTR_ERR(op_data);
419
420 itp->it_flags |= MDS_OPEN_BY_FID;
421 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
422 0 /*unused */, &req, ll_md_blocking_ast, 0);
423 ll_finish_md_op_data(op_data);
424 if (rc == -ESTALE) {
 425 /* reason for keeping our own exit path - don't flood the log
426 * with messages with -ESTALE errors.
427 */
428 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
429 it_open_error(DISP_OPEN_OPEN, itp))
 430 goto out;
 431 ll_release_openhandle(inode, itp);
 432 goto out;
433 }
434
435 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
436 rc = -ENOENT;
437 goto out;
438 }
439
440 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
441 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
442 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 443 goto out;
444 }
445
 446 rc = ll_prep_inode(&inode, req, NULL, itp);
 447 if (!rc && itp->d.lustre.it_lock_mode)
 448 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
449
450out:
 451 ptlrpc_req_finished(req);
452 ll_intent_drop_lock(itp);
453
 454 return rc;
455}
456
457/**
458 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
459 * not believe attributes if a few ioepoch holders exist. Attributes for
460 * previous ioepoch if new one is opened are also skipped by MDS.
461 */
462void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
463{
464 if (ioepoch && lli->lli_ioepoch != ioepoch) {
465 lli->lli_ioepoch = ioepoch;
 466 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
467 ioepoch, PFID(&lli->lli_fid));
468 }
469}
470
471static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
472 struct obd_client_handle *och)
473{
474 struct ptlrpc_request *req = it->d.lustre.it_data;
475 struct mdt_body *body;
476
 477 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
478 och->och_fh = body->handle;
479 och->och_fid = body->fid1;
 480 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 481 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 482 och->och_flags = it->it_flags;
 483
 484 return md_set_open_replay_data(md_exp, och, it);
485}
486
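/*
 * Finish the client-side part of an open: record the open handle from the
 * intent reply (if any), attach the file data to the struct file and
 * initialize the readahead state.
 */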
487static int ll_local_open(struct file *file, struct lookup_intent *it,
488 struct ll_file_data *fd, struct obd_client_handle *och)
 489{
 490 struct inode *inode = file_inode(file);
 491 struct ll_inode_info *lli = ll_i2info(inode);
492
493 LASSERT(!LUSTRE_FPRIVATE(file));
494
495 LASSERT(fd != NULL);
496
497 if (och) {
498 struct ptlrpc_request *req = it->d.lustre.it_data;
499 struct mdt_body *body;
500 int rc;
501
502 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
503 if (rc != 0)
 504 return rc;
505
506 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 507 ll_ioepoch_open(lli, body->ioepoch);
508 }
509
510 LUSTRE_FPRIVATE(file) = fd;
511 ll_readahead_init(inode, &fd->fd_ras);
 512 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 513 return 0;
514}
515
516/* Open a file, and (for the very first open) create objects on the OSTs at
517 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
518 * creation or open until ll_lov_setstripe() ioctl is called.
519 *
520 * If we already have the stripe MD locally then we don't request it in
521 * md_open(), by passing a lmm_size = 0.
522 *
523 * It is up to the application to ensure no other processes open this file
524 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
525 * used. We might be able to avoid races of that sort by getting lli_open_sem
526 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
527 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
528 */
529int ll_file_open(struct inode *inode, struct file *file)
530{
531 struct ll_inode_info *lli = ll_i2info(inode);
532 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
533 .it_flags = file->f_flags };
534 struct obd_client_handle **och_p = NULL;
535 __u64 *och_usecount = NULL;
536 struct ll_file_data *fd;
537 int rc = 0, opendir_set = 0;
538
539 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
540 inode->i_generation, inode, file->f_flags);
541
542 it = file->private_data; /* XXX: compat macro */
543 file->private_data = NULL; /* prevent ll_local_open assertion */
544
545 fd = ll_file_data_get();
546 if (fd == NULL) {
547 rc = -ENOMEM;
548 goto out_openerr;
549 }
550
551 fd->fd_file = file;
552 if (S_ISDIR(inode->i_mode)) {
553 spin_lock(&lli->lli_sa_lock);
554 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
555 lli->lli_opendir_pid == 0) {
556 lli->lli_opendir_key = fd;
557 lli->lli_opendir_pid = current_pid();
558 opendir_set = 1;
559 }
560 spin_unlock(&lli->lli_sa_lock);
561 }
562
 563 if (is_root_inode(inode)) {
 564 LUSTRE_FPRIVATE(file) = fd;
 565 return 0;
566 }
567
568 if (!it || !it->d.lustre.it_disposition) {
569 /* Convert f_flags into access mode. We cannot use file->f_mode,
570 * because everything but O_ACCMODE mask was stripped from
571 * there */
572 if ((oit.it_flags + 1) & O_ACCMODE)
573 oit.it_flags++;
574 if (file->f_flags & O_TRUNC)
575 oit.it_flags |= FMODE_WRITE;
576
577 /* kernel only call f_op->open in dentry_open. filp_open calls
578 * dentry_open after call to open_namei that checks permissions.
 579 * Only nfsd_open calls dentry_open directly without checking
580 * permissions and because of that this code below is safe. */
581 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
582 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
583
584 /* We do not want O_EXCL here, presumably we opened the file
585 * already? XXX - NFS implications? */
586 oit.it_flags &= ~O_EXCL;
587
588 /* bug20584, if "it_flags" contains O_CREAT, the file will be
589 * created if necessary, then "IT_CREAT" should be set to keep
590 * consistent with it */
591 if (oit.it_flags & O_CREAT)
592 oit.it_op |= IT_CREAT;
593
594 it = &oit;
595 }
596
597restart:
598 /* Let's see if we have file open on MDS already. */
599 if (it->it_flags & FMODE_WRITE) {
600 och_p = &lli->lli_mds_write_och;
601 och_usecount = &lli->lli_open_fd_write_count;
602 } else if (it->it_flags & FMODE_EXEC) {
603 och_p = &lli->lli_mds_exec_och;
604 och_usecount = &lli->lli_open_fd_exec_count;
605 } else {
606 och_p = &lli->lli_mds_read_och;
607 och_usecount = &lli->lli_open_fd_read_count;
608 }
609
610 mutex_lock(&lli->lli_och_mutex);
611 if (*och_p) { /* Open handle is present */
612 if (it_disposition(it, DISP_OPEN_OPEN)) {
613 /* Well, there's extra open request that we do not need,
614 let's close it somehow. This will decref request. */
615 rc = it_open_error(DISP_OPEN_OPEN, it);
616 if (rc) {
617 mutex_unlock(&lli->lli_och_mutex);
 618 goto out_openerr;
619 }
620
 621 ll_release_openhandle(inode, it);
622 }
623 (*och_usecount)++;
624
625 rc = ll_local_open(file, it, fd, NULL);
626 if (rc) {
627 (*och_usecount)--;
628 mutex_unlock(&lli->lli_och_mutex);
 629 goto out_openerr;
630 }
631 } else {
632 LASSERT(*och_usecount == 0);
633 if (!it->d.lustre.it_disposition) {
634 /* We cannot just request lock handle now, new ELC code
635 means that one of other OPEN locks for this file
636 could be cancelled, and since blocking ast handler
637 would attempt to grab och_mutex as well, that would
638 result in a deadlock */
639 mutex_unlock(&lli->lli_och_mutex);
640 it->it_create_mode |= M_CHECK_STALE;
 641 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
642 it->it_create_mode &= ~M_CHECK_STALE;
643 if (rc)
 644 goto out_openerr;
645
646 goto restart;
647 }
 648 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
649 if (!*och_p) {
650 rc = -ENOMEM;
651 goto out_och_free;
652 }
653
654 (*och_usecount)++;
655
656 /* md_intent_lock() didn't get a request ref if there was an
657 * open error, so don't do cleanup on the request here
658 * (bug 3430) */
659 /* XXX (green): Should not we bail out on any error here, not
660 * just open error? */
661 rc = it_open_error(DISP_OPEN_OPEN, it);
662 if (rc)
 663 goto out_och_free;
664
665 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
666
667 rc = ll_local_open(file, it, fd, *och_p);
668 if (rc)
 669 goto out_och_free;
670 }
671 mutex_unlock(&lli->lli_och_mutex);
672 fd = NULL;
673
674 /* Must do this outside lli_och_mutex lock to prevent deadlock where
675 different kind of OPEN lock for this same inode gets cancelled
676 by ldlm_cancel_lru */
677 if (!S_ISREG(inode->i_mode))
 678 goto out_och_free;
 679
680 if (!lli->lli_has_smd &&
681 (cl_is_lov_delay_create(file->f_flags) ||
682 (file->f_mode & FMODE_WRITE) == 0)) {
683 CDEBUG(D_INODE, "object creation was delayed\n");
 684 goto out_och_free;
 685 }
 686 cl_lov_delay_create_clear(&file->f_flags);
 687 goto out_och_free;
688
689out_och_free:
690 if (rc) {
691 if (och_p && *och_p) {
 692 kfree(*och_p);
 693 *och_p = NULL;
694 (*och_usecount)--;
695 }
696 mutex_unlock(&lli->lli_och_mutex);
697
698out_openerr:
699 if (opendir_set != 0)
700 ll_stop_statahead(inode, lli->lli_opendir_key);
 701 ll_file_data_put(fd);
702 } else {
703 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
704 }
705
706 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
707 ptlrpc_req_finished(it->d.lustre.it_data);
708 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
709 }
710
711 return rc;
712}
713
714static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
715 struct ldlm_lock_desc *desc, void *data, int flag)
716{
717 int rc;
718 struct lustre_handle lockh;
719
720 switch (flag) {
721 case LDLM_CB_BLOCKING:
722 ldlm_lock2handle(lock, &lockh);
723 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
724 if (rc < 0) {
725 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
726 return rc;
727 }
728 break;
729 case LDLM_CB_CANCELING:
730 /* do nothing */
731 break;
732 }
733 return 0;
734}
735
736/**
737 * Acquire a lease and open the file.
738 */
739static struct obd_client_handle *
740ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
741 __u64 open_flags)
742{
743 struct lookup_intent it = { .it_op = IT_OPEN };
744 struct ll_sb_info *sbi = ll_i2sbi(inode);
745 struct md_op_data *op_data;
746 struct ptlrpc_request *req;
747 struct lustre_handle old_handle = { 0 };
748 struct obd_client_handle *och = NULL;
749 int rc;
750 int rc2;
751
752 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
753 return ERR_PTR(-EINVAL);
754
755 if (file != NULL) {
756 struct ll_inode_info *lli = ll_i2info(inode);
757 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
758 struct obd_client_handle **och_p;
759 __u64 *och_usecount;
760
761 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
762 return ERR_PTR(-EPERM);
763
764 /* Get the openhandle of the file */
765 rc = -EBUSY;
766 mutex_lock(&lli->lli_och_mutex);
767 if (fd->fd_lease_och != NULL) {
768 mutex_unlock(&lli->lli_och_mutex);
769 return ERR_PTR(rc);
770 }
771
772 if (fd->fd_och == NULL) {
773 if (file->f_mode & FMODE_WRITE) {
774 LASSERT(lli->lli_mds_write_och != NULL);
775 och_p = &lli->lli_mds_write_och;
776 och_usecount = &lli->lli_open_fd_write_count;
777 } else {
778 LASSERT(lli->lli_mds_read_och != NULL);
779 och_p = &lli->lli_mds_read_och;
780 och_usecount = &lli->lli_open_fd_read_count;
781 }
782 if (*och_usecount == 1) {
783 fd->fd_och = *och_p;
784 *och_p = NULL;
785 *och_usecount = 0;
786 rc = 0;
787 }
788 }
789 mutex_unlock(&lli->lli_och_mutex);
790 if (rc < 0) /* more than 1 opener */
791 return ERR_PTR(rc);
792
793 LASSERT(fd->fd_och != NULL);
794 old_handle = fd->fd_och->och_fh;
795 }
796
797 och = kzalloc(sizeof(*och), GFP_NOFS);
798 if (!och)
799 return ERR_PTR(-ENOMEM);
800
801 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
802 LUSTRE_OPC_ANY, NULL);
803 if (IS_ERR(op_data)) {
804 rc = PTR_ERR(op_data);
805 goto out;
806 }
807
808 /* To tell the MDT this openhandle is from the same owner */
809 op_data->op_handle = old_handle;
810
811 it.it_flags = fmode | open_flags;
812 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
813 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
814 ll_md_blocking_lease_ast,
815 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
816 * it can be cancelled which may mislead applications that the lease is
817 * broken;
818 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
819 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
820 * doesn't deal with openhandle, so normal openhandle will be leaked. */
821 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
822 ll_finish_md_op_data(op_data);
 823 ptlrpc_req_finished(req);
 824 if (rc < 0)
 825 goto out_release_it;
 826
827 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
828 rc = -ENOENT;
829 goto out_release_it;
830 }
831
832 rc = it_open_error(DISP_OPEN_OPEN, &it);
833 if (rc)
 834 goto out_release_it;
835
836 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
837 ll_och_fill(sbi->ll_md_exp, &it, och);
838
839 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
840 rc = -EOPNOTSUPP;
841 goto out_close;
842 }
843
844 /* already get lease, handle lease lock */
845 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
846 if (it.d.lustre.it_lock_mode == 0 ||
847 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
848 /* open lock must return for lease */
849 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
850 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
851 it.d.lustre.it_lock_bits);
852 rc = -EPROTO;
853 goto out_close;
854 }
855
856 ll_intent_release(&it);
857 return och;
858
859out_close:
 860 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
861 if (rc2)
862 CERROR("Close openhandle returned %d\n", rc2);
863
864 /* cancel open lock */
865 if (it.d.lustre.it_lock_mode != 0) {
866 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
867 it.d.lustre.it_lock_mode);
868 it.d.lustre.it_lock_mode = 0;
869 }
870out_release_it:
871 ll_intent_release(&it);
872out:
 873 kfree(och);
874 return ERR_PTR(rc);
875}
876
877/**
878 * Release lease and close the file.
879 * It will check if the lease has ever broken.
880 */
881static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
882 bool *lease_broken)
883{
884 struct ldlm_lock *lock;
885 bool cancelled = true;
886 int rc;
887
888 lock = ldlm_handle2lock(&och->och_lease_handle);
889 if (lock != NULL) {
890 lock_res_and_lock(lock);
891 cancelled = ldlm_is_cancel(lock);
892 unlock_res_and_lock(lock);
893 ldlm_lock_put(lock);
894 }
895
896 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
897 PFID(&ll_i2info(inode)->lli_fid), cancelled);
898
899 if (!cancelled)
900 ldlm_cli_cancel(&och->och_lease_handle, 0);
901 if (lease_broken != NULL)
902 *lease_broken = cancelled;
903
904 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
905 NULL);
906 return rc;
907}
 908
909/* Fills the obdo with the attributes for the lsm */
910static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 911 struct obdo *obdo, __u64 ioepoch, int sync)
912{
913 struct ptlrpc_request_set *set;
 914 struct obd_info oinfo = { };
915 int rc;
916
917 LASSERT(lsm != NULL);
918
919 oinfo.oi_md = lsm;
920 oinfo.oi_oa = obdo;
921 oinfo.oi_oa->o_oi = lsm->lsm_oi;
922 oinfo.oi_oa->o_mode = S_IFREG;
923 oinfo.oi_oa->o_ioepoch = ioepoch;
924 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
925 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
926 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
927 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
928 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
929 OBD_MD_FLDATAVERSION;
930 if (sync) {
931 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
932 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
933 }
934
935 set = ptlrpc_prep_set();
936 if (set == NULL) {
937 CERROR("can't allocate ptlrpc set\n");
938 rc = -ENOMEM;
939 } else {
940 rc = obd_getattr_async(exp, &oinfo, set);
941 if (rc == 0)
942 rc = ptlrpc_set_wait(set);
943 ptlrpc_set_destroy(set);
944 }
945 if (rc == 0)
946 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
947 OBD_MD_FLATIME | OBD_MD_FLMTIME |
948 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
949 OBD_MD_FLDATAVERSION);
 950 return rc;
951}
952
953/**
954 * Performs the getattr on the inode and updates its fields.
955 * If @sync != 0, perform the getattr under the server-side lock.
956 */
957int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
958 __u64 ioepoch, int sync)
959{
960 struct lov_stripe_md *lsm;
961 int rc;
962
963 lsm = ccc_inode_lsm_get(inode);
964 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 965 obdo, ioepoch, sync);
966 if (rc == 0) {
967 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
968
969 obdo_refresh_inode(inode, obdo, obdo->o_valid);
970 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
971 POSTID(oi), i_size_read(inode),
 972 (unsigned long long)inode->i_blocks,
 973 1UL << inode->i_blkbits);
974 }
975 ccc_inode_lsm_put(inode, lsm);
 976 return rc;
977}
978
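/*
 * Merge the size, block count and timestamps cached from the MDS with the
 * values currently known to the OSTs (via the cl_object attributes), so
 * the inode reflects the most recent of the two.
 */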
979int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
980{
981 struct ll_inode_info *lli = ll_i2info(inode);
982 struct cl_object *obj = lli->lli_clob;
983 struct cl_attr *attr = ccc_env_thread_attr(env);
984 struct ost_lvb lvb;
985 int rc = 0;
986
987 ll_inode_size_lock(inode);
 988 /* merge the timestamps most recently obtained from the MDS with
 989 timestamps obtained from the OSTs */
990 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
991 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
992 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
993
994 lvb.lvb_size = i_size_read(inode);
995 lvb.lvb_blocks = inode->i_blocks;
996 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
997 lvb.lvb_atime = LTIME_S(inode->i_atime);
998 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
999
1000 cl_object_attr_lock(obj);
1001 rc = cl_object_attr_get(env, obj, attr);
1002 cl_object_attr_unlock(obj);
1003
1004 if (rc == 0) {
1005 if (lvb.lvb_atime < attr->cat_atime)
1006 lvb.lvb_atime = attr->cat_atime;
1007 if (lvb.lvb_ctime < attr->cat_ctime)
1008 lvb.lvb_ctime = attr->cat_ctime;
1009 if (lvb.lvb_mtime < attr->cat_mtime)
1010 lvb.lvb_mtime = attr->cat_mtime;
1011
 1012 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1013 PFID(&lli->lli_fid), attr->cat_size);
1014 cl_isize_write_nolock(inode, attr->cat_size);
1015
1016 inode->i_blocks = attr->cat_blocks;
1017
1018 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1019 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1020 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1021 }
1022 ll_inode_size_unlock(inode);
1023
 1024 return rc;
1025}
1026
1027int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1028 lstat_t *st)
1029{
1030 struct obdo obdo = { 0 };
1031 int rc;
1032
 1033 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, &obdo, 0, 0);
1034 if (rc == 0) {
1035 st->st_size = obdo.o_size;
1036 st->st_blocks = obdo.o_blocks;
1037 st->st_mtime = obdo.o_mtime;
1038 st->st_atime = obdo.o_atime;
1039 st->st_ctime = obdo.o_ctime;
1040 }
1041 return rc;
1042}
1043
1044static bool file_is_noatime(const struct file *file)
1045{
1046 const struct vfsmount *mnt = file->f_path.mnt;
 1047 const struct inode *inode = file_inode(file);
1048
1049 /* Adapted from file_accessed() and touch_atime().*/
1050 if (file->f_flags & O_NOATIME)
1051 return true;
1052
1053 if (inode->i_flags & S_NOATIME)
1054 return true;
1055
1056 if (IS_NOATIME(inode))
1057 return true;
1058
1059 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1060 return true;
1061
1062 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1063 return true;
1064
1065 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1066 return true;
1067
1068 return false;
1069}
1070
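/*
 * Initialize a cl_io request from the file flags: append/sync behaviour
 * for writes, the locking mode (never for nolock files, mandatory for
 * O_APPEND) and whether atime updates should be suppressed.
 */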
1071void ll_io_init(struct cl_io *io, const struct file *file, int write)
1072{
 1073 struct inode *inode = file_inode(file);
1074
1075 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1076 if (write) {
1077 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1078 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1079 file->f_flags & O_DIRECT ||
1080 IS_SYNC(inode);
1081 }
1082 io->ci_obj = ll_i2info(inode)->lli_clob;
1083 io->ci_lockreq = CILR_MAYBE;
1084 if (ll_file_nolock(file)) {
1085 io->ci_lockreq = CILR_NEVER;
1086 io->ci_no_srvlock = 1;
1087 } else if (file->f_flags & O_APPEND) {
1088 io->ci_lockreq = CILR_MANDATORY;
1089 }
1090
1091 io->ci_noatime = file_is_noatime(file);
1092}
1093
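/*
 * Common back end for read, write and splice: build a cl_io for the
 * requested range, take lli_write_mutex or lli_trunc_sem as needed, run
 * the IO loop, and restart the whole request once if the lower layer asks
 * for a restart before any data was transferred.
 */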
1094static ssize_t
1095ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1096 struct file *file, enum cl_io_type iot,
1097 loff_t *ppos, size_t count)
1098{
 1099 struct ll_inode_info *lli = ll_i2info(file_inode(file));
1100 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1101 struct cl_io *io;
1102 ssize_t result;
1103
1104restart:
1105 io = ccc_env_thread_io(env);
1106 ll_io_init(io, file, iot == CIT_WRITE);
1107
1108 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1109 struct vvp_io *vio = vvp_env_io(env);
1110 struct ccc_io *cio = ccc_env_io(env);
1111 int write_mutex_locked = 0;
1112
1113 cio->cui_fd = LUSTRE_FPRIVATE(file);
1114 vio->cui_io_subtype = args->via_io_subtype;
1115
1116 switch (vio->cui_io_subtype) {
1117 case IO_NORMAL:
 1118 cio->cui_iter = args->u.normal.via_iter;
1119 cio->cui_iocb = args->u.normal.via_iocb;
1120 if ((iot == CIT_WRITE) &&
1121 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1122 if (mutex_lock_interruptible(&lli->
1123 lli_write_mutex)) {
1124 result = -ERESTARTSYS;
1125 goto out;
1126 }
1127 write_mutex_locked = 1;
1128 } else if (iot == CIT_READ) {
1129 down_read(&lli->lli_trunc_sem);
1130 }
1131 break;
1132 case IO_SPLICE:
1133 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1134 vio->u.splice.cui_flags = args->u.splice.via_flags;
1135 break;
1136 default:
 1137 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1138 LBUG();
1139 }
1140 result = cl_io_loop(env, io);
1141 if (write_mutex_locked)
1142 mutex_unlock(&lli->lli_write_mutex);
1143 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1144 up_read(&lli->lli_trunc_sem);
1145 } else {
1146 /* cl_io_rw_init() handled IO */
1147 result = io->ci_result;
1148 }
1149
1150 if (io->ci_nob > 0) {
1151 result = io->ci_nob;
1152 *ppos = io->u.ci_wr.wr.crw_pos;
1153 }
 1154 goto out;
1155out:
1156 cl_io_fini(env, io);
 1157 /* If any data has been read/written (result != 0), we just return
 1158 * a short read/write instead of restarting the io. */
 1159 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
 1160 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
 1161 iot == CIT_READ ? "read" : "write",
 1162 file, *ppos, count);
1163 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1164 goto restart;
1165 }
1166
1167 if (iot == CIT_READ) {
1168 if (result >= 0)
 1169 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1170 LPROC_LL_READ_BYTES, result);
1171 } else if (iot == CIT_WRITE) {
1172 if (result >= 0) {
 1173 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1174 LPROC_LL_WRITE_BYTES, result);
1175 fd->fd_write_failed = false;
1176 } else if (result != -ERESTARTSYS) {
1177 fd->fd_write_failed = true;
1178 }
1179 }
1180
1181 return result;
1182}
1183
 1184static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1185{
1186 struct lu_env *env;
1187 struct vvp_io_args *args;
1188 ssize_t result;
1189 int refcheck;
 1190
1191 env = cl_env_get(&refcheck);
1192 if (IS_ERR(env))
 1193 return PTR_ERR(env);
1194
1195 args = vvp_env_args(env, IO_NORMAL);
 1196 args->u.normal.via_iter = to;
1197 args->u.normal.via_iocb = iocb;
1198
1199 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
 1200 &iocb->ki_pos, iov_iter_count(to));
 1201 cl_env_put(env, &refcheck);
 1202 return result;
1203}
1204
1205/*
1206 * Write to a file (through the page cache).
1207 */
 1208static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1209{
1210 struct lu_env *env;
1211 struct vvp_io_args *args;
1212 ssize_t result;
1213 int refcheck;
 1214
1215 env = cl_env_get(&refcheck);
1216 if (IS_ERR(env))
 1217 return PTR_ERR(env);
1218
1219 args = vvp_env_args(env, IO_NORMAL);
 1220 args->u.normal.via_iter = from;
1221 args->u.normal.via_iocb = iocb;
1222
1223 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
 1224 &iocb->ki_pos, iov_iter_count(from));
 1225 cl_env_put(env, &refcheck);
 1226 return result;
1227}
1228
1229/*
1230 * Send file content (through pagecache) somewhere with helper
1231 */
1232static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1233 struct pipe_inode_info *pipe, size_t count,
1234 unsigned int flags)
1235{
1236 struct lu_env *env;
1237 struct vvp_io_args *args;
1238 ssize_t result;
1239 int refcheck;
1240
1241 env = cl_env_get(&refcheck);
1242 if (IS_ERR(env))
 1243 return PTR_ERR(env);
1244
1245 args = vvp_env_args(env, IO_SPLICE);
1246 args->u.splice.via_pipe = pipe;
1247 args->u.splice.via_flags = flags;
1248
1249 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1250 cl_env_put(env, &refcheck);
 1251 return result;
1252}
1253
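/*
 * Ask the data export to recreate the objects described by @oi on stripe
 * @ost_idx, using a copy of the file's current striping information.
 */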
 1254static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1255{
1256 struct obd_export *exp = ll_i2dtexp(inode);
1257 struct obd_trans_info oti = { 0 };
1258 struct obdo *oa = NULL;
1259 int lsm_size;
1260 int rc = 0;
1261 struct lov_stripe_md *lsm = NULL, *lsm2;
 1262
 1263 oa = kmem_cache_alloc(obdo_cachep, GFP_NOFS | __GFP_ZERO);
 1264 if (oa == NULL)
 1265 return -ENOMEM;
1266
1267 lsm = ccc_inode_lsm_get(inode);
1268 if (!lsm_has_objects(lsm)) {
1269 rc = -ENOENT;
1270 goto out;
1271 }
1272
1273 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1274 (lsm->lsm_stripe_count));
1275
 1276 lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
1277 if (lsm2 == NULL) {
1278 rc = -ENOMEM;
1279 goto out;
1280 }
1281
1282 oa->o_oi = *oi;
1283 oa->o_nlink = ost_idx;
1284 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1285 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1286 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1287 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1288 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1289 memcpy(lsm2, lsm, lsm_size);
1290 ll_inode_size_lock(inode);
1291 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1292 ll_inode_size_unlock(inode);
1293
 1294 kvfree(lsm2);
 1295 goto out;
1296out:
1297 ccc_inode_lsm_put(inode, lsm);
 1298 kmem_cache_free(obdo_cachep, oa);
1299 return rc;
1300}
1301
1302static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1303{
1304 struct ll_recreate_obj ucreat;
1305 struct ost_id oi;
 1306
 1307 if (!capable(CFS_CAP_SYS_ADMIN))
 1308 return -EPERM;
1309
1310 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1311 sizeof(ucreat)))
 1312 return -EFAULT;
1313
1314 ostid_set_seq_mdt0(&oi);
1315 ostid_set_id(&oi, ucreat.lrc_id);
 1316 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1317}
1318
1319static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1320{
1321 struct lu_fid fid;
1322 struct ost_id oi;
 1323 u32 ost_idx;
 1324
 1325 if (!capable(CFS_CAP_SYS_ADMIN))
 1326 return -EPERM;
1327
1328 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
 1329 return -EFAULT;
1330
1331 fid_to_ostid(&fid, &oi);
1332 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
 1333 return ll_lov_recreate(inode, &oi, ost_idx);
1334}
1335
 1336int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1337 int flags, struct lov_user_md *lum, int lum_size)
1338{
1339 struct lov_stripe_md *lsm = NULL;
1340 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1341 int rc = 0;
1342
1343 lsm = ccc_inode_lsm_get(inode);
1344 if (lsm != NULL) {
1345 ccc_inode_lsm_put(inode, lsm);
1346 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1347 inode->i_ino);
1348 rc = -EEXIST;
1349 goto out;
1350 }
1351
1352 ll_inode_size_lock(inode);
 1353 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
 1354 if (rc)
 1355 goto out_unlock;
1356 rc = oit.d.lustre.it_status;
1357 if (rc < 0)
 1358 goto out_req_free;
 1359
 1360 ll_release_openhandle(inode, &oit);
 1361
 1362out_unlock:
1363 ll_inode_size_unlock(inode);
1364 ll_intent_release(&oit);
1365 ccc_inode_lsm_put(inode, lsm);
 1366out:
 1367 return rc;
1368out_req_free:
1369 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1370 goto out;
1371}
1372
1373int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1374 struct lov_mds_md **lmmp, int *lmm_size,
1375 struct ptlrpc_request **request)
1376{
1377 struct ll_sb_info *sbi = ll_i2sbi(inode);
1378 struct mdt_body *body;
1379 struct lov_mds_md *lmm = NULL;
1380 struct ptlrpc_request *req = NULL;
1381 struct md_op_data *op_data;
1382 int rc, lmmsize;
1383
 1384 rc = ll_get_default_mdsize(sbi, &lmmsize);
 1385 if (rc)
 1386 return rc;
1387
1388 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1389 strlen(filename), lmmsize,
1390 LUSTRE_OPC_ANY, NULL);
1391 if (IS_ERR(op_data))
 1392 return PTR_ERR(op_data);
1393
1394 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1395 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1396 ll_finish_md_op_data(op_data);
1397 if (rc < 0) {
1398 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1399 filename, rc);
 1400 goto out;
1401 }
1402
1403 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1404 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1405
1406 lmmsize = body->eadatasize;
1407
1408 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1409 lmmsize == 0) {
1410 rc = -ENODATA;
1411 goto out;
1412 }
1413
1414 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1415 LASSERT(lmm != NULL);
1416
1417 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1418 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1419 rc = -EPROTO;
1420 goto out;
1421 }
1422
1423 /*
1424 * This is coming from the MDS, so is probably in
1425 * little endian. We convert it to host endian before
1426 * passing it to userspace.
1427 */
 1428 if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
1429 int stripe_count;
1430
1431 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1432 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1433 stripe_count = 0;
1434
 1435 /* if the function is called for a directory, we should
 1436 * avoid swabbing non-existent lsm objects */
1437 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1438 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1439 if (S_ISREG(body->mode))
1440 lustre_swab_lov_user_md_objects(
1441 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
 1442 stripe_count);
1443 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1444 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1445 if (S_ISREG(body->mode))
1446 lustre_swab_lov_user_md_objects(
1447 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
 1448 stripe_count);
1449 }
1450 }
1451
1452out:
1453 *lmmp = lmm;
1454 *lmm_size = lmmsize;
1455 *request = req;
1456 return rc;
1457}
1458
1459static int ll_lov_setea(struct inode *inode, struct file *file,
1460 unsigned long arg)
1461{
1462 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1463 struct lov_user_md *lump;
1464 int lum_size = sizeof(struct lov_user_md) +
1465 sizeof(struct lov_user_ost_data);
1466 int rc;
 1467
 1468 if (!capable(CFS_CAP_SYS_ADMIN))
 1469 return -EPERM;
 1470
 1471 lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
 1472 if (lump == NULL)
 1473 return -ENOMEM;
 1474
 1475 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
 1476 kvfree(lump);
 1477 return -EFAULT;
1478 }
1479
1480 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1481 lum_size);
1482 cl_lov_delay_create_clear(&file->f_flags);
 1483
 1484 kvfree(lump);
 1485 return rc;
1486}
1487
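/*
 * Set the striping of a file from a user supplied lov_user_md (v1 or v3)
 * and, on success, refresh the layout generation and copy the resulting
 * stripe information back to user space.
 */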
1488static int ll_lov_setstripe(struct inode *inode, struct file *file,
1489 unsigned long arg)
1490{
1491 struct lov_user_md_v3 lumv3;
1492 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1493 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1494 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1495 int lum_size, rc;
1496 int flags = FMODE_WRITE;
1497
1498 /* first try with v1 which is smaller than v3 */
1499 lum_size = sizeof(struct lov_user_md_v1);
1500 if (copy_from_user(lumv1, lumv1p, lum_size))
 1501 return -EFAULT;
1502
1503 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1504 lum_size = sizeof(struct lov_user_md_v3);
1505 if (copy_from_user(&lumv3, lumv3p, lum_size))
 1506 return -EFAULT;
1507 }
1508
1509 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1510 lum_size);
1511 cl_lov_delay_create_clear(&file->f_flags);
1512 if (rc == 0) {
1513 struct lov_stripe_md *lsm;
1514 __u32 gen;
1515
1516 put_user(0, &lumv1p->lmm_stripe_count);
1517
1518 ll_layout_refresh(inode, &gen);
1519 lsm = ccc_inode_lsm_get(inode);
1520 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1521 0, lsm, (void *)arg);
1522 ccc_inode_lsm_put(inode, lsm);
1523 }
 1524 return rc;
1525}
1526
1527static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1528{
1529 struct lov_stripe_md *lsm;
1530 int rc = -ENODATA;
1531
1532 lsm = ccc_inode_lsm_get(inode);
1533 if (lsm != NULL)
1534 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1535 lsm, (void *)arg);
1536 ccc_inode_lsm_put(inode, lsm);
 1537 return rc;
1538}
1539
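/*
 * Take a group lock with group id @arg on behalf of this file descriptor;
 * only one group lock may be held per descriptor at a time.
 */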
1540static int
1541ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1542{
1543 struct ll_inode_info *lli = ll_i2info(inode);
1544 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1545 struct ccc_grouplock grouplock;
1546 int rc;
 1547
1548 if (arg == 0) {
1549 CWARN("group id for group lock must not be 0\n");
1550 return -EINVAL;
1551 }
1552
 1553 if (ll_file_nolock(file))
 1554 return -EOPNOTSUPP;
1555
1556 spin_lock(&lli->lli_lock);
1557 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1558 CWARN("group lock already existed with gid %lu\n",
1559 fd->fd_grouplock.cg_gid);
1560 spin_unlock(&lli->lli_lock);
 1561 return -EINVAL;
1562 }
1563 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1564 spin_unlock(&lli->lli_lock);
1565
1566 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1567 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1568 if (rc)
 1569 return rc;
1570
1571 spin_lock(&lli->lli_lock);
1572 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1573 spin_unlock(&lli->lli_lock);
1574 CERROR("another thread just won the race\n");
1575 cl_put_grouplock(&grouplock);
 1576 return -EINVAL;
1577 }
1578
1579 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1580 fd->fd_grouplock = grouplock;
1581 spin_unlock(&lli->lli_lock);
1582
1583 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
 1584 return 0;
1585}
1586
1587static int ll_put_grouplock(struct inode *inode, struct file *file,
1588 unsigned long arg)
1589{
1590 struct ll_inode_info *lli = ll_i2info(inode);
1591 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1592 struct ccc_grouplock grouplock;
1593
1594 spin_lock(&lli->lli_lock);
1595 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1596 spin_unlock(&lli->lli_lock);
1597 CWARN("no group lock held\n");
 1598 return -EINVAL;
1599 }
1600 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1601
1602 if (fd->fd_grouplock.cg_gid != arg) {
1603 CWARN("group lock %lu doesn't match current id %lu\n",
1604 arg, fd->fd_grouplock.cg_gid);
1605 spin_unlock(&lli->lli_lock);
 1606 return -EINVAL;
1607 }
1608
1609 grouplock = fd->fd_grouplock;
1610 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1611 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1612 spin_unlock(&lli->lli_lock);
1613
1614 cl_put_grouplock(&grouplock);
1615 CDEBUG(D_INFO, "group lock %lu released\n", arg);
 1616 return 0;
1617}
1618
1619/**
1620 * Close inode open handle
1621 *
 1622 * \param inode [in] inode in question
1623 * \param it [in,out] intent which contains open info and result
1624 *
1625 * \retval 0 success
1626 * \retval <0 failure
1627 */
 1628int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
 1629{
1630 struct obd_client_handle *och;
1631 int rc;
1632
1633 LASSERT(inode);
1634
1635 /* Root ? Do nothing. */
 1636 if (is_root_inode(inode))
 1637 return 0;
1638
1639 /* No open handle to close? Move away */
1640 if (!it_disposition(it, DISP_OPEN_OPEN))
 1641 return 0;
1642
1643 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1644
 1645 och = kzalloc(sizeof(*och), GFP_NOFS);
1646 if (!och) {
1647 rc = -ENOMEM;
1648 goto out;
1649 }
 1650
 1651 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1652
1653 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1654 inode, och, NULL);
1655out:
1656 /* this one is in place of ll_file_open */
1657 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1658 ptlrpc_req_finished(it->d.lustre.it_data);
1659 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1660 }
 1661 return rc;
1662}
1663
1664/**
1665 * Get size for inode for which FIEMAP mapping is requested.
1666 * Make the FIEMAP get_info call and returns the result.
1667 */
 1668static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
 1669 size_t num_bytes)
1670{
1671 struct obd_export *exp = ll_i2dtexp(inode);
1672 struct lov_stripe_md *lsm = NULL;
1673 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
 1674 __u32 vallen = num_bytes;
 1675 int rc;
1676
1677 /* Checks for fiemap flags */
1678 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1679 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1680 return -EBADR;
1681 }
1682
1683 /* Check for FIEMAP_FLAG_SYNC */
1684 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1685 rc = filemap_fdatawrite(inode->i_mapping);
1686 if (rc)
1687 return rc;
1688 }
1689
1690 lsm = ccc_inode_lsm_get(inode);
1691 if (lsm == NULL)
1692 return -ENOENT;
1693
1694 /* If the stripe_count > 1 and the application does not understand
1695 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1696 */
1697 if (lsm->lsm_stripe_count > 1 &&
1698 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1699 rc = -EOPNOTSUPP;
1700 goto out;
1701 }
1702
1703 fm_key.oa.o_oi = lsm->lsm_oi;
1704 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1705
1706 if (i_size_read(inode) == 0) {
1707 rc = ll_glimpse_size(inode);
1708 if (rc)
1709 goto out;
1710 }
1711
1712 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1713 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1714 /* If filesize is 0, then there would be no objects for mapping */
1715 if (fm_key.oa.o_size == 0) {
1716 fiemap->fm_mapped_extents = 0;
1717 rc = 0;
1718 goto out;
1719 }
1720
1721 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1722
1723 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1724 fiemap, lsm);
1725 if (rc)
1726 CERROR("obd_get_info failed: rc = %d\n", rc);
1727
1728out:
1729 ccc_inode_lsm_put(inode, lsm);
 1730 return rc;
1731}
1732
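/*
 * Translate a FID to a path on behalf of user space: copy in the
 * getinfo_fid2path request, forward it to the MDC via OBD_IOC_FID2PATH
 * and copy the result back.
 */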
 1733int ll_fid2path(struct inode *inode, void __user *arg)
 1734{
1735 struct obd_export *exp = ll_i2mdexp(inode);
1736 const struct getinfo_fid2path __user *gfin = arg;
1737 struct getinfo_fid2path *gfout;
1738 u32 pathlen;
1739 size_t outsize;
1740 int rc;
d7e09d03 1741
2eb90a75 1742 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
d7e09d03 1743 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
0a3bdb00 1744 return -EPERM;
d7e09d03 1745
2b358b4e
FZ
1746 /* Only need to get the buflen */
1747 if (get_user(pathlen, &gfin->gf_pathlen))
0a3bdb00 1748 return -EFAULT;
d7e09d03 1749
c7b09efa
OD
1750 if (pathlen > PATH_MAX)
1751 return -EINVAL;
1752
2b358b4e
FZ
1753 outsize = sizeof(*gfout) + pathlen;
1754
496a51bd
JL
1755 gfout = kzalloc(outsize, GFP_NOFS);
1756 if (!gfout)
0a3bdb00 1757 return -ENOMEM;
2b358b4e 1758
34e1f2bb
JL
1759 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1760 rc = -EFAULT;
1761 goto gf_free;
1762 }
d7e09d03
PT
1763
1764 /* Call mdc_iocontrol */
1765 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2b358b4e 1766 if (rc != 0)
34e1f2bb 1767 goto gf_free;
d7e09d03
PT
1768
1769 if (copy_to_user(arg, gfout, outsize))
1770 rc = -EFAULT;
1771
1772gf_free:
97903a26 1773 kfree(gfout);
0a3bdb00 1774 return rc;
d7e09d03
PT
1775}
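For reference, a minimal user-space sketch of reaching ll_fid2path() through the OBD_IOC_FID2PATH ioctl. The variable-size allocation mirrors the outsize computation above; the getinfo_fid2path fields other than gf_pathlen (gf_fid, gf_path) and the ioctl number are assumed to come from the Lustre user-space headers, so treat this as an illustration rather than the canonical API.

/* Hypothetical caller; assumes Lustre user headers define struct lu_fid,
 * struct getinfo_fid2path (gf_fid, gf_pathlen, gf_path) and OBD_IOC_FID2PATH. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int fid2path_example(int fd, const struct lu_fid *fid)
{
	size_t pathlen = 4096;			/* caller-chosen path buffer size */
	size_t outsize = sizeof(struct getinfo_fid2path) + pathlen;
	struct getinfo_fid2path *gf = calloc(1, outsize);
	int rc;

	if (!gf)
		return -ENOMEM;

	gf->gf_fid = *fid;			/* FID to resolve */
	gf->gf_pathlen = pathlen;		/* checked against PATH_MAX in the kernel */
	rc = ioctl(fd, OBD_IOC_FID2PATH, gf);	/* handled by ll_fid2path() */
	if (rc == 0)
		printf("FID resolves to %s\n", gf->gf_path);
	free(gf);
	return rc;
}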
1776
1777static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1778{
1779 struct ll_user_fiemap *fiemap_s;
1780 size_t num_bytes, ret_bytes;
1781 unsigned int extent_count;
1782 int rc = 0;
1783
1784 /* Get the extent count so we can calculate the size of
 1785 * the required fiemap buffer */
1786 if (get_user(extent_count,
1787 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1788 return -EFAULT;
7bc3dfa3
VO
1789
1790 if (extent_count >=
1791 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1792 return -EINVAL;
d7e09d03
PT
1793 num_bytes = sizeof(*fiemap_s) + (extent_count *
1794 sizeof(struct ll_fiemap_extent));
1795
e958f49b 1796 fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
d7e09d03 1797 if (fiemap_s == NULL)
0a3bdb00 1798 return -ENOMEM;
d7e09d03
PT
1799
1800 /* get the fiemap value */
1801 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
34e1f2bb
JL
1802 sizeof(*fiemap_s))) {
1803 rc = -EFAULT;
1804 goto error;
1805 }
d7e09d03
PT
1806
1807 /* If fm_extent_count is non-zero, read the first extent since
 1808 * it is used to calculate end_offset and device from the previous
1809 * fiemap call. */
1810 if (extent_count) {
1811 if (copy_from_user(&fiemap_s->fm_extents[0],
1812 (char __user *)arg + sizeof(*fiemap_s),
34e1f2bb
JL
1813 sizeof(struct ll_fiemap_extent))) {
1814 rc = -EFAULT;
1815 goto error;
1816 }
d7e09d03
PT
1817 }
1818
1819 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1820 if (rc)
34e1f2bb 1821 goto error;
d7e09d03
PT
1822
1823 ret_bytes = sizeof(struct ll_user_fiemap);
1824
1825 if (extent_count != 0)
1826 ret_bytes += (fiemap_s->fm_mapped_extents *
1827 sizeof(struct ll_fiemap_extent));
1828
1829 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1830 rc = -EFAULT;
1831
1832error:
e958f49b 1833 kvfree(fiemap_s);
0a3bdb00 1834 return rc;
d7e09d03
PT
1835}
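The extent-count-based buffer sizing that ll_ioctl_fiemap() performs on the kernel side has a matching convention on the caller's side: the buffer holds the fiemap header followed by fm_extent_count extents. A minimal user-space sketch using the generic FS_IOC_FIEMAP ioctl (rather than the Lustre-private FSFILT_IOC_FIEMAP number) illustrates that layout; the Lustre path follows the same structure.

/* Generic FIEMAP caller sketch; uses only the standard <linux/fiemap.h> and
 * <linux/fs.h> definitions, not the Lustre-specific ioctl number. */
#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int dump_extents(const char *path, unsigned int extent_count)
{
	size_t len = sizeof(struct fiemap) +
		     extent_count * sizeof(struct fiemap_extent);
	struct fiemap *fm = calloc(1, len);	/* header + extent array */
	unsigned int i;
	int fd, rc;

	if (!fm)
		return -1;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		free(fm);
		return -1;
	}

	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = extent_count;	/* extents that fit in the buffer */

	rc = ioctl(fd, FS_IOC_FIEMAP, fm);
	if (rc == 0)
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("extent %u: logical %llu, length %llu\n", i,
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_length);

	close(fd);
	free(fm);
	return rc;
}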
1836
1837/*
 1838 * Read the data_version for an inode.
 1839 *
 1840 * This value is computed from the stripe object versions on the OSTs.
 1841 * The version is computed using server-side locking.
1842 *
1843 * @param extent_lock Take extent lock. Not needed if a process is already
1844 * holding the OST object group locks.
1845 */
1846int ll_data_version(struct inode *inode, __u64 *data_version,
1847 int extent_lock)
1848{
1849 struct lov_stripe_md *lsm = NULL;
1850 struct ll_sb_info *sbi = ll_i2sbi(inode);
1851 struct obdo *obdo = NULL;
1852 int rc;
d7e09d03
PT
1853
 1854 /* If there is no stripe, we consider the version to be 0. */
1855 lsm = ccc_inode_lsm_get(inode);
5dd16419 1856 if (!lsm_has_objects(lsm)) {
d7e09d03
PT
1857 *data_version = 0;
1858 CDEBUG(D_INODE, "No object for inode\n");
34e1f2bb
JL
1859 rc = 0;
1860 goto out;
d7e09d03
PT
1861 }
1862
496a51bd
JL
1863 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1864 if (!obdo) {
34e1f2bb
JL
1865 rc = -ENOMEM;
1866 goto out;
1867 }
d7e09d03 1868
ef2e0f55 1869 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, obdo, 0, extent_lock);
5dd16419 1870 if (rc == 0) {
d7e09d03
PT
1871 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1872 rc = -EOPNOTSUPP;
1873 else
1874 *data_version = obdo->o_data_version;
1875 }
1876
97903a26 1877 kfree(obdo);
5dd16419 1878out:
d7e09d03 1879 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1880 return rc;
d7e09d03
PT
1881}
1882
48d23e61
JX
1883/*
 1884 * Trigger an HSM release request for the provided inode.
1885 */
1886int ll_hsm_release(struct inode *inode)
1887{
1888 struct cl_env_nest nest;
1889 struct lu_env *env;
1890 struct obd_client_handle *och = NULL;
1891 __u64 data_version = 0;
1892 int rc;
1893
48d23e61
JX
1894 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1895 ll_get_fsname(inode->i_sb, NULL, 0),
1896 PFID(&ll_i2info(inode)->lli_fid));
1897
1898 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
34e1f2bb
JL
1899 if (IS_ERR(och)) {
1900 rc = PTR_ERR(och);
1901 goto out;
1902 }
48d23e61
JX
1903
1904 /* Grab latest data_version and [am]time values */
1905 rc = ll_data_version(inode, &data_version, 1);
1906 if (rc != 0)
34e1f2bb 1907 goto out;
48d23e61
JX
1908
1909 env = cl_env_nested_get(&nest);
34e1f2bb
JL
1910 if (IS_ERR(env)) {
1911 rc = PTR_ERR(env);
1912 goto out;
1913 }
48d23e61
JX
1914
1915 ll_merge_lvb(env, inode);
1916 cl_env_nested_put(&nest, env);
1917
1918 /* Release the file.
1919 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1920 * we still need it to pack l_remote_handle to MDT. */
1921 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1922 &data_version);
1923 och = NULL;
1924
48d23e61
JX
1925out:
1926 if (och != NULL && !IS_ERR(och)) /* close the file */
1927 ll_lease_close(och, inode, NULL);
1928
1929 return rc;
1930}
1931
d7e09d03
PT
1932struct ll_swap_stack {
1933 struct iattr ia1, ia2;
1934 __u64 dv1, dv2;
1935 struct inode *inode1, *inode2;
1936 bool check_dv1, check_dv2;
1937};
1938
1939static int ll_swap_layouts(struct file *file1, struct file *file2,
1940 struct lustre_swap_layouts *lsl)
1941{
1942 struct mdc_swap_layouts msl;
1943 struct md_op_data *op_data;
1944 __u32 gid;
1945 __u64 dv;
1946 struct ll_swap_stack *llss = NULL;
1947 int rc;
1948
496a51bd
JL
1949 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1950 if (!llss)
0a3bdb00 1951 return -ENOMEM;
d7e09d03 1952
2a8a3597
AV
1953 llss->inode1 = file_inode(file1);
1954 llss->inode2 = file_inode(file2);
d7e09d03 1955
34e1f2bb
JL
1956 if (!S_ISREG(llss->inode2->i_mode)) {
1957 rc = -EINVAL;
1958 goto free;
1959 }
d7e09d03 1960
9c5fb72c 1961 if (inode_permission(llss->inode1, MAY_WRITE) ||
34e1f2bb
JL
1962 inode_permission(llss->inode2, MAY_WRITE)) {
1963 rc = -EPERM;
1964 goto free;
1965 }
d7e09d03 1966
34e1f2bb
JL
1967 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1968 rc = -EXDEV;
1969 goto free;
1970 }
d7e09d03
PT
1971
 1972 /* we use 2 bools because they are easier to swap than 2 bits */
1973 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1974 llss->check_dv1 = true;
1975
1976 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1977 llss->check_dv2 = true;
1978
1979 /* we cannot use lsl->sl_dvX directly because we may swap them */
1980 llss->dv1 = lsl->sl_dv1;
1981 llss->dv2 = lsl->sl_dv2;
1982
1983 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
34e1f2bb
JL
1984 if (rc == 0) /* same file, done! */ {
1985 rc = 0;
1986 goto free;
1987 }
d7e09d03
PT
1988
1989 if (rc < 0) { /* sequentialize it */
1990 swap(llss->inode1, llss->inode2);
1991 swap(file1, file2);
1992 swap(llss->dv1, llss->dv2);
1993 swap(llss->check_dv1, llss->check_dv2);
1994 }
1995
1996 gid = lsl->sl_gid;
1997 if (gid != 0) { /* application asks to flush dirty cache */
1998 rc = ll_get_grouplock(llss->inode1, file1, gid);
1999 if (rc < 0)
34e1f2bb 2000 goto free;
d7e09d03
PT
2001
2002 rc = ll_get_grouplock(llss->inode2, file2, gid);
2003 if (rc < 0) {
2004 ll_put_grouplock(llss->inode1, file1, gid);
34e1f2bb 2005 goto free;
d7e09d03
PT
2006 }
2007 }
2008
2009 /* to be able to restore mtime and atime after swap
2010 * we need to first save them */
2011 if (lsl->sl_flags &
2012 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2013 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2014 llss->ia1.ia_atime = llss->inode1->i_atime;
2015 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2016 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2017 llss->ia2.ia_atime = llss->inode2->i_atime;
2018 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2019 }
2020
d0a0acc3 2021 /* ultimate check, before swapping the layouts we check if
d7e09d03
PT
 2022 * the data version has changed (if requested) */
2023 if (llss->check_dv1) {
2024 rc = ll_data_version(llss->inode1, &dv, 0);
2025 if (rc)
34e1f2bb
JL
2026 goto putgl;
2027 if (dv != llss->dv1) {
2028 rc = -EAGAIN;
2029 goto putgl;
2030 }
d7e09d03
PT
2031 }
2032
2033 if (llss->check_dv2) {
2034 rc = ll_data_version(llss->inode2, &dv, 0);
2035 if (rc)
34e1f2bb
JL
2036 goto putgl;
2037 if (dv != llss->dv2) {
2038 rc = -EAGAIN;
2039 goto putgl;
2040 }
d7e09d03
PT
2041 }
2042
 2043 /* struct md_op_data is used to send the swap args to the mdt;
 2044 * only the flags are missing, so we pass struct mdc_swap_layouts
 2045 * through md_op_data->op_data */
 2046 /* flags from user space have to be converted before they are sent to
 2047 * the server; no flag is sent today, they are only used on the client */
2048 msl.msl_flags = 0;
2049 rc = -ENOMEM;
2050 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2051 0, LUSTRE_OPC_ANY, &msl);
34e1f2bb
JL
2052 if (IS_ERR(op_data)) {
2053 rc = PTR_ERR(op_data);
2054 goto free;
2055 }
79a8726a
JH
2056
2057 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2058 sizeof(*op_data), op_data, NULL);
2059 ll_finish_md_op_data(op_data);
d7e09d03
PT
2060
2061putgl:
2062 if (gid != 0) {
2063 ll_put_grouplock(llss->inode2, file2, gid);
2064 ll_put_grouplock(llss->inode1, file1, gid);
2065 }
2066
2067 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2068 if (rc != 0)
34e1f2bb 2069 goto free;
d7e09d03
PT
2070
2071 /* clear useless flags */
2072 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2073 llss->ia1.ia_valid &= ~ATTR_MTIME;
2074 llss->ia2.ia_valid &= ~ATTR_MTIME;
2075 }
2076
2077 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2078 llss->ia1.ia_valid &= ~ATTR_ATIME;
2079 llss->ia2.ia_valid &= ~ATTR_ATIME;
2080 }
2081
2082 /* update time if requested */
2083 rc = 0;
2084 if (llss->ia2.ia_valid != 0) {
5955102c 2085 inode_lock(llss->inode1);
b583043e 2086 rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
5955102c 2087 inode_unlock(llss->inode1);
d7e09d03
PT
2088 }
2089
2090 if (llss->ia1.ia_valid != 0) {
2091 int rc1;
2092
5955102c 2093 inode_lock(llss->inode2);
b583043e 2094 rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
5955102c 2095 inode_unlock(llss->inode2);
d7e09d03
PT
2096 if (rc == 0)
2097 rc = rc1;
2098 }
2099
2100free:
e6b9a3b2 2101 kfree(llss);
d7e09d03 2102
0a3bdb00 2103 return rc;
d7e09d03
PT
2104}
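From the caller's side, the swap above is driven by LL_IOC_LOV_SWAP_LAYOUTS with a struct lustre_swap_layouts. The field names below are the ones this handler reads (sl_fd, sl_gid, sl_flags, sl_dv1); the ioctl number and flag constants are assumed from the Lustre user-space headers, so this is a hedged sketch rather than a verified example.

/* Hypothetical user-space caller of the layout swap; assumes the Lustre user
 * headers provide struct lustre_swap_layouts, SWAP_LAYOUTS_CHECK_DV1 and
 * LL_IOC_LOV_SWAP_LAYOUTS with the meanings used in ll_swap_layouts(). */
#include <linux/types.h>
#include <string.h>
#include <sys/ioctl.h>

static int swap_layouts_example(int fd1, int fd2, __u64 expected_dv1)
{
	struct lustre_swap_layouts lsl;

	memset(&lsl, 0, sizeof(lsl));
	lsl.sl_fd = fd2;			/* the second file of the swap */
	lsl.sl_gid = 1234;			/* non-zero: flush dirty cache under group locks */
	lsl.sl_flags = SWAP_LAYOUTS_CHECK_DV1;	/* fail with -EAGAIN if dv1 changed */
	lsl.sl_dv1 = expected_dv1;

	/* fd1 must not be O_RDONLY; the handler returns -EPERM otherwise */
	return ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
}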
2105
a720b790
JL
2106static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2107{
2108 struct md_op_data *op_data;
2109 int rc;
2110
2742c75e
BF
 2111 /* Detect out-of-range masks */
2112 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2113 return -EINVAL;
2114
a720b790
JL
2115 /* Non-root users are forbidden to set or clear flags which are
2116 * NOT defined in HSM_USER_MASK. */
2117 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2eb90a75 2118 !capable(CFS_CAP_SYS_ADMIN))
a720b790
JL
2119 return -EPERM;
2120
2742c75e
BF
 2121 /* Detect out-of-range archive id */
2122 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2123 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2124 return -EINVAL;
2125
a720b790
JL
2126 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2127 LUSTRE_OPC_ANY, hss);
2128 if (IS_ERR(op_data))
2129 return PTR_ERR(op_data);
2130
2131 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2132 sizeof(*op_data), op_data, NULL);
2133
2134 ll_finish_md_op_data(op_data);
2135
2136 return rc;
2137}
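ll_hsm_import() below shows the in-kernel use of this helper; for completeness, here is a hedged user-space sketch of reaching it through LL_IOC_HSM_STATE_SET, assuming the Lustre user headers define struct hsm_state_set, HSS_SETMASK and HS_DIRTY as used in this file.

/* Hypothetical caller marking a file dirty in HSM terms; everything except
 * the field names checked above is assumed from the Lustre user headers. */
#include <string.h>
#include <sys/ioctl.h>

static int hsm_set_dirty(int fd)
{
	struct hsm_state_set hss;

	memset(&hss, 0, sizeof(hss));
	hss.hss_valid = HSS_SETMASK;	/* only the set-mask is being supplied */
	hss.hss_setmask = HS_DIRTY;	/* must stay within HSM_USER_MASK for non-root */
	return ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
}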
2138
2139static int ll_hsm_import(struct inode *inode, struct file *file,
2140 struct hsm_user_import *hui)
2141{
2142 struct hsm_state_set *hss = NULL;
2143 struct iattr *attr = NULL;
2144 int rc;
2145
a720b790
JL
2146 if (!S_ISREG(inode->i_mode))
2147 return -EINVAL;
2148
2149 /* set HSM flags */
496a51bd 2150 hss = kzalloc(sizeof(*hss), GFP_NOFS);
e6b9a3b2
JL
2151 if (!hss)
2152 return -ENOMEM;
a720b790
JL
2153
2154 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2155 hss->hss_archive_id = hui->hui_archive_id;
2156 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2157 rc = ll_hsm_state_set(inode, hss);
2158 if (rc != 0)
e6b9a3b2 2159 goto free_hss;
a720b790 2160
496a51bd
JL
2161 attr = kzalloc(sizeof(*attr), GFP_NOFS);
2162 if (!attr) {
34e1f2bb 2163 rc = -ENOMEM;
e6b9a3b2 2164 goto free_hss;
34e1f2bb 2165 }
a720b790
JL
2166
2167 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2168 attr->ia_mode |= S_IFREG;
2169 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2170 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2171 attr->ia_size = hui->hui_size;
2172 attr->ia_mtime.tv_sec = hui->hui_mtime;
2173 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2174 attr->ia_atime.tv_sec = hui->hui_atime;
2175 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2176
2177 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2178 ATTR_UID | ATTR_GID |
2179 ATTR_MTIME | ATTR_MTIME_SET |
2180 ATTR_ATIME | ATTR_ATIME_SET;
2181
5955102c 2182 inode_lock(inode);
b6ee56fe 2183
b583043e 2184 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
a720b790
JL
2185 if (rc == -ENODATA)
2186 rc = 0;
2187
5955102c 2188 inode_unlock(inode);
b6ee56fe 2189
e6b9a3b2
JL
2190 kfree(attr);
2191free_hss:
2192 kfree(hss);
a720b790
JL
2193 return rc;
2194}
2195
2d95f10e
JH
2196static long
2197ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
d7e09d03 2198{
2a8a3597 2199 struct inode *inode = file_inode(file);
d7e09d03
PT
2200 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2201 int flags, rc;
d7e09d03
PT
2202
2203 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2204 inode->i_generation, inode, cmd);
2205 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2206
2207 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2208 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
0a3bdb00 2209 return -ENOTTY;
d7e09d03 2210
a58a38ac 2211 switch (cmd) {
d7e09d03
PT
2212 case LL_IOC_GETFLAGS:
2213 /* Get the current value of the file flags */
2214 return put_user(fd->fd_flags, (int *)arg);
2215 case LL_IOC_SETFLAGS:
2216 case LL_IOC_CLRFLAGS:
2217 /* Set or clear specific file flags */
2218 /* XXX This probably needs checks to ensure the flags are
2219 * not abused, and to handle any flag side effects.
2220 */
2221 if (get_user(flags, (int *) arg))
0a3bdb00 2222 return -EFAULT;
d7e09d03
PT
2223
2224 if (cmd == LL_IOC_SETFLAGS) {
2225 if ((flags & LL_FILE_IGNORE_LOCK) &&
2226 !(file->f_flags & O_DIRECT)) {
2d00bd17
JP
2227 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2228 current->comm);
0a3bdb00 2229 return -EINVAL;
d7e09d03
PT
2230 }
2231
2232 fd->fd_flags |= flags;
2233 } else {
2234 fd->fd_flags &= ~flags;
2235 }
0a3bdb00 2236 return 0;
d7e09d03 2237 case LL_IOC_LOV_SETSTRIPE:
0a3bdb00 2238 return ll_lov_setstripe(inode, file, arg);
d7e09d03 2239 case LL_IOC_LOV_SETEA:
0a3bdb00 2240 return ll_lov_setea(inode, file, arg);
d7e09d03
PT
2241 case LL_IOC_LOV_SWAP_LAYOUTS: {
2242 struct file *file2;
2243 struct lustre_swap_layouts lsl;
2244
2245 if (copy_from_user(&lsl, (char *)arg,
2246 sizeof(struct lustre_swap_layouts)))
0a3bdb00 2247 return -EFAULT;
d7e09d03
PT
2248
2249 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
0a3bdb00 2250 return -EPERM;
d7e09d03
PT
2251
2252 file2 = fget(lsl.sl_fd);
2253 if (file2 == NULL)
0a3bdb00 2254 return -EBADF;
d7e09d03
PT
2255
2256 rc = -EPERM;
2257 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2258 rc = ll_swap_layouts(file, file2, &lsl);
2259 fput(file2);
0a3bdb00 2260 return rc;
d7e09d03
PT
2261 }
2262 case LL_IOC_LOV_GETSTRIPE:
0a3bdb00 2263 return ll_lov_getstripe(inode, arg);
d7e09d03 2264 case LL_IOC_RECREATE_OBJ:
0a3bdb00 2265 return ll_lov_recreate_obj(inode, arg);
d7e09d03 2266 case LL_IOC_RECREATE_FID:
0a3bdb00 2267 return ll_lov_recreate_fid(inode, arg);
d7e09d03 2268 case FSFILT_IOC_FIEMAP:
0a3bdb00 2269 return ll_ioctl_fiemap(inode, arg);
d7e09d03
PT
2270 case FSFILT_IOC_GETFLAGS:
2271 case FSFILT_IOC_SETFLAGS:
0a3bdb00 2272 return ll_iocontrol(inode, file, cmd, arg);
d7e09d03
PT
2273 case FSFILT_IOC_GETVERSION_OLD:
2274 case FSFILT_IOC_GETVERSION:
0a3bdb00 2275 return put_user(inode->i_generation, (int *)arg);
d7e09d03 2276 case LL_IOC_GROUP_LOCK:
0a3bdb00 2277 return ll_get_grouplock(inode, file, arg);
d7e09d03 2278 case LL_IOC_GROUP_UNLOCK:
0a3bdb00 2279 return ll_put_grouplock(inode, file, arg);
d7e09d03 2280 case IOC_OBD_STATFS:
0a3bdb00 2281 return ll_obd_statfs(inode, (void *)arg);
d7e09d03
PT
2282
2283 /* We need to special case any other ioctls we want to handle,
2284 * to send them to the MDS/OST as appropriate and to properly
2285 * network encode the arg field.
2286 case FSFILT_IOC_SETVERSION_OLD:
2287 case FSFILT_IOC_SETVERSION:
2288 */
2289 case LL_IOC_FLUSHCTX:
0a3bdb00 2290 return ll_flush_ctx(inode);
d7e09d03
PT
2291 case LL_IOC_PATH2FID: {
2292 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2293 sizeof(struct lu_fid)))
0a3bdb00 2294 return -EFAULT;
d7e09d03 2295
0a3bdb00 2296 return 0;
d7e09d03
PT
2297 }
2298 case OBD_IOC_FID2PATH:
0a3bdb00 2299 return ll_fid2path(inode, (void *)arg);
d7e09d03
PT
2300 case LL_IOC_DATA_VERSION: {
2301 struct ioc_data_version idv;
2302 int rc;
2303
2304 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
0a3bdb00 2305 return -EFAULT;
d7e09d03
PT
2306
2307 rc = ll_data_version(inode, &idv.idv_version,
2308 !(idv.idv_flags & LL_DV_NOFLUSH));
2309
2310 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
0a3bdb00 2311 return -EFAULT;
d7e09d03 2312
0a3bdb00 2313 return rc;
d7e09d03
PT
2314 }
2315
2316 case LL_IOC_GET_MDTIDX: {
2317 int mdtidx;
2318
2319 mdtidx = ll_get_mdt_idx(inode);
2320 if (mdtidx < 0)
0a3bdb00 2321 return mdtidx;
d7e09d03 2322
bdbb0512 2323 if (put_user((int)mdtidx, (int *)arg))
0a3bdb00 2324 return -EFAULT;
d7e09d03 2325
0a3bdb00 2326 return 0;
d7e09d03
PT
2327 }
2328 case OBD_IOC_GETDTNAME:
2329 case OBD_IOC_GETMDNAME:
0a3bdb00 2330 return ll_get_obd_name(inode, cmd, arg);
d7e09d03
PT
2331 case LL_IOC_HSM_STATE_GET: {
2332 struct md_op_data *op_data;
2333 struct hsm_user_state *hus;
2334 int rc;
2335
496a51bd
JL
2336 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2337 if (!hus)
0a3bdb00 2338 return -ENOMEM;
d7e09d03
PT
2339
2340 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2341 LUSTRE_OPC_ANY, hus);
79a8726a 2342 if (IS_ERR(op_data)) {
97903a26 2343 kfree(hus);
0a3bdb00 2344 return PTR_ERR(op_data);
d7e09d03
PT
2345 }
2346
2347 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2348 op_data, NULL);
2349
2350 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2351 rc = -EFAULT;
2352
2353 ll_finish_md_op_data(op_data);
97903a26 2354 kfree(hus);
0a3bdb00 2355 return rc;
d7e09d03
PT
2356 }
2357 case LL_IOC_HSM_STATE_SET: {
d7e09d03
PT
2358 struct hsm_state_set *hss;
2359 int rc;
2360
0c027bc3
AH
2361 hss = memdup_user((char *)arg, sizeof(*hss));
2362 if (IS_ERR(hss))
2363 return PTR_ERR(hss);
d7e09d03 2364
a720b790 2365 rc = ll_hsm_state_set(inode, hss);
d7e09d03 2366
97903a26 2367 kfree(hss);
0a3bdb00 2368 return rc;
d7e09d03
PT
2369 }
2370 case LL_IOC_HSM_ACTION: {
2371 struct md_op_data *op_data;
2372 struct hsm_current_action *hca;
2373 int rc;
2374
496a51bd
JL
2375 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2376 if (!hca)
0a3bdb00 2377 return -ENOMEM;
d7e09d03
PT
2378
2379 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2380 LUSTRE_OPC_ANY, hca);
79a8726a 2381 if (IS_ERR(op_data)) {
97903a26 2382 kfree(hca);
0a3bdb00 2383 return PTR_ERR(op_data);
d7e09d03
PT
2384 }
2385
2386 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2387 op_data, NULL);
2388
2389 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2390 rc = -EFAULT;
2391
2392 ll_finish_md_op_data(op_data);
97903a26 2393 kfree(hca);
0a3bdb00 2394 return rc;
d7e09d03 2395 }
d3a8a4e2
JX
2396 case LL_IOC_SET_LEASE: {
2397 struct ll_inode_info *lli = ll_i2info(inode);
2398 struct obd_client_handle *och = NULL;
2399 bool lease_broken;
2400 fmode_t mode = 0;
2401
2402 switch (arg) {
2403 case F_WRLCK:
2404 if (!(file->f_mode & FMODE_WRITE))
2405 return -EPERM;
2406 mode = FMODE_WRITE;
2407 break;
2408 case F_RDLCK:
2409 if (!(file->f_mode & FMODE_READ))
2410 return -EPERM;
2411 mode = FMODE_READ;
2412 break;
2413 case F_UNLCK:
2414 mutex_lock(&lli->lli_och_mutex);
2415 if (fd->fd_lease_och != NULL) {
2416 och = fd->fd_lease_och;
2417 fd->fd_lease_och = NULL;
2418 }
2419 mutex_unlock(&lli->lli_och_mutex);
2420
2421 if (och != NULL) {
2422 mode = och->och_flags &
2423 (FMODE_READ|FMODE_WRITE);
2424 rc = ll_lease_close(och, inode, &lease_broken);
2425 if (rc == 0 && lease_broken)
2426 mode = 0;
2427 } else {
2428 rc = -ENOLCK;
2429 }
2430
2431 /* return the type of lease or error */
2432 return rc < 0 ? rc : (int)mode;
2433 default:
2434 return -EINVAL;
2435 }
2436
2437 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2438
2439 /* apply for lease */
48d23e61 2440 och = ll_lease_open(inode, file, mode, 0);
d3a8a4e2
JX
2441 if (IS_ERR(och))
2442 return PTR_ERR(och);
2443
2444 rc = 0;
2445 mutex_lock(&lli->lli_och_mutex);
2446 if (fd->fd_lease_och == NULL) {
2447 fd->fd_lease_och = och;
2448 och = NULL;
2449 }
2450 mutex_unlock(&lli->lli_och_mutex);
2451 if (och != NULL) {
2452 /* impossible now that only excl is supported for now */
2453 ll_lease_close(och, inode, &lease_broken);
2454 rc = -EBUSY;
2455 }
2456 return rc;
2457 }
2458 case LL_IOC_GET_LEASE: {
2459 struct ll_inode_info *lli = ll_i2info(inode);
2460 struct ldlm_lock *lock = NULL;
2461
2462 rc = 0;
2463 mutex_lock(&lli->lli_och_mutex);
2464 if (fd->fd_lease_och != NULL) {
2465 struct obd_client_handle *och = fd->fd_lease_och;
2466
2467 lock = ldlm_handle2lock(&och->och_lease_handle);
2468 if (lock != NULL) {
2469 lock_res_and_lock(lock);
2470 if (!ldlm_is_cancel(lock))
2471 rc = och->och_flags &
2472 (FMODE_READ | FMODE_WRITE);
2473 unlock_res_and_lock(lock);
2474 ldlm_lock_put(lock);
2475 }
2476 }
2477 mutex_unlock(&lli->lli_och_mutex);
a720b790
JL
2478 return rc;
2479 }
2480 case LL_IOC_HSM_IMPORT: {
2481 struct hsm_user_import *hui;
2482
0c027bc3
AH
2483 hui = memdup_user((void *)arg, sizeof(*hui));
2484 if (IS_ERR(hui))
2485 return PTR_ERR(hui);
a720b790
JL
2486
2487 rc = ll_hsm_import(inode, file, hui);
d3a8a4e2 2488
97903a26 2489 kfree(hui);
d3a8a4e2
JX
2490 return rc;
2491 }
d7e09d03
PT
2492 default: {
2493 int err;
2494
1f6eaf83
JL
2495 if (ll_iocontrol_call(inode, file, cmd, arg, &err) ==
2496 LLIOC_STOP)
0a3bdb00 2497 return err;
d7e09d03 2498
0a3bdb00
GKH
2499 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2500 (void *)arg);
d7e09d03
PT
2501 }
2502 }
2503}
2504
2d95f10e 2505static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
d7e09d03 2506{
2a8a3597 2507 struct inode *inode = file_inode(file);
d7e09d03
PT
2508 loff_t retval, eof = 0;
2509
d7e09d03
PT
2510 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2511 (origin == SEEK_CUR) ? file->f_pos : 0);
2512 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2513 inode->i_ino, inode->i_generation, inode, retval, retval,
2514 origin);
2515 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2516
2517 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2518 retval = ll_glimpse_size(inode);
2519 if (retval != 0)
0a3bdb00 2520 return retval;
d7e09d03
PT
2521 eof = i_size_read(inode);
2522 }
2523
6f014339 2524 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2525 ll_file_maxbytes(inode), eof);
0a3bdb00 2526 return retval;
d7e09d03
PT
2527}
2528
2d95f10e 2529static int ll_flush(struct file *file, fl_owner_t id)
d7e09d03 2530{
2a8a3597 2531 struct inode *inode = file_inode(file);
d7e09d03
PT
2532 struct ll_inode_info *lli = ll_i2info(inode);
2533 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2534 int rc, err;
2535
2536 LASSERT(!S_ISDIR(inode->i_mode));
2537
2538 /* catch async errors that were recorded back when async writeback
2539 * failed for pages in this mapping. */
2540 rc = lli->lli_async_rc;
2541 lli->lli_async_rc = 0;
2542 err = lov_read_and_clear_async_rc(lli->lli_clob);
2543 if (rc == 0)
2544 rc = err;
2545
 2546 /* The application has already been told about the write failure.
 2547 * Do not report it again. */
2548 if (fd->fd_write_failed)
2549 return 0;
2550 return rc ? -EIO : 0;
2551}
2552
2553/**
 2554 * Called to make sure a portion of a file has been written out.
05289927 2555 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
d7e09d03
PT
2556 *
2557 * Return how many pages have been written.
2558 */
2559int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
65fb55d1 2560 enum cl_fsync_mode mode, int ignore_layout)
d7e09d03
PT
2561{
2562 struct cl_env_nest nest;
2563 struct lu_env *env;
2564 struct cl_io *io;
d7e09d03
PT
2565 struct cl_fsync_io *fio;
2566 int result;
d7e09d03
PT
2567
2568 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2569 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
0a3bdb00 2570 return -EINVAL;
d7e09d03
PT
2571
2572 env = cl_env_nested_get(&nest);
2573 if (IS_ERR(env))
0a3bdb00 2574 return PTR_ERR(env);
d7e09d03 2575
d7e09d03
PT
2576 io = ccc_env_thread_io(env);
2577 io->ci_obj = cl_i2info(inode)->lli_clob;
65fb55d1 2578 io->ci_ignore_layout = ignore_layout;
d7e09d03
PT
2579
2580 /* initialize parameters for sync */
2581 fio = &io->u.ci_fsync;
d7e09d03
PT
2582 fio->fi_start = start;
2583 fio->fi_end = end;
2584 fio->fi_fid = ll_inode2fid(inode);
2585 fio->fi_mode = mode;
2586 fio->fi_nr_written = 0;
2587
2588 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2589 result = cl_io_loop(env, io);
2590 else
2591 result = io->ci_result;
2592 if (result == 0)
2593 result = fio->fi_nr_written;
2594 cl_io_fini(env, io);
2595 cl_env_nested_put(&nest, env);
2596
0a3bdb00 2597 return result;
d7e09d03
PT
2598}
2599
d7e09d03
PT
2600int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2601{
2a8a3597 2602 struct inode *inode = file_inode(file);
d7e09d03
PT
2603 struct ll_inode_info *lli = ll_i2info(inode);
2604 struct ptlrpc_request *req;
d7e09d03 2605 int rc, err;
d7e09d03
PT
2606
2607 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2608 inode->i_generation, inode);
2609 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2610
2611 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
5955102c 2612 inode_lock(inode);
d7e09d03
PT
2613
2614 /* catch async errors that were recorded back when async writeback
2615 * failed for pages in this mapping. */
2616 if (!S_ISDIR(inode->i_mode)) {
2617 err = lli->lli_async_rc;
2618 lli->lli_async_rc = 0;
2619 if (rc == 0)
2620 rc = err;
2621 err = lov_read_and_clear_async_rc(lli->lli_clob);
2622 if (rc == 0)
2623 rc = err;
2624 }
2625
ef2e0f55 2626 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
d7e09d03
PT
2627 if (!rc)
2628 rc = err;
2629 if (!err)
2630 ptlrpc_req_finished(req);
2631
8d97deb9 2632 if (S_ISREG(inode->i_mode)) {
d7e09d03
PT
2633 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2634
05289927 2635 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
d7e09d03
PT
2636 if (rc == 0 && err < 0)
2637 rc = err;
2638 if (rc < 0)
2639 fd->fd_write_failed = true;
2640 else
2641 fd->fd_write_failed = false;
2642 }
2643
5955102c 2644 inode_unlock(inode);
0a3bdb00 2645 return rc;
d7e09d03
PT
2646}
2647
2d95f10e
JH
2648static int
2649ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03 2650{
2a8a3597 2651 struct inode *inode = file_inode(file);
d7e09d03 2652 struct ll_sb_info *sbi = ll_i2sbi(inode);
f2145eae
BK
2653 struct ldlm_enqueue_info einfo = {
2654 .ei_type = LDLM_FLOCK,
2655 .ei_cb_cp = ldlm_flock_completion_ast,
2656 .ei_cbdata = file_lock,
2657 };
d7e09d03
PT
2658 struct md_op_data *op_data;
2659 struct lustre_handle lockh = {0};
8369cfff 2660 ldlm_policy_data_t flock = { {0} };
875332d4 2661 __u64 flags = 0;
d7e09d03
PT
2662 int rc;
2663 int rc2 = 0;
d7e09d03
PT
2664
2665 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2666 inode->i_ino, file_lock);
2667
2668 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2669
130d1f95 2670 if (file_lock->fl_flags & FL_FLOCK)
d7e09d03 2671 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
130d1f95 2672 else if (!(file_lock->fl_flags & FL_POSIX))
0a3bdb00 2673 return -EINVAL;
130d1f95
JL
2674
2675 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
d7e09d03 2676 flock.l_flock.pid = file_lock->fl_pid;
130d1f95
JL
2677 flock.l_flock.start = file_lock->fl_start;
2678 flock.l_flock.end = file_lock->fl_end;
d7e09d03
PT
2679
2680 /* Somewhat ugly workaround for svc lockd.
 2681 * lockd installs a custom fl_lmops->lm_compare_owner that checks
 2682 * that the fl_owner is the same (which it always is on the local node,
 2683 * presumably between lockd processes) and then compares the pid.
 2684 * As such we assign the pid to the owner field to make it all work;
 2685 * conflicts with normal locks are unlikely since the pid space and
 2686 * the pointer space for current->files do not intersect */
2687 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2688 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2689
2690 switch (file_lock->fl_type) {
2691 case F_RDLCK:
2692 einfo.ei_mode = LCK_PR;
2693 break;
2694 case F_UNLCK:
2695 /* An unlock request may or may not have any relation to
2696 * existing locks so we may not be able to pass a lock handle
2697 * via a normal ldlm_lock_cancel() request. The request may even
2698 * unlock a byte range in the middle of an existing lock. In
2699 * order to process an unlock request we need all of the same
2700 * information that is given with a normal read or write record
2701 * lock request. To avoid creating another ldlm unlock (cancel)
2702 * message we'll treat a LCK_NL flock request as an unlock. */
2703 einfo.ei_mode = LCK_NL;
2704 break;
2705 case F_WRLCK:
2706 einfo.ei_mode = LCK_PW;
2707 break;
2708 default:
2709 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2710 file_lock->fl_type);
0a3bdb00 2711 return -ENOTSUPP;
d7e09d03
PT
2712 }
2713
2714 switch (cmd) {
2715 case F_SETLKW:
2716#ifdef F_SETLKW64
2717 case F_SETLKW64:
2718#endif
2719 flags = 0;
2720 break;
2721 case F_SETLK:
2722#ifdef F_SETLK64
2723 case F_SETLK64:
2724#endif
2725 flags = LDLM_FL_BLOCK_NOWAIT;
2726 break;
2727 case F_GETLK:
2728#ifdef F_GETLK64
2729 case F_GETLK64:
2730#endif
2731 flags = LDLM_FL_TEST_LOCK;
2732 /* Save the old mode so that if the mode in the lock changes we
2733 * can decrement the appropriate reader or writer refcount. */
2734 file_lock->fl_type = einfo.ei_mode;
2735 break;
2736 default:
2737 CERROR("unknown fcntl lock command: %d\n", cmd);
0a3bdb00 2738 return -EINVAL;
d7e09d03
PT
2739 }
2740
2741 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2742 LUSTRE_OPC_ANY, NULL);
2743 if (IS_ERR(op_data))
0a3bdb00 2744 return PTR_ERR(op_data);
d7e09d03 2745
b0f5aad5
GKH
2746 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2747 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2748 flock.l_flock.start, flock.l_flock.end);
d7e09d03
PT
2749
2750 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2751 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2752
4f656367 2753 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
d7e09d03 2754 !(flags & LDLM_FL_TEST_LOCK))
4f656367 2755 rc2 = locks_lock_file_wait(file, file_lock);
d7e09d03
PT
2756
2757 if (rc2 && file_lock->fl_type != F_UNLCK) {
2758 einfo.ei_mode = LCK_NL;
2759 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2760 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2761 rc = rc2;
2762 }
2763
2764 ll_finish_md_op_data(op_data);
2765
0a3bdb00 2766 return rc;
d7e09d03
PT
2767}
2768
2d95f10e
JH
2769static int
2770ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03 2771{
0a3bdb00 2772 return -ENOSYS;
d7e09d03
PT
2773}
2774
2775/**
 2776 * Test if some locks matching bits and l_req_mode are acquired
 2777 * - the bits can be spread across different locks
 2778 * - if found, clear the common lock bits in *bits
 2779 * - the bits not found are kept in *bits
 2780 * \param inode [IN]
 2781 * \param bits [IN] searched lock bits
 2782 * \param l_req_mode [IN] searched lock mode
2783 * \retval boolean, true iff all bits are found
2784 */
2785int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2786{
2787 struct lustre_handle lockh;
2788 ldlm_policy_data_t policy;
2789 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2790 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2791 struct lu_fid *fid;
2792 __u64 flags;
2793 int i;
d7e09d03
PT
2794
2795 if (!inode)
ef075edc 2796 return 0;
d7e09d03
PT
2797
2798 fid = &ll_i2info(inode)->lli_fid;
2799 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2800 ldlm_lockname[mode]);
2801
2802 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
1253b2e8 2803 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
d7e09d03
PT
2804 policy.l_inodebits.bits = *bits & (1 << i);
2805 if (policy.l_inodebits.bits == 0)
2806 continue;
2807
2808 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2809 &policy, mode, &lockh)) {
2810 struct ldlm_lock *lock;
2811
2812 lock = ldlm_handle2lock(&lockh);
2813 if (lock) {
2814 *bits &=
2815 ~(lock->l_policy_data.l_inodebits.bits);
2816 LDLM_LOCK_PUT(lock);
2817 } else {
2818 *bits &= ~policy.l_inodebits.bits;
2819 }
2820 }
2821 }
0a3bdb00 2822 return *bits == 0;
d7e09d03
PT
2823}
2824
2825ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
7fc1f831
AP
2826 struct lustre_handle *lockh, __u64 flags,
2827 ldlm_mode_t mode)
d7e09d03 2828{
57303e76 2829 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
d7e09d03
PT
2830 struct lu_fid *fid;
2831 ldlm_mode_t rc;
d7e09d03
PT
2832
2833 fid = &ll_i2info(inode)->lli_fid;
2834 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2835
1f6eaf83 2836 rc = md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
7fc1f831
AP
2837 fid, LDLM_IBITS, &policy, mode, lockh);
2838
0a3bdb00 2839 return rc;
d7e09d03
PT
2840}
2841
2842static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2843{
2844 /* Already unlinked. Just update nlink and return success */
2845 if (rc == -ENOENT) {
2846 clear_nlink(inode);
2847 /* This path cannot be hit for regular files unless in
bef31c78
MI
2848 * case of obscure races, so no need to validate size.
2849 */
d7e09d03
PT
2850 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2851 return 0;
2852 } else if (rc != 0) {
e49634bb
AD
2853 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2854 "%s: revalidate FID "DFID" error: rc = %d\n",
2855 ll_get_fsname(inode->i_sb, NULL, 0),
2856 PFID(ll_inode2fid(inode)), rc);
d7e09d03
PT
2857 }
2858
2859 return rc;
2860}
2861
2d95f10e 2862static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
d7e09d03 2863{
2b0143b5 2864 struct inode *inode = d_inode(dentry);
d7e09d03
PT
2865 struct ptlrpc_request *req = NULL;
2866 struct obd_export *exp;
2867 int rc = 0;
d7e09d03
PT
2868
2869 LASSERT(inode != NULL);
2870
09561a53
AV
2871 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2872 inode->i_ino, inode->i_generation, inode, dentry);
d7e09d03
PT
2873
2874 exp = ll_i2mdexp(inode);
2875
2876 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2877 * But under CMD case, it caused some lock issues, should be fixed
2878 * with new CMD ibits lock. See bug 12718 */
2879 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2880 struct lookup_intent oit = { .it_op = IT_GETATTR };
2881 struct md_op_data *op_data;
2882
2883 if (ibits == MDS_INODELOCK_LOOKUP)
2884 oit.it_op = IT_LOOKUP;
2885
2886 /* Call getattr by fid, so do not provide name at all. */
dbca51dd
AV
2887 op_data = ll_prep_md_op_data(NULL, inode,
2888 inode, NULL, 0, 0,
d7e09d03
PT
2889 LUSTRE_OPC_ANY, NULL);
2890 if (IS_ERR(op_data))
0a3bdb00 2891 return PTR_ERR(op_data);
d7e09d03
PT
2892
2893 oit.it_create_mode |= M_CHECK_STALE;
2894 rc = md_intent_lock(exp, op_data, NULL, 0,
2895 /* we are not interested in name
2896 based lookup */
2897 &oit, 0, &req,
2898 ll_md_blocking_ast, 0);
2899 ll_finish_md_op_data(op_data);
2900 oit.it_create_mode &= ~M_CHECK_STALE;
2901 if (rc < 0) {
2902 rc = ll_inode_revalidate_fini(inode, rc);
34e1f2bb 2903 goto out;
d7e09d03
PT
2904 }
2905
dbca51dd 2906 rc = ll_revalidate_it_finish(req, &oit, inode);
d7e09d03
PT
2907 if (rc != 0) {
2908 ll_intent_release(&oit);
34e1f2bb 2909 goto out;
d7e09d03
PT
2910 }
2911
2912 /* Unlinked? Unhash dentry, so it is not picked up later by
2913 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2914 here to preserve get_cwd functionality on 2.6.
2915 Bug 10503 */
2b0143b5 2916 if (!d_inode(dentry)->i_nlink)
b1d2a127 2917 d_lustre_invalidate(dentry, 0);
d7e09d03 2918
dbca51dd 2919 ll_lookup_finish_locks(&oit, inode);
2b0143b5
DH
2920 } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
2921 struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
21aef7d9 2922 u64 valid = OBD_MD_FLGETATTR;
d7e09d03
PT
2923 struct md_op_data *op_data;
2924 int ealen = 0;
2925
2926 if (S_ISREG(inode->i_mode)) {
44779340 2927 rc = ll_get_default_mdsize(sbi, &ealen);
d7e09d03 2928 if (rc)
0a3bdb00 2929 return rc;
d7e09d03
PT
2930 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2931 }
2932
2933 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2934 0, ealen, LUSTRE_OPC_ANY,
2935 NULL);
2936 if (IS_ERR(op_data))
0a3bdb00 2937 return PTR_ERR(op_data);
d7e09d03
PT
2938
2939 op_data->op_valid = valid;
d7e09d03
PT
2940 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2941 ll_finish_md_op_data(op_data);
2942 if (rc) {
2943 rc = ll_inode_revalidate_fini(inode, rc);
0a3bdb00 2944 return rc;
d7e09d03
PT
2945 }
2946
2947 rc = ll_prep_inode(&inode, req, NULL, NULL);
2948 }
2949out:
2950 ptlrpc_req_finished(req);
2951 return rc;
2952}
2953
2d95f10e 2954static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
d7e09d03 2955{
2b0143b5 2956 struct inode *inode = d_inode(dentry);
d7e09d03 2957 int rc;
d7e09d03 2958
2d95f10e 2959 rc = __ll_inode_revalidate(dentry, ibits);
d7e09d03 2960 if (rc != 0)
0a3bdb00 2961 return rc;
d7e09d03
PT
2962
2963 /* if object isn't regular file, don't validate size */
2964 if (!S_ISREG(inode->i_mode)) {
2965 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2966 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2967 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2968 } else {
5ea17d6c
JL
 2969 /* In case of restore, the MDT has the right size and has
 2970 * already sent it back without granting the layout lock;
 2971 * the inode is up-to-date, so a glimpse is useless.
 2972 * Also, to glimpse we need the layout: in case of a running
 2973 * restore the MDT holds the layout lock, so the glimpse will
 2974 * block up to the end of restore (getattr will block)
2975 */
2976 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2977 rc = ll_glimpse_size(inode);
d7e09d03 2978 }
0a3bdb00 2979 return rc;
d7e09d03
PT
2980}
2981
2d95f10e 2982int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
d7e09d03 2983{
2b0143b5 2984 struct inode *inode = d_inode(de);
d7e09d03
PT
2985 struct ll_sb_info *sbi = ll_i2sbi(inode);
2986 struct ll_inode_info *lli = ll_i2info(inode);
f82ced5d 2987 int res;
d7e09d03 2988
2d95f10e
JH
2989 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
2990 MDS_INODELOCK_LOOKUP);
d7e09d03
PT
2991 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2992
2993 if (res)
2994 return res;
2995
2996 stat->dev = inode->i_sb->s_dev;
2997 if (ll_need_32bit_api(sbi))
2998 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2999 else
3000 stat->ino = inode->i_ino;
3001 stat->mode = inode->i_mode;
3002 stat->nlink = inode->i_nlink;
3003 stat->uid = inode->i_uid;
3004 stat->gid = inode->i_gid;
3005 stat->rdev = inode->i_rdev;
3006 stat->atime = inode->i_atime;
3007 stat->mtime = inode->i_mtime;
3008 stat->ctime = inode->i_ctime;
3009 stat->blksize = 1 << inode->i_blkbits;
3010
3011 stat->size = i_size_read(inode);
3012 stat->blocks = inode->i_blocks;
3013
3014 return 0;
3015}
d7e09d03 3016
2d95f10e
JH
3017static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3018 __u64 start, __u64 len)
89580e37
PT
3019{
3020 int rc;
3021 size_t num_bytes;
3022 struct ll_user_fiemap *fiemap;
3023 unsigned int extent_count = fieinfo->fi_extents_max;
3024
3025 num_bytes = sizeof(*fiemap) + (extent_count *
3026 sizeof(struct ll_fiemap_extent));
e958f49b 3027 fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
89580e37
PT
3028
3029 if (fiemap == NULL)
3030 return -ENOMEM;
3031
3032 fiemap->fm_flags = fieinfo->fi_flags;
3033 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3034 fiemap->fm_start = start;
3035 fiemap->fm_length = len;
ebdc4fc5
BJ
3036 if (extent_count > 0)
3037 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3038 sizeof(struct ll_fiemap_extent));
89580e37
PT
3039
3040 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3041
3042 fieinfo->fi_flags = fiemap->fm_flags;
3043 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
ebdc4fc5
BJ
3044 if (extent_count > 0)
3045 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3046 fiemap->fm_mapped_extents *
3047 sizeof(struct ll_fiemap_extent));
89580e37 3048
e958f49b 3049 kvfree(fiemap);
89580e37
PT
3050 return rc;
3051}
d7e09d03 3052
2d95f10e 3053struct posix_acl *ll_get_acl(struct inode *inode, int type)
d7e09d03
PT
3054{
3055 struct ll_inode_info *lli = ll_i2info(inode);
3056 struct posix_acl *acl = NULL;
d7e09d03
PT
3057
3058 spin_lock(&lli->lli_lock);
3059 /* VFS' acl_permission_check->check_acl will release the refcount */
3060 acl = posix_acl_dup(lli->lli_posix_acl);
3061 spin_unlock(&lli->lli_lock);
3062
0a3bdb00 3063 return acl;
d7e09d03
PT
3064}
3065
d7e09d03
PT
3066int ll_inode_permission(struct inode *inode, int mask)
3067{
3068 int rc = 0;
d7e09d03
PT
3069
3070#ifdef MAY_NOT_BLOCK
3071 if (mask & MAY_NOT_BLOCK)
3072 return -ECHILD;
3073#endif
3074
 3075 /* as the root inode is NOT validated in the lookup operation,
 3076 * we need to do it before the permission check. */
3077
f76c23da 3078 if (is_root_inode(inode)) {
2d95f10e
JH
3079 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3080 MDS_INODELOCK_LOOKUP);
d7e09d03 3081 if (rc)
0a3bdb00 3082 return rc;
d7e09d03
PT
3083 }
3084
3085 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3086 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3087
3088 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3089 return lustre_check_remote_perm(inode, mask);
3090
3091 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
8707c96e 3092 rc = generic_permission(inode, mask);
d7e09d03 3093
0a3bdb00 3094 return rc;
d7e09d03
PT
3095}
3096
d7e09d03
PT
3097/* -o localflock - only provides locally consistent flock locks */
3098struct file_operations ll_file_operations = {
b42b15fd 3099 .read_iter = ll_file_read_iter,
b42b15fd 3100 .write_iter = ll_file_write_iter,
d7e09d03
PT
3101 .unlocked_ioctl = ll_file_ioctl,
3102 .open = ll_file_open,
3103 .release = ll_file_release,
3104 .mmap = ll_file_mmap,
3105 .llseek = ll_file_seek,
3106 .splice_read = ll_file_splice_read,
3107 .fsync = ll_fsync,
3108 .flush = ll_flush
3109};
3110
3111struct file_operations ll_file_operations_flock = {
b42b15fd 3112 .read_iter = ll_file_read_iter,
b42b15fd 3113 .write_iter = ll_file_write_iter,
d7e09d03
PT
3114 .unlocked_ioctl = ll_file_ioctl,
3115 .open = ll_file_open,
3116 .release = ll_file_release,
3117 .mmap = ll_file_mmap,
3118 .llseek = ll_file_seek,
3119 .splice_read = ll_file_splice_read,
3120 .fsync = ll_fsync,
3121 .flush = ll_flush,
3122 .flock = ll_file_flock,
3123 .lock = ll_file_flock
3124};
3125
3126/* These are for -o noflock - to return ENOSYS on flock calls */
3127struct file_operations ll_file_operations_noflock = {
b42b15fd 3128 .read_iter = ll_file_read_iter,
b42b15fd 3129 .write_iter = ll_file_write_iter,
d7e09d03
PT
3130 .unlocked_ioctl = ll_file_ioctl,
3131 .open = ll_file_open,
3132 .release = ll_file_release,
3133 .mmap = ll_file_mmap,
3134 .llseek = ll_file_seek,
3135 .splice_read = ll_file_splice_read,
3136 .fsync = ll_fsync,
3137 .flush = ll_flush,
3138 .flock = ll_file_noflock,
3139 .lock = ll_file_noflock
3140};
3141
d2d32738 3142const struct inode_operations ll_file_inode_operations = {
d7e09d03
PT
3143 .setattr = ll_setattr,
3144 .getattr = ll_getattr,
3145 .permission = ll_inode_permission,
3146 .setxattr = ll_setxattr,
3147 .getxattr = ll_getxattr,
3148 .listxattr = ll_listxattr,
3149 .removexattr = ll_removexattr,
89580e37 3150 .fiemap = ll_fiemap,
d7e09d03
PT
3151 .get_acl = ll_get_acl,
3152};
3153
d0a0acc3 3154/* dynamic ioctl number support routines */
d7e09d03
PT
3155static struct llioc_ctl_data {
3156 struct rw_semaphore ioc_sem;
3157 struct list_head ioc_head;
3158} llioc = {
3159 __RWSEM_INITIALIZER(llioc.ioc_sem),
3160 LIST_HEAD_INIT(llioc.ioc_head)
3161};
3162
d7e09d03
PT
3163struct llioc_data {
3164 struct list_head iocd_list;
3165 unsigned int iocd_size;
3166 llioc_callback_t iocd_cb;
3167 unsigned int iocd_count;
3168 unsigned int iocd_cmd[0];
3169};
3170
3171void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3172{
3173 unsigned int size;
3174 struct llioc_data *in_data = NULL;
d7e09d03
PT
3175
3176 if (cb == NULL || cmd == NULL ||
3177 count > LLIOC_MAX_CMD || count < 0)
0a3bdb00 3178 return NULL;
d7e09d03
PT
3179
3180 size = sizeof(*in_data) + count * sizeof(unsigned int);
496a51bd
JL
3181 in_data = kzalloc(size, GFP_NOFS);
3182 if (!in_data)
0a3bdb00 3183 return NULL;
d7e09d03
PT
3184
3185 memset(in_data, 0, sizeof(*in_data));
3186 in_data->iocd_size = size;
3187 in_data->iocd_cb = cb;
3188 in_data->iocd_count = count;
3189 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3190
3191 down_write(&llioc.ioc_sem);
3192 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3193 up_write(&llioc.ioc_sem);
3194
0a3bdb00 3195 return in_data;
d7e09d03 3196}
93133eb4 3197EXPORT_SYMBOL(ll_iocontrol_register);
d7e09d03
PT
3198
3199void ll_iocontrol_unregister(void *magic)
3200{
3201 struct llioc_data *tmp;
3202
3203 if (magic == NULL)
3204 return;
3205
3206 down_write(&llioc.ioc_sem);
3207 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3208 if (tmp == magic) {
d7e09d03
PT
3209 list_del(&tmp->iocd_list);
3210 up_write(&llioc.ioc_sem);
3211
97903a26 3212 kfree(tmp);
d7e09d03
PT
3213 return;
3214 }
3215 }
3216 up_write(&llioc.ioc_sem);
3217
3218 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3219}
d7e09d03
PT
3220EXPORT_SYMBOL(ll_iocontrol_unregister);
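A hedged sketch of how another module might use the exported registration pair above; the llioc_callback_t signature is inferred from the iocd_cb invocation in ll_iocontrol_call() below and should be checked against the real header, and MY_DEMO_IOC is a made-up command number used only for illustration.

/* Hypothetical client of ll_iocontrol_register()/ll_iocontrol_unregister(). */
#define MY_DEMO_IOC	0xC008			/* illustrative command number only */

static enum llioc_iter my_demo_cb(struct inode *inode, struct file *file,
				  unsigned int cmd, unsigned long arg,
				  void *magic, int *rcp)
{
	*rcp = 0;		/* report success for our command */
	return LLIOC_STOP;	/* stop iterating; we handled it */
}

static void *my_demo_magic;

static int my_demo_init(void)
{
	unsigned int cmds[] = { MY_DEMO_IOC };

	my_demo_magic = ll_iocontrol_register(my_demo_cb, ARRAY_SIZE(cmds), cmds);
	return my_demo_magic ? 0 : -ENOMEM;
}

static void my_demo_exit(void)
{
	ll_iocontrol_unregister(my_demo_magic);
}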
3221
2d95f10e
JH
3222static enum llioc_iter
3223ll_iocontrol_call(struct inode *inode, struct file *file,
3224 unsigned int cmd, unsigned long arg, int *rcp)
d7e09d03
PT
3225{
3226 enum llioc_iter ret = LLIOC_CONT;
3227 struct llioc_data *data;
3228 int rc = -EINVAL, i;
3229
3230 down_read(&llioc.ioc_sem);
3231 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3232 for (i = 0; i < data->iocd_count; i++) {
3233 if (cmd != data->iocd_cmd[i])
3234 continue;
3235
3236 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3237 break;
3238 }
3239
3240 if (ret == LLIOC_STOP)
3241 break;
3242 }
3243 up_read(&llioc.ioc_sem);
3244
3245 if (rcp)
3246 *rcp = rc;
3247 return ret;
3248}
3249
3250int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3251{
3252 struct ll_inode_info *lli = ll_i2info(inode);
3253 struct cl_env_nest nest;
3254 struct lu_env *env;
3255 int result;
d7e09d03
PT
3256
3257 if (lli->lli_clob == NULL)
0a3bdb00 3258 return 0;
d7e09d03
PT
3259
3260 env = cl_env_nested_get(&nest);
3261 if (IS_ERR(env))
0a3bdb00 3262 return PTR_ERR(env);
d7e09d03
PT
3263
3264 result = cl_conf_set(env, lli->lli_clob, conf);
3265 cl_env_nested_put(&nest, env);
3266
3267 if (conf->coc_opc == OBJECT_CONF_SET) {
3268 struct ldlm_lock *lock = conf->coc_lock;
3269
3270 LASSERT(lock != NULL);
3271 LASSERT(ldlm_has_layout(lock));
3272 if (result == 0) {
3273 /* it can only be allowed to match after layout is
3274 * applied to inode otherwise false layout would be
d0a0acc3 3275 * seen. Applying layout should happen before dropping
d7e09d03
PT
3276 * the intent lock. */
3277 ldlm_lock_allow_match(lock);
3278 }
3279 }
0a3bdb00 3280 return result;
d7e09d03
PT
3281}
3282
3283/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3284static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3285
3286{
3287 struct ll_sb_info *sbi = ll_i2sbi(inode);
d7e09d03
PT
3288 struct ptlrpc_request *req;
3289 struct mdt_body *body;
3290 void *lvbdata;
3291 void *lmm;
3292 int lmmsize;
3293 int rc;
d7e09d03 3294
e2335e5d 3295 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3296 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3297 lock->l_lvb_data, lock->l_lvb_len);
3298
3299 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
0a3bdb00 3300 return 0;
d7e09d03
PT
3301
3302 /* if layout lock was granted right away, the layout is returned
3303 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3304 * blocked and then granted via completion ast, we have to fetch
 3305 * layout here. Note that we can't use the LVB buffer in the
 3306 * completion AST because it is not large enough */
44779340 3307 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03 3308 if (rc == 0)
ef2e0f55
OD
3309 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3310 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3311 lmmsize, 0, &req);
d7e09d03 3312 if (rc < 0)
0a3bdb00 3313 return rc;
d7e09d03
PT
3314
3315 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
34e1f2bb
JL
3316 if (body == NULL) {
3317 rc = -EPROTO;
3318 goto out;
3319 }
d7e09d03
PT
3320
3321 lmmsize = body->eadatasize;
34e1f2bb
JL
3322 if (lmmsize == 0) /* empty layout */ {
3323 rc = 0;
3324 goto out;
3325 }
d7e09d03
PT
3326
3327 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
34e1f2bb
JL
3328 if (lmm == NULL) {
3329 rc = -EFAULT;
3330 goto out;
3331 }
d7e09d03 3332
e958f49b 3333 lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
34e1f2bb
JL
3334 if (lvbdata == NULL) {
3335 rc = -ENOMEM;
3336 goto out;
3337 }
d7e09d03
PT
3338
3339 memcpy(lvbdata, lmm, lmmsize);
3340 lock_res_and_lock(lock);
e2335e5d 3341 if (lock->l_lvb_data != NULL)
e958f49b 3342 kvfree(lock->l_lvb_data);
e2335e5d 3343
3344 lock->l_lvb_data = lvbdata;
3345 lock->l_lvb_len = lmmsize;
d7e09d03
PT
3346 unlock_res_and_lock(lock);
3347
d7e09d03
PT
3348out:
3349 ptlrpc_req_finished(req);
3350 return rc;
3351}
3352
3353/**
3354 * Apply the layout to the inode. Layout lock is held and will be released
3355 * in this function.
3356 */
3357static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3358 struct inode *inode, __u32 *gen, bool reconf)
3359{
3360 struct ll_inode_info *lli = ll_i2info(inode);
3361 struct ll_sb_info *sbi = ll_i2sbi(inode);
3362 struct ldlm_lock *lock;
3363 struct lustre_md md = { NULL };
3364 struct cl_object_conf conf;
3365 int rc = 0;
3366 bool lvb_ready;
3367 bool wait_layout = false;
d7e09d03
PT
3368
3369 LASSERT(lustre_handle_is_used(lockh));
3370
3371 lock = ldlm_handle2lock(lockh);
3372 LASSERT(lock != NULL);
3373 LASSERT(ldlm_has_layout(lock));
3374
3375 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
e2335e5d 3376 inode, PFID(&lli->lli_fid), reconf);
d7e09d03 3377
bc969176
JL
3378 /* in case this is a caching lock and reinstate with new inode */
3379 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3380
d7e09d03
PT
3381 lock_res_and_lock(lock);
3382 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3383 unlock_res_and_lock(lock);
3384 /* checking lvb_ready is racy but this is okay. The worst case is
 3385 * that multiple processes may configure the file at the same time. */
3386 if (lvb_ready || !reconf) {
3387 rc = -ENODATA;
3388 if (lvb_ready) {
3389 /* layout_gen must be valid if layout lock is not
 3390 * cancelled and the stripe has already been set */
09aed8a5 3391 *gen = ll_layout_version_get(lli);
d7e09d03
PT
3392 rc = 0;
3393 }
34e1f2bb 3394 goto out;
d7e09d03
PT
3395 }
3396
3397 rc = ll_layout_fetch(inode, lock);
3398 if (rc < 0)
34e1f2bb 3399 goto out;
d7e09d03
PT
3400
3401 /* for layout lock, lmm is returned in lock's lvb.
3402 * lvb_data is immutable if the lock is held so it's safe to access it
3403 * without res lock. See the description in ldlm_lock_decref_internal()
3404 * for the condition to free lvb_data of layout lock */
3405 if (lock->l_lvb_data != NULL) {
3406 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3407 lock->l_lvb_data, lock->l_lvb_len);
3408 if (rc >= 0) {
3409 *gen = LL_LAYOUT_GEN_EMPTY;
3410 if (md.lsm != NULL)
3411 *gen = md.lsm->lsm_layout_gen;
3412 rc = 0;
3413 } else {
3414 CERROR("%s: file "DFID" unpackmd error: %d\n",
3415 ll_get_fsname(inode->i_sb, NULL, 0),
3416 PFID(&lli->lli_fid), rc);
3417 }
3418 }
3419 if (rc < 0)
34e1f2bb 3420 goto out;
d7e09d03
PT
3421
 3422 /* set the layout on the file. This is unlikely to fail as the old
 3423 * layout has surely been eliminated */
ec83e611 3424 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3425 conf.coc_opc = OBJECT_CONF_SET;
3426 conf.coc_inode = inode;
3427 conf.coc_lock = lock;
3428 conf.u.coc_md = &md;
3429 rc = ll_layout_conf(inode, &conf);
3430
3431 if (md.lsm != NULL)
3432 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3433
3434 /* refresh layout failed, need to wait */
3435 wait_layout = rc == -EBUSY;
d7e09d03
PT
3436
3437out:
3438 LDLM_LOCK_PUT(lock);
3439 ldlm_lock_decref(lockh, mode);
3440
3441 /* wait for IO to complete if it's still being used. */
3442 if (wait_layout) {
3443 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3444 ll_get_fsname(inode->i_sb, NULL, 0),
3445 inode, PFID(&lli->lli_fid));
3446
ec83e611 3447 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3448 conf.coc_opc = OBJECT_CONF_WAIT;
3449 conf.coc_inode = inode;
3450 rc = ll_layout_conf(inode, &conf);
3451 if (rc == 0)
3452 rc = -EAGAIN;
3453
3454 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3455 PFID(&lli->lli_fid), rc);
3456 }
0a3bdb00 3457 return rc;
d7e09d03
PT
3458}
3459
3460/**
3461 * This function checks if there exists a LAYOUT lock on the client side,
3462 * or enqueues it if it doesn't have one in cache.
3463 *
3464 * This function will not hold layout lock so it may be revoked any time after
3465 * this function returns. Any operations depend on layout should be redone
3466 * in that case.
3467 *
3468 * This function should be called before lov_io_init() to get an uptodate
3469 * layout version, the caller should save the version number and after IO
3470 * is finished, this function should be called again to verify that layout
3471 * is not changed during IO time.
3472 */
3473int ll_layout_refresh(struct inode *inode, __u32 *gen)
3474{
3475 struct ll_inode_info *lli = ll_i2info(inode);
3476 struct ll_sb_info *sbi = ll_i2sbi(inode);
3477 struct md_op_data *op_data;
3478 struct lookup_intent it;
3479 struct lustre_handle lockh;
3480 ldlm_mode_t mode;
f2145eae
BK
3481 struct ldlm_enqueue_info einfo = {
3482 .ei_type = LDLM_IBITS,
3483 .ei_mode = LCK_CR,
3484 .ei_cb_bl = ll_md_blocking_ast,
3485 .ei_cb_cp = ldlm_completion_ast,
3486 };
d7e09d03 3487 int rc;
d7e09d03 3488
09aed8a5
JX
3489 *gen = ll_layout_version_get(lli);
3490 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
0a3bdb00 3491 return 0;
d7e09d03
PT
3492
3493 /* sanity checks */
3494 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3495 LASSERT(S_ISREG(inode->i_mode));
3496
d7e09d03
PT
3497 /* take layout lock mutex to enqueue layout lock exclusively. */
3498 mutex_lock(&lli->lli_layout_mutex);
3499
3500again:
09aed8a5
JX
 3501 /* the layout lock is mostly cached on the local side, so try to match
 3502 * it before grabbing the layout lock mutex. */
7fc1f831
AP
3503 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3504 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
d7e09d03
PT
3505 if (mode != 0) { /* hit cached lock */
3506 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3507 if (rc == -EAGAIN)
3508 goto again;
3509
3510 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3511 return rc;
d7e09d03
PT
3512 }
3513
3514 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3515 0, 0, LUSTRE_OPC_ANY, NULL);
3516 if (IS_ERR(op_data)) {
3517 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3518 return PTR_ERR(op_data);
d7e09d03
PT
3519 }
3520
3521 /* have to enqueue one */
3522 memset(&it, 0, sizeof(it));
3523 it.it_op = IT_LAYOUT;
3524 lockh.cookie = 0ULL;
3525
3526 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3527 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3528 PFID(&lli->lli_fid));
3529
3530 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3531 NULL, 0, NULL, 0);
3532 if (it.d.lustre.it_data != NULL)
3533 ptlrpc_req_finished(it.d.lustre.it_data);
3534 it.d.lustre.it_data = NULL;
3535
3536 ll_finish_md_op_data(op_data);
3537
d7e09d03
PT
3538 mode = it.d.lustre.it_lock_mode;
3539 it.d.lustre.it_lock_mode = 0;
3540 ll_intent_drop_lock(&it);
3541
3542 if (rc == 0) {
3543 /* set lock data in case this is a new lock */
3544 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3545 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3546 if (rc == -EAGAIN)
3547 goto again;
3548 }
3549 mutex_unlock(&lli->lli_layout_mutex);
3550
0a3bdb00 3551 return rc;
d7e09d03 3552}
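The doc comment above prescribes a refresh-before-IO / verify-after-IO pattern. A minimal in-kernel sketch of that calling convention follows; ll_io_do() is a hypothetical placeholder for the actual IO path and does not exist in this file.

/* Hedged sketch of the ll_layout_refresh() calling convention; ll_io_do()
 * stands in for the real IO routine. */
static int ll_io_with_layout_check(struct inode *inode)
{
	__u32 gen_before, gen_after;
	int rc;

	rc = ll_layout_refresh(inode, &gen_before);	/* before lov_io_init() */
	if (rc)
		return rc;

	rc = ll_io_do(inode);				/* layout may be revoked during IO */
	if (rc)
		return rc;

	rc = ll_layout_refresh(inode, &gen_after);	/* verify after IO completes */
	if (rc)
		return rc;

	/* a generation change means the IO ran against a stale layout */
	return gen_before == gen_after ? 0 : -EAGAIN;
}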
5ea17d6c
JL
3553
3554/**
3555 * This function send a restore request to the MDT
3556 */
3557int ll_layout_restore(struct inode *inode)
3558{
3559 struct hsm_user_request *hur;
3560 int len, rc;
3561
3562 len = sizeof(struct hsm_user_request) +
3563 sizeof(struct hsm_user_item);
496a51bd
JL
3564 hur = kzalloc(len, GFP_NOFS);
3565 if (!hur)
5ea17d6c
JL
3566 return -ENOMEM;
3567
3568 hur->hur_request.hr_action = HUA_RESTORE;
3569 hur->hur_request.hr_archive_id = 0;
3570 hur->hur_request.hr_flags = 0;
3571 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3572 sizeof(hur->hur_user_item[0].hui_fid));
3573 hur->hur_user_item[0].hui_extent.length = -1;
3574 hur->hur_request.hr_itemcount = 1;
3575 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3576 len, hur, NULL);
97903a26 3577 kfree(hur);
5ea17d6c
JL
3578 return rc;
3579}