4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
58 fd->fd_write_failed = false;
/*
 * Give a descriptor from ll_file_data_get() back to ll_file_data_slab.
 *
 * NOTE(review): the line that originally sat between the signature and the
 * free is not visible here (possibly a NULL guard) - confirm before relying
 * on ll_file_data_put(NULL) being safe.
 */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the current attributes of @inode into @op_data for an MDS request.
 *
 * Copies the FID, mode, atime, mtime, ctime, size (via i_size_read()) and
 * block count, converts i_flags with ll_inode_to_ext_flags(), and records
 * the inode's IO epoch, the open handle pointed to by @fh and the capability
 * returned by ll_mdscapa_get().  MDS_DATA_MODIFIED is OR-ed into op_bias
 * when the inode has LLIF_DATA_MODIFIED set in lli_flags.
 *
 * NOTE(review): whether @fh may be NULL cannot be told from the lines
 * visible here - confirm before passing NULL.
 */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
/* op_attr is viewed as struct ll_iattr to reach ia_attr_flags. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that file data was modified since the flag was last cleared. */
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
90 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Fill @op_data for closing open handle @och of @inode.
 *
 * Marks mode, atime, mtime and ctime (with their *_SET bits) as valid.
 * ATTR_SIZE and ATTR_BLOCKS are added when exp_connect_som() is false for
 * the MD export or the inode is not a regular file; ll_ioepoch_close() is
 * also called on this path.  The attributes themselves are packed with
 * ll_pack_inode2opdata() and the request data prepared with
 * ll_prep_md_op_data().
 *
 * NOTE(review): the statement executed for handles opened without
 * FMODE_WRITE, and the else-branch structure around ll_ioepoch_close(), are
 * not visible here - confirm against the full source.
 */
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
102 if (!(och->och_flags & FMODE_WRITE))
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
108 ll_ioepoch_close(inode, op_data, &och, 0);
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Close open handle @och of an inode on the MDS.
 *
 * Allocates an md_op_data, fills it with ll_prepare_close() and sends the
 * close through md_close() on @md_exp.  The visible code can then:
 *  - run ll_som_update() to push Size-on-MDS attributes (the epoch must
 *    have been closed on that path, see the LASSERT);
 *  - clear LLIF_DATA_MODIFIED under lli_lock once a close carrying
 *    MDS_DATA_MODIFIED succeeded;
 *  - call ll_objects_destroy() on the reply;
 *  - either queue DONE_WRITING (SOM export, epoch not closed, regular file
 *    opened for write) or drop the open replay data and poison the handle
 *    cookie with DEAD_HANDLE_MAGIC;
 *  - release the close request with ptlrpc_req_finished().
 *
 * NOTE(review): several guarding conditions, the declarations of rc and
 * epoch_close, and the final return are not visible here; the list above
 * names the calls that are present, not the exact control flow.
 */
117 static int ll_close_inode_openhandle(struct obd_export *md_exp,
119 struct obd_client_handle *och)
121 struct obd_export *exp = ll_i2mdexp(inode);
122 struct md_op_data *op_data;
123 struct ptlrpc_request *req = NULL;
124 struct obd_device *obd = class_exp2obd(exp);
131 * XXX: in case of LMV, is this correct to access
134 CERROR("Invalid MDC connection handle "LPX64"\n",
135 ll_i2mdexp(inode)->exp_handle.h_cookie);
139 OBD_ALLOC_PTR(op_data);
141 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
143 ll_prepare_close(inode, op_data, och);
/* Remember whether this close also closes the IO epoch; used below. */
144 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
145 rc = md_close(md_exp, op_data, och->och_mod, &req);
147 /* This close must have the epoch closed. */
148 LASSERT(epoch_close);
149 /* MDS has instructed us to obtain Size-on-MDS attribute from
150 * OSTs and send setattr to back to MDS. */
151 rc = ll_som_update(inode, op_data);
153 CERROR("inode %lu mdc Size-on-MDS update failed: "
154 "rc = %d\n", inode->i_ino, rc);
158 CERROR("inode %lu mdc close failed: rc = %d\n",
162 /* DATA_MODIFIED flag was successfully sent on close, cancel data
163 * modification flag. */
164 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
165 struct ll_inode_info *lli = ll_i2info(inode);
167 spin_lock(&lli->lli_lock);
168 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
169 spin_unlock(&lli->lli_lock);
172 ll_finish_md_op_data(op_data);
175 rc = ll_objects_destroy(req, inode);
177 CERROR("inode %lu ll_objects destroy: rc = %d\n",
184 if (exp_connect_som(exp) && !epoch_close &&
185 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
186 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
188 md_clear_open_replay_data(md_exp, och);
189 /* Free @och if it is not waiting for DONE_WRITING. */
190 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
193 if (req) /* This is close request */
194 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of @inode that matches the open mode @flags.
 *
 * @flags selects the handle and use-count pair kept in ll_inode_info:
 * the write pair for FMODE_WRITE, the exec pair for FMODE_EXEC, otherwise
 * the read pair (FMODE_READ is asserted in that case).  The use count is
 * examined under lli_och_mutex; a handle that is still present afterwards
 * is handed to ll_close_inode_openhandle() on the superblock's ll_md_exp.
 *
 * NOTE(review): how och is taken from *och_p, the early-return value when
 * the handle is still in use, and the final return are not visible here -
 * confirm against the full source.
 */
198 int ll_md_real_close(struct inode *inode, int flags)
200 struct ll_inode_info *lli = ll_i2info(inode);
201 struct obd_client_handle **och_p;
202 struct obd_client_handle *och;
207 if (flags & FMODE_WRITE) {
208 och_p = &lli->lli_mds_write_och;
209 och_usecount = &lli->lli_open_fd_write_count;
210 } else if (flags & FMODE_EXEC) {
211 och_p = &lli->lli_mds_exec_och;
212 och_usecount = &lli->lli_open_fd_exec_count;
214 LASSERT(flags & FMODE_READ);
215 och_p = &lli->lli_mds_read_och;
216 och_usecount = &lli->lli_open_fd_read_count;
219 mutex_lock(&lli->lli_och_mutex);
220 if (*och_usecount) { /* There are still users of this handle, so
222 mutex_unlock(&lli->lli_och_mutex);
227 mutex_unlock(&lli->lli_och_mutex);
229 if (och) { /* There might be a race and somebody have freed this och
231 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Drop one file descriptor's share of @inode's MDS open state.
 *
 * Releases a group lock still held through this descriptor, then, for a
 * positive dentry, decrements the write, exec or read open count chosen by
 * fd_omode under lli_och_mutex.  ll_md_real_close() is called only when
 * md_lock_match() does not find a granted MDS_INODELOCK_OPEN lock for the
 * inode's FID.  A negative dentry is only reported with CERROR.  In all
 * visible paths the ll_file_data is detached from @file and freed, and
 * ll_capa_close() is called.
 *
 * NOTE(review): the declarations of rc and lockmode, the mode passed to
 * ll_md_real_close() and the return statement are not visible here.
 */
238 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
241 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
242 struct ll_inode_info *lli = ll_i2info(inode);
246 /* clear group lock, if present */
247 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
248 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
250 /* Let's see if we have good enough OPEN lock on the file and if
251 we can skip talking to MDS */
252 if (file->f_dentry->d_inode) { /* Can this ever be false? */
254 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
255 struct lustre_handle lockh;
/* NOTE(review): this local shadows the @inode parameter. */
256 struct inode *inode = file->f_dentry->d_inode;
257 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
259 mutex_lock(&lli->lli_och_mutex);
260 if (fd->fd_omode & FMODE_WRITE) {
262 LASSERT(lli->lli_open_fd_write_count);
263 lli->lli_open_fd_write_count--;
264 } else if (fd->fd_omode & FMODE_EXEC) {
266 LASSERT(lli->lli_open_fd_exec_count);
267 lli->lli_open_fd_exec_count--;
270 LASSERT(lli->lli_open_fd_read_count);
271 lli->lli_open_fd_read_count--;
273 mutex_unlock(&lli->lli_och_mutex);
275 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
276 LDLM_IBITS, &policy, lockmode,
278 rc = ll_md_real_close(file->f_dentry->d_inode,
282 CERROR("Releasing a file %p with negative dentry %p. Name %s",
283 file, file->f_dentry, file->f_dentry->d_name.name);
286 LUSTRE_FPRIVATE(file) = NULL;
287 ll_file_data_put(fd);
288 ll_capa_close(inode);
293 /* While this returns an error code, fput() the caller does not, so we need
294 * to make every effort to clean up all of our state here. Also, applications
295 * rarely check close errors and even if an error is returned they will not
296 * re-try the close call.
/*
 * Release the llite state attached to @file.
 *
 * Visible steps, in order:
 *  - with CONFIG_FS_POSIX_ACL, on a remote-client mount and for the root
 *    inode, a descriptor flagged LL_FILE_RMTACL has the flag cleared and
 *    its per-pid entries removed with rct_del() and et_search_free();
 *  - LPROC_LL_RELEASE is tallied unless @file is the filesystem root;
 *  - for a directory whose lli_opendir_key is this descriptor, statahead is
 *    stopped with ll_stop_statahead();
 *  - for the filesystem root the descriptor is simply detached and freed;
 *  - for non-directories the asynchronous error state is cleared
 *    (lov_read_and_clear_async_rc(), lli_async_rc = 0);
 *  - ll_md_close() performs the MDS-side close;
 *  - the debug log is dumped when the OBD_FAIL_PTLRPC_DUMP_LOG fail point
 *    fires.
 *
 * NOTE(review): the early return for the root dentry and the final return
 * of rc are not visible here.
 */
298 int ll_file_release(struct inode *inode, struct file *file)
300 struct ll_file_data *fd;
301 struct ll_sb_info *sbi = ll_i2sbi(inode);
302 struct ll_inode_info *lli = ll_i2info(inode);
306 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
307 inode->i_generation, inode);
309 #ifdef CONFIG_FS_POSIX_ACL
310 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
311 inode == inode->i_sb->s_root->d_inode) {
/* NOTE(review): this local shadows the function-level fd. */
312 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
315 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
316 fd->fd_flags &= ~LL_FILE_RMTACL;
317 rct_del(&sbi->ll_rct, current_pid());
318 et_search_free(&sbi->ll_et, current_pid());
323 if (inode->i_sb->s_root != file->f_dentry)
324 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
325 fd = LUSTRE_FPRIVATE(file);
328 /* The last ref on @file, maybe not the owner pid of statahead.
329 * Different processes can open the same dir, "ll_opendir_key" means:
330 * it is me that should stop the statahead thread. */
331 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
332 lli->lli_opendir_pid != 0)
333 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry only needs its descriptor dropped. */
335 if (inode->i_sb->s_root == file->f_dentry) {
336 LUSTRE_FPRIVATE(file) = NULL;
337 ll_file_data_put(fd);
341 if (!S_ISDIR(inode->i_mode)) {
342 lov_read_and_clear_async_rc(lli->lli_clob);
343 lli->lli_async_rc = 0;
346 rc = ll_md_close(sbi->ll_md_exp, inode, file);
348 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
349 libcfs_debug_dumplog();
/*
 * Send the open intent @itp for the already looked-up @file to the MDS.
 *
 * @lmm / @lmmsize carry optional striping data.  When both are empty the
 * intent asks for an open lock (MDS_OPEN_LOCK) and, for write opens, uses
 * LUSTRE_OPC_CREATE.  The request is always issued by FID
 * (MDS_OPEN_BY_FID) through md_intent_lock() with ll_md_blocking_ast as
 * the blocking callback.
 *
 * After the call the visible code releases an unwanted open handle with
 * ll_release_openhandle() on one error path, maps DISP_LOOKUP_NEG to
 * -ENOENT, folds an open error into rc, refreshes the inode with
 * ll_prep_inode() and attaches lock data with ll_set_lock_data() when a
 * lock mode was granted.  On exit the request is released, the
 * DISP_ENQ_COMPLETE disposition cleared and the intent lock dropped.
 *
 * NOTE(review): the exact conditions selecting these paths, the remaining
 * arguments of ll_prep_md_op_data() and ll_set_lock_data(), and the return
 * statement are not visible here.
 */
354 static int ll_intent_file_open(struct file *file, void *lmm,
355 int lmmsize, struct lookup_intent *itp)
357 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
358 struct dentry *parent = file->f_dentry->d_parent;
359 const char *name = file->f_dentry->d_name.name;
360 const int len = file->f_dentry->d_name.len;
361 struct md_op_data *op_data;
362 struct ptlrpc_request *req;
363 __u32 opc = LUSTRE_OPC_ANY;
370 /* Usually we come here only for NFSD, and we want open lock.
371 But we can also get here with pre 2.6.15 patchless kernels, and in
372 that case that lock is also ok */
373 /* We can also get here if there was cached open handle in revalidate_it
374 * but it disappeared while we were getting from there to ll_file_open.
375 * But this means this file was closed and immediately opened which
376 * makes a good candidate for using OPEN lock */
377 /* If lmmsize & lmm are not 0, we are just setting stripe info
378 * parameters. No need for the open lock */
379 if (lmm == NULL && lmmsize == 0) {
380 itp->it_flags |= MDS_OPEN_LOCK;
381 if (itp->it_flags & FMODE_WRITE)
382 opc = LUSTRE_OPC_CREATE;
385 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
386 file->f_dentry->d_inode, name, len,
389 RETURN(PTR_ERR(op_data));
391 itp->it_flags |= MDS_OPEN_BY_FID;
392 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
393 0 /*unused */, &req, ll_md_blocking_ast, 0);
394 ll_finish_md_op_data(op_data);
396 /* reason for keep own exit path - don`t flood log
397 * with messages with -ESTALE errors.
399 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
400 it_open_error(DISP_OPEN_OPEN, itp))
402 ll_release_openhandle(file->f_dentry, itp);
406 if (it_disposition(itp, DISP_LOOKUP_NEG))
407 GOTO(out, rc = -ENOENT);
409 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
410 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
411 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
415 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
416 if (!rc && itp->d.lustre.it_lock_mode)
417 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
421 ptlrpc_req_finished(itp->d.lustre.it_data);
422 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
423 ll_intent_drop_lock(itp);
429 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
430 * not believe attributes if a few ioepoch holders exist. Attributes for
431 * previous ioepoch if new one is opened are also skipped by MDS.
433 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
435 if (ioepoch && lli->lli_ioepoch != ioepoch) {
436 lli->lli_ioepoch = ioepoch;
437 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
438 ioepoch, PFID(&lli->lli_fid));
/*
 * Initialise client open handle @och from the MDS reply held by intent @it.
 *
 * Copies the file handle out of the reply's mdt_body, stamps the handle
 * with OBD_CLIENT_HANDLE_MAGIC, the inode FID from @lli and the intent's
 * open flags, records the reply's IO epoch with ll_ioepoch_open(), and
 * registers the open for replay on @md_exp.
 *
 * Returns the result of md_set_open_replay_data().
 */
442 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
443 struct lookup_intent *it, struct obd_client_handle *och)
445 struct ptlrpc_request *req = it->d.lustre.it_data;
446 struct mdt_body *body;
450 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
451 LASSERT(body != NULL); /* reply already checked out */
453 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
454 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
455 och->och_fid = lli->lli_fid;
456 och->och_flags = it->it_flags;
457 ll_ioepoch_open(lli, body->ioepoch);
459 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish an open on the client side: attach @fd to @file.
 *
 * @file must not have private data yet (asserted).  The visible code can
 * fill @och from the intent reply with ll_och_fill() and log the IO epoch
 * for write opens whose reply carries OBD_MD_FLSIZE.  It then stores @fd as
 * the file's private data, initialises its readahead state with
 * ll_readahead_init() and remembers the open flags in fd_omode.
 *
 * NOTE(review): the guard around the ll_och_fill() section (presumably a
 * non-NULL @och, since ll_file_open() also calls this with NULL), the
 * handling of its error code and the return statement are not visible
 * here - confirm against the full source.
 */
462 int ll_local_open(struct file *file, struct lookup_intent *it,
463 struct ll_file_data *fd, struct obd_client_handle *och)
465 struct inode *inode = file->f_dentry->d_inode;
466 struct ll_inode_info *lli = ll_i2info(inode);
469 LASSERT(!LUSTRE_FPRIVATE(file));
474 struct ptlrpc_request *req = it->d.lustre.it_data;
475 struct mdt_body *body;
478 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
482 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
483 if ((it->it_flags & FMODE_WRITE) &&
484 (body->valid & OBD_MD_FLSIZE))
485 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
486 lli->lli_ioepoch, PFID(&lli->lli_fid));
489 LUSTRE_FPRIVATE(file) = fd;
490 ll_readahead_init(inode, &fd->fd_ras);
491 fd->fd_omode = it->it_flags;
495 /* Open a file, and (for the very first open) create objects on the OSTs at
496 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
497 * creation or open until ll_lov_setstripe() ioctl is called.
499 * If we already have the stripe MD locally then we don't request it in
500 * md_open(), by passing a lmm_size = 0.
502 * It is up to the application to ensure no other processes open this file
503 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
504 * used. We might be able to avoid races of that sort by getting lli_open_sem
505 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
506 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * Open @file on @inode.
 *
 * Outline of the visible code:
 *  - the lookup intent left in file->private_data is taken over and the
 *    field cleared; a fresh ll_file_data is allocated (-ENOMEM on failure);
 *  - for a directory with no statahead owner yet, this descriptor and the
 *    current pid are recorded as lli_opendir_key / lli_opendir_pid under
 *    lli_sa_lock;
 *  - the filesystem root only gets the descriptor attached;
 *  - without a usable intent, the local intent oit is set up from f_flags:
 *    FMODE_WRITE is forced for O_TRUNC, MDS_OPEN_OWNEROVERRIDE is added for
 *    read/write opens, O_EXCL is removed and IT_CREAT is added for O_CREAT;
 *  - the write, exec or read handle slot and use count are selected from
 *    the intent flags and examined under lli_och_mutex.  With a handle
 *    already cached, a redundant MDS open is released and
 *    ll_local_open() is called with a NULL handle.  Otherwise the MDS open
 *    is requested through ll_intent_file_open() (with the mutex dropped
 *    around the call), a new obd_client_handle is allocated and passed to
 *    ll_local_open();
 *  - for regular files without stripe metadata (lli_has_smd clear), object
 *    creation may be delayed (O_LOV_DELAY_CREATE or a non-write open);
 *  - cleanup frees a handle that is still set in *och_p, stops statahead
 *    when this open registered as its owner, frees the descriptor, tallies
 *    LPROC_LL_OPEN, and drops the request reference held through
 *    DISP_ENQ_OPEN_REF.
 *
 * NOTE(review): labels, most error checks and the return statement are not
 * visible here, so which cleanup steps run on success versus failure must
 * be confirmed against the full source.
 */
508 int ll_file_open(struct inode *inode, struct file *file)
510 struct ll_inode_info *lli = ll_i2info(inode);
511 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
512 .it_flags = file->f_flags };
513 struct obd_client_handle **och_p = NULL;
514 __u64 *och_usecount = NULL;
515 struct ll_file_data *fd;
516 int rc = 0, opendir_set = 0;
519 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
520 inode->i_generation, inode, file->f_flags);
522 it = file->private_data; /* XXX: compat macro */
523 file->private_data = NULL; /* prevent ll_local_open assertion */
525 fd = ll_file_data_get();
527 GOTO(out_och_free, rc = -ENOMEM);
530 if (S_ISDIR(inode->i_mode)) {
531 spin_lock(&lli->lli_sa_lock);
532 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
533 lli->lli_opendir_pid == 0) {
534 lli->lli_opendir_key = fd;
535 lli->lli_opendir_pid = current_pid();
538 spin_unlock(&lli->lli_sa_lock);
541 if (inode->i_sb->s_root == file->f_dentry) {
542 LUSTRE_FPRIVATE(file) = fd;
546 if (!it || !it->d.lustre.it_disposition) {
547 /* Convert f_flags into access mode. We cannot use file->f_mode,
548 * because everything but O_ACCMODE mask was stripped from
550 if ((oit.it_flags + 1) & O_ACCMODE)
552 if (file->f_flags & O_TRUNC)
553 oit.it_flags |= FMODE_WRITE;
555 /* kernel only call f_op->open in dentry_open. filp_open calls
556 * dentry_open after call to open_namei that checks permissions.
557 * Only nfsd_open call dentry_open directly without checking
558 * permissions and because of that this code below is safe. */
559 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
560 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
562 /* We do not want O_EXCL here, presumably we opened the file
563 * already? XXX - NFS implications? */
564 oit.it_flags &= ~O_EXCL;
566 /* bug20584, if "it_flags" contains O_CREAT, the file will be
567 * created if necessary, then "IT_CREAT" should be set to keep
568 * consistent with it */
569 if (oit.it_flags & O_CREAT)
570 oit.it_op |= IT_CREAT;
576 /* Let's see if we have file open on MDS already. */
577 if (it->it_flags & FMODE_WRITE) {
578 och_p = &lli->lli_mds_write_och;
579 och_usecount = &lli->lli_open_fd_write_count;
580 } else if (it->it_flags & FMODE_EXEC) {
581 och_p = &lli->lli_mds_exec_och;
582 och_usecount = &lli->lli_open_fd_exec_count;
584 och_p = &lli->lli_mds_read_och;
585 och_usecount = &lli->lli_open_fd_read_count;
588 mutex_lock(&lli->lli_och_mutex);
589 if (*och_p) { /* Open handle is present */
590 if (it_disposition(it, DISP_OPEN_OPEN)) {
591 /* Well, there's extra open request that we do not need,
592 let's close it somehow. This will decref request. */
593 rc = it_open_error(DISP_OPEN_OPEN, it);
595 mutex_unlock(&lli->lli_och_mutex);
596 GOTO(out_openerr, rc);
599 ll_release_openhandle(file->f_dentry, it);
603 rc = ll_local_open(file, it, fd, NULL);
606 mutex_unlock(&lli->lli_och_mutex);
607 GOTO(out_openerr, rc);
610 LASSERT(*och_usecount == 0);
611 if (!it->d.lustre.it_disposition) {
612 /* We cannot just request lock handle now, new ELC code
613 means that one of other OPEN locks for this file
614 could be cancelled, and since blocking ast handler
615 would attempt to grab och_mutex as well, that would
616 result in a deadlock */
617 mutex_unlock(&lli->lli_och_mutex);
618 it->it_create_mode |= M_CHECK_STALE;
619 rc = ll_intent_file_open(file, NULL, 0, it);
620 it->it_create_mode &= ~M_CHECK_STALE;
622 GOTO(out_openerr, rc);
626 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
628 GOTO(out_och_free, rc = -ENOMEM);
632 /* md_intent_lock() didn't get a request ref if there was an
633 * open error, so don't do cleanup on the request here
635 /* XXX (green): Should not we bail out on any error here, not
636 * just open error? */
637 rc = it_open_error(DISP_OPEN_OPEN, it);
639 GOTO(out_och_free, rc);
641 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
643 rc = ll_local_open(file, it, fd, *och_p);
645 GOTO(out_och_free, rc);
647 mutex_unlock(&lli->lli_och_mutex);
650 /* Must do this outside lli_och_mutex lock to prevent deadlock where
651 different kind of OPEN lock for this same inode gets cancelled
652 by ldlm_cancel_lru */
653 if (!S_ISREG(inode->i_mode))
654 GOTO(out_och_free, rc);
658 if (!lli->lli_has_smd) {
659 if (file->f_flags & O_LOV_DELAY_CREATE ||
660 !(file->f_mode & FMODE_WRITE)) {
661 CDEBUG(D_INODE, "object creation was delayed\n");
662 GOTO(out_och_free, rc);
665 file->f_flags &= ~O_LOV_DELAY_CREATE;
666 GOTO(out_och_free, rc);
670 if (och_p && *och_p) {
671 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
672 *och_p = NULL; /* OBD_FREE writes some magic there */
675 mutex_unlock(&lli->lli_och_mutex);
678 if (opendir_set != 0)
679 ll_stop_statahead(inode, lli->lli_opendir_key);
681 ll_file_data_put(fd);
683 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
686 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
687 ptlrpc_req_finished(it->d.lustre.it_data);
688 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
694 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch object attributes for stripe metadata @lsm into @obdo.
 *
 * @lsm must be non-NULL (asserted).  The request asks for id, type, size,
 * blocks, block size, the three timestamps, group, epoch and data version,
 * tagged with @ioepoch and capability @capa.  The visible code can also set
 * OBD_MD_FLFLAGS / OBD_FL_SRVLOCK on the request.  The getattr is issued
 * asynchronously with obd_getattr_async() on @exp and waited for through a
 * freshly allocated ptlrpc request set.  Afterwards o_valid is restricted
 * to blocks, block size, timestamps, size and data version.
 *
 * NOTE(review): the assignments wiring @lsm and @obdo into oinfo, the
 * condition guarding the OBD_FL_SRVLOCK lines (presumably @sync), the
 * error checks and the return statement are not visible here.
 */
695 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
696 struct obd_capa *capa, struct obdo *obdo,
697 __u64 ioepoch, int sync)
699 struct ptlrpc_request_set *set;
700 struct obd_info oinfo = { { { 0 } } };
705 LASSERT(lsm != NULL);
709 oinfo.oi_oa->o_oi = lsm->lsm_oi;
710 oinfo.oi_oa->o_mode = S_IFREG;
711 oinfo.oi_oa->o_ioepoch = ioepoch;
712 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
713 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
714 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
715 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
716 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
717 OBD_MD_FLDATAVERSION;
718 oinfo.oi_capa = capa;
720 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
721 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
724 set = ptlrpc_prep_set();
726 CERROR("can't allocate ptlrpc set\n");
729 rc = obd_getattr_async(exp, &oinfo, set);
731 rc = ptlrpc_set_wait(set);
732 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers are meant to consume. */
735 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
736 OBD_MD_FLATIME | OBD_MD_FLMTIME |
737 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
738 OBD_MD_FLDATAVERSION);
743 * Performs the getattr on the inode and updates its fields.
744 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Refresh @inode from the attributes of its data objects.
 *
 * Takes an MDS capability and a reference on the inode's stripe metadata,
 * calls ll_lsm_getattr() on the data export with @ioepoch and @sync, then
 * applies the bits reported in obdo->o_valid to the inode with
 * obdo_refresh_inode() and logs the resulting size, block count and block
 * size.  The stripe metadata reference is dropped with
 * ccc_inode_lsm_put() before leaving.
 *
 * NOTE(review): the condition guarding the refresh, the release of the
 * capability and the return statement are not visible here.
 */
746 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
747 __u64 ioepoch, int sync)
749 struct obd_capa *capa = ll_mdscapa_get(inode);
750 struct lov_stripe_md *lsm;
754 lsm = ccc_inode_lsm_get(inode);
755 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
756 capa, obdo, ioepoch, sync);
759 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
761 obdo_refresh_inode(inode, obdo, obdo->o_valid);
762 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
763 " blksize %lu\n", POSTID(oi), i_size_read(inode),
764 (unsigned long long)inode->i_blocks,
765 (unsigned long)ll_inode_blksize(inode));
767 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps with the attributes of the cl_object and
 * store the result in @inode.
 *
 * Everything runs under ll_inode_size_lock().  The inode timestamps are
 * first reset to the values cached in lli_lvb and copied into a local lvb
 * with inode_init_lvb().  The object attributes are then read with
 * cl_object_attr_get() under the object's attribute lock.  For each of
 * atime, ctime and mtime the later of the two values wins.  The inode size
 * is written with cl_isize_write_nolock(), i_blocks is taken from the
 * object attributes and the merged timestamps are stored.
 *
 * NOTE(review): the check of cl_object_attr_get()'s result that guards the
 * merge, the declaration of lvb and the return statement are not visible
 * here.
 */
771 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
773 struct ll_inode_info *lli = ll_i2info(inode);
774 struct cl_object *obj = lli->lli_clob;
775 struct cl_attr *attr = ccc_env_thread_attr(env);
781 ll_inode_size_lock(inode);
782 /* merge timestamps the most recently obtained from mds with
783 timestamps obtained from osts */
784 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
785 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
786 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
787 inode_init_lvb(inode, &lvb);
789 cl_object_attr_lock(obj);
790 rc = cl_object_attr_get(env, obj, attr);
791 cl_object_attr_unlock(obj);
/* Keep whichever timestamp is later for each of atime, ctime, mtime. */
794 if (lvb.lvb_atime < attr->cat_atime)
795 lvb.lvb_atime = attr->cat_atime;
796 if (lvb.lvb_ctime < attr->cat_ctime)
797 lvb.lvb_ctime = attr->cat_ctime;
798 if (lvb.lvb_mtime < attr->cat_mtime)
799 lvb.lvb_mtime = attr->cat_mtime;
801 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
802 PFID(&lli->lli_fid), attr->cat_size);
803 cl_isize_write_nolock(inode, attr->cat_size);
805 inode->i_blocks = attr->cat_blocks;
807 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
808 LTIME_S(inode->i_atime) = lvb.lvb_atime;
809 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
811 ll_inode_size_unlock(inode);
/*
 * Fetch size, block count and timestamps for stripe metadata @lsm and copy
 * them into the stat buffer st.
 *
 * Uses ll_lsm_getattr() on the superblock's data export with no
 * capability, IO epoch 0 and sync 0, working on a zero-initialised local
 * obdo.
 *
 * NOTE(review): the remaining parameter (the stat buffer), the check of rc
 * before copying and the return statement are not visible here.
 */
816 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
819 struct obdo obdo = { 0 };
822 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
824 st->st_size = obdo.o_size;
825 st->st_blocks = obdo.o_blocks;
826 st->st_mtime = obdo.o_mtime;
827 st->st_atime = obdo.o_atime;
828 st->st_ctime = obdo.o_ctime;
/*
 * Initialise the file-dependent fields of @io for a read or write on @file.
 *
 * Sets the non-blocking flag from O_NONBLOCK, the append flag from
 * O_APPEND and the sync flag from O_SYNC / O_DIRECT (plus a further
 * condition not visible here), and points ci_obj at the inode's cl_object.
 * Lock policy: CILR_MAYBE by default, CILR_NEVER with ci_no_srvlock set
 * when ll_file_nolock() is true for the file, CILR_MANDATORY for O_APPEND
 * files.
 *
 * NOTE(review): the wr_append / wr_sync assignments are presumably guarded
 * by @write; the guard itself is not visible here.
 */
833 void ll_io_init(struct cl_io *io, const struct file *file, int write)
835 struct inode *inode = file->f_dentry->d_inode;
837 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
839 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
840 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
841 file->f_flags & O_DIRECT ||
844 io->ci_obj = ll_i2info(inode)->lli_clob;
845 io->ci_lockreq = CILR_MAYBE;
846 if (ll_file_nolock(file)) {
847 io->ci_lockreq = CILR_NEVER;
848 io->ci_no_srvlock = 1;
849 } else if (file->f_flags & O_APPEND) {
850 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common read/write path shared by the aio, sync and splice entry points.
 *
 * Sets up the per-thread cl_io with ll_io_init() and cl_io_rw_init() for
 * @count bytes at *@ppos.  When cl_io_rw_init() returns 0 the subtype-
 * specific arguments from @args are copied into the vvp/ccc io state and
 * cl_io_loop() runs the IO; otherwise the result is taken from
 * io->ci_result.
 *
 * Locking for the normal subtype: writes on a file that is not group
 * locked take lli_write_mutex interruptibly (-ERESTARTSYS when
 * interrupted); reads hold lli_trunc_sem for reading.  Both are released
 * after cl_io_loop().
 *
 * Afterwards *@ppos is updated when any bytes were transferred
 * (ci_nob > 0), the IO may be restarted when nothing was transferred and
 * ci_need_restart is set, byte counters are tallied for reads and writes,
 * and fd_write_failed is cleared after a write or set when the result is
 * an error other than -ERESTARTSYS.
 *
 * NOTE(review): the return type line, case labels, several conditions, the
 * restart jump and the return statement are not visible here - confirm the
 * exact flow against the full source.
 */
855 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
856 struct file *file, enum cl_io_type iot,
857 loff_t *ppos, size_t count)
859 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
860 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
866 io = ccc_env_thread_io(env);
867 ll_io_init(io, file, iot == CIT_WRITE);
869 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
870 struct vvp_io *vio = vvp_env_io(env);
871 struct ccc_io *cio = ccc_env_io(env);
872 int write_mutex_locked = 0;
874 cio->cui_fd = LUSTRE_FPRIVATE(file);
875 vio->cui_io_subtype = args->via_io_subtype;
877 switch (vio->cui_io_subtype) {
879 cio->cui_iov = args->u.normal.via_iov;
880 cio->cui_nrsegs = args->u.normal.via_nrsegs;
881 cio->cui_tot_nrsegs = cio->cui_nrsegs;
882 cio->cui_iocb = args->u.normal.via_iocb;
883 if ((iot == CIT_WRITE) &&
884 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
885 if (mutex_lock_interruptible(&lli->
887 GOTO(out, result = -ERESTARTSYS);
888 write_mutex_locked = 1;
889 } else if (iot == CIT_READ) {
890 down_read(&lli->lli_trunc_sem);
894 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
895 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
898 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
899 vio->u.splice.cui_flags = args->u.splice.via_flags;
902 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
905 result = cl_io_loop(env, io);
/* Undo whichever lock was taken for the normal subtype above. */
906 if (write_mutex_locked)
907 mutex_unlock(&lli->lli_write_mutex);
908 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
909 up_read(&lli->lli_trunc_sem);
911 /* cl_io_rw_init() handled IO */
912 result = io->ci_result;
915 if (io->ci_nob > 0) {
917 *ppos = io->u.ci_wr.wr.crw_pos;
922 /* If any bit been read/written (result != 0), we just return
923 * short read/write instead of restart io. */
924 if (result == 0 && io->ci_need_restart) {
925 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
926 iot == CIT_READ ? "read" : "write",
927 file->f_dentry->d_name.name, *ppos, count);
928 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
932 if (iot == CIT_READ) {
934 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
935 LPROC_LL_READ_BYTES, result);
936 } else if (iot == CIT_WRITE) {
938 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
939 LPROC_LL_WRITE_BYTES, result);
940 fd->fd_write_failed = false;
941 } else if (result != -ERESTARTSYS) {
942 fd->fd_write_failed = true;
951 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and accumulate its total length.
 *
 * Walks the first *@nr_segs entries of @iov.  A segment with a negative
 * length, or a running total that wraps negative, is rejected (-EINVAL per
 * the comment text in the body).  Each segment is checked with
 * access_ok(VERIFY_READ); a segment that fails the check has its length
 * subtracted from the running total again.
 *
 * NOTE(review): the accumulation statement, how *@nr_segs and *@count are
 * updated and the return statements are not visible here - confirm against
 * the full source.
 */
953 static int ll_file_get_iov_count(const struct iovec *iov,
954 unsigned long *nr_segs, size_t *count)
959 for (seg = 0; seg < *nr_segs; seg++) {
960 const struct iovec *iv = &iov[seg];
963 * If any segment has a negative length, or the cumulative
964 * length ever wraps negative then return -EINVAL.
967 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
969 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
974 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Vectored read entry point.
 *
 * Validates @iov with ll_file_get_iov_count(), takes a cl environment,
 * packs the iovec, segment count and @iocb into IO_NORMAL arguments and
 * runs ll_file_io_generic() with CIT_READ on iocb->ki_filp at
 * iocb->ki_pos.  The environment is released before returning.
 *
 * Returns PTR_ERR(env) when no environment can be obtained.
 * NOTE(review): @pos is not referenced in the visible lines (the position
 * comes from iocb->ki_pos); the check of the iov validation result and the
 * final return are not visible here.
 */
981 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
982 unsigned long nr_segs, loff_t pos)
985 struct vvp_io_args *args;
991 result = ll_file_get_iov_count(iov, &nr_segs, &count);
995 env = cl_env_get(&refcheck);
997 RETURN(PTR_ERR(env));
999 args = vvp_env_args(env, IO_NORMAL);
1000 args->u.normal.via_iov = (struct iovec *)iov;
1001 args->u.normal.via_nrsegs = nr_segs;
1002 args->u.normal.via_iocb = iocb;
1004 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1005 &iocb->ki_pos, count);
1006 cl_env_put(env, &refcheck);
/*
 * Synchronous read wrapper around ll_file_aio_read().
 *
 * Uses the iovec and kiocb kept in the cl environment's vvp info: the
 * iovec describes the single user buffer @buf of @count bytes and the
 * kiocb is initialised as a sync kiocb positioned at *ppos.  After the
 * read, *ppos is updated from the kiocb position and the environment is
 * released.
 *
 * Returns PTR_ERR(env) when no environment can be obtained.
 * NOTE(review): the ppos parameter line and the final return are not
 * visible here.
 */
1010 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1014 struct iovec *local_iov;
1015 struct kiocb *kiocb;
1020 env = cl_env_get(&refcheck);
1022 RETURN(PTR_ERR(env));
1024 local_iov = &vvp_env_info(env)->vti_local_iov;
1025 kiocb = &vvp_env_info(env)->vti_kiocb;
1026 local_iov->iov_base = (void __user *)buf;
1027 local_iov->iov_len = count;
1028 init_sync_kiocb(kiocb, file);
1029 kiocb->ki_pos = *ppos;
1030 kiocb->ki_left = count;
1032 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1033 *ppos = kiocb->ki_pos;
1035 cl_env_put(env, &refcheck);
1040 * Write to a file (through the page cache).
/*
 * Vectored write entry point.
 *
 * Validates @iov with ll_file_get_iov_count(), takes a cl environment,
 * packs the iovec, segment count and @iocb into IO_NORMAL arguments and
 * runs ll_file_io_generic() with CIT_WRITE on iocb->ki_filp at
 * iocb->ki_pos.  The environment is released before returning.
 *
 * Returns PTR_ERR(env) when no environment can be obtained.
 * NOTE(review): @pos is not referenced in the visible lines (the position
 * comes from iocb->ki_pos); the check of the iov validation result and the
 * final return are not visible here.
 */
1042 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1043 unsigned long nr_segs, loff_t pos)
1046 struct vvp_io_args *args;
1052 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1056 env = cl_env_get(&refcheck);
1058 RETURN(PTR_ERR(env));
1060 args = vvp_env_args(env, IO_NORMAL);
1061 args->u.normal.via_iov = (struct iovec *)iov;
1062 args->u.normal.via_nrsegs = nr_segs;
1063 args->u.normal.via_iocb = iocb;
1065 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1066 &iocb->ki_pos, count);
1067 cl_env_put(env, &refcheck);
/*
 * Synchronous write wrapper around ll_file_aio_write().
 *
 * Uses the iovec and kiocb kept in the cl environment's vvp info: the
 * iovec describes the single user buffer @buf of @count bytes and the
 * kiocb is initialised as a sync kiocb positioned at *ppos.  After the
 * write, *ppos is updated from the kiocb position and the environment is
 * released.
 *
 * Returns PTR_ERR(env) when no environment can be obtained.
 * NOTE(review): the ppos parameter line and the final return are not
 * visible here.
 */
1071 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1075 struct iovec *local_iov;
1076 struct kiocb *kiocb;
1081 env = cl_env_get(&refcheck);
1083 RETURN(PTR_ERR(env));
1085 local_iov = &vvp_env_info(env)->vti_local_iov;
1086 kiocb = &vvp_env_info(env)->vti_kiocb;
1087 local_iov->iov_base = (void __user *)buf;
1088 local_iov->iov_len = count;
1089 init_sync_kiocb(kiocb, file);
1090 kiocb->ki_pos = *ppos;
1091 kiocb->ki_left = count;
1093 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1094 *ppos = kiocb->ki_pos;
1096 cl_env_put(env, &refcheck);
1103 * Send file content (through pagecache) somewhere with helper
/*
 * Splice-read entry point: read from @in_file into @pipe.
 *
 * Takes a cl environment, stores @pipe and the splice flags in IO_SPLICE
 * arguments and runs ll_file_io_generic() with CIT_READ for @count bytes
 * at *@ppos.  The environment is released before returning.
 *
 * Returns PTR_ERR(env) when no environment can be obtained.
 * NOTE(review): the flags parameter line and the final return are not
 * visible here.
 */
1105 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1106 struct pipe_inode_info *pipe, size_t count,
1110 struct vvp_io_args *args;
1115 env = cl_env_get(&refcheck);
1117 RETURN(PTR_ERR(env));
1119 args = vvp_env_args(env, IO_SPLICE);
1120 args->u.splice.via_pipe = pipe;
1121 args->u.splice.via_flags = flags;
1123 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1124 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate an object of @inode on a given OST.
 *
 * Takes a reference on the inode's stripe metadata (-ENOENT when there is
 * none) and makes a private copy of it, sized for lsm_stripe_count
 * lov_oinfo entries (-ENOMEM when the copy cannot be allocated).  The obdo
 * is flagged OBD_FL_RECREATE_OBJS, carries the OST index in o_nlink, and
 * is filled from the inode (type and timestamps) and its parent FID.
 * obd_create() is called under ll_inode_size_lock().  The copy and the
 * stripe metadata reference are released afterwards.
 *
 * NOTE(review): the ost_idx parameter line, the allocation and release of
 * oa, the use of @oi and the return statement are not visible here.
 */
1128 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1131 struct obd_export *exp = ll_i2dtexp(inode);
1132 struct obd_trans_info oti = { 0 };
1133 struct obdo *oa = NULL;
1136 struct lov_stripe_md *lsm = NULL, *lsm2;
1143 lsm = ccc_inode_lsm_get(inode);
1145 GOTO(out, rc = -ENOENT);
1147 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1148 (lsm->lsm_stripe_count));
1150 OBD_ALLOC_LARGE(lsm2, lsm_size);
1152 GOTO(out, rc = -ENOMEM);
/* o_nlink is used here to carry the target OST index. */
1155 oa->o_nlink = ost_idx;
1156 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1157 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1158 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1159 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1160 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1161 memcpy(lsm2, lsm, lsm_size);
1162 ll_inode_size_lock(inode);
1163 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1164 ll_inode_size_unlock(inode);
1166 OBD_FREE_LARGE(lsm2, lsm_size);
1169 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl helper: recreate an object identified by a user-supplied
 * struct ll_recreate_obj.
 *
 * Checks CFS_CAP_SYS_ADMIN, copies the request from user address @arg,
 * builds an ost_id with the MDT0 sequence and the requested object id, and
 * calls ll_lov_recreate() with the requested OST index.
 *
 * NOTE(review): the error codes returned when the capability check or the
 * copy from user space fails are not visible here.
 */
1174 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1176 struct ll_recreate_obj ucreat;
1180 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1183 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1187 ostid_set_seq_mdt0(&oi);
1188 ostid_set_id(&oi, ucreat.lrc_id);
1189 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * ioctl helper: recreate an object identified by a user-supplied lu_fid.
 *
 * Checks CFS_CAP_SYS_ADMIN, copies the FID from user address @arg,
 * converts it to an ost_id with fid_to_ostid(), takes the OST index from
 * bits 16..31 of the FID sequence, and calls ll_lov_recreate().
 *
 * NOTE(review): the error codes returned when the capability check or the
 * copy from user space fails are not visible here.
 */
1192 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1199 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1202 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1205 fid_to_ostid(&fid, &oi);
1206 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1207 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set striping for @inode by sending an open intent that carries @lum.
 *
 * The visible code looks up the inode's stripe metadata first and logs
 * "stripe already exists" on one path.  It then sends the intent with
 * ll_intent_file_open() (open flags @flags, layout @lum of @lum_size
 * bytes) under ll_inode_size_lock(), takes the status from the intent and
 * releases the open handle with ll_release_openhandle().  Cleanup drops
 * the size lock, releases the intent and the stripe metadata reference,
 * and finishes the intent's request.
 *
 * NOTE(review): the condition and return code of the "already exists"
 * path, the error checks, labels and return statement are not visible
 * here.
 */
1210 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1211 int flags, struct lov_user_md *lum, int lum_size)
1213 struct lov_stripe_md *lsm = NULL;
1214 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1218 lsm = ccc_inode_lsm_get(inode);
1220 ccc_inode_lsm_put(inode, lsm);
1221 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1226 ll_inode_size_lock(inode);
1227 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1230 rc = oit.d.lustre.it_status;
1232 GOTO(out_req_free, rc);
1234 ll_release_openhandle(file->f_dentry, &oit);
1237 ll_inode_size_unlock(inode);
1238 ll_intent_release(&oit);
1239 ccc_inode_lsm_put(inode, lsm);
1242 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the striping EA of @filename, looked up under directory @inode,
 * from the MDS.
 *
 * Sizes the request with ll_get_max_mdsize(), issues md_getattr_name()
 * asking for OBD_MD_FLEASIZE | OBD_MD_FLDIREA, and locates the EA in the
 * reply.  -ENODATA is returned when the reply carries no EA, -EPROTO when
 * the EA magic is neither LOV_MAGIC_V1 nor LOV_MAGIC_V3.  On hosts where
 * LOV_MAGIC differs from its little-endian form the EA is byte-swapped to
 * host order; the per-object entries are swapped only for regular files.
 * The EA size is stored in *@lmm_size.
 *
 * NOTE(review): the assignments to *@lmmp and *@request, the remaining
 * half of the -ENODATA condition, the error checks and the return
 * statement are not visible here.  Presumably the caller owns the returned
 * request and must release it - confirm against the full source.
 */
1246 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1247 struct lov_mds_md **lmmp, int *lmm_size,
1248 struct ptlrpc_request **request)
1250 struct ll_sb_info *sbi = ll_i2sbi(inode);
1251 struct mdt_body *body;
1252 struct lov_mds_md *lmm = NULL;
1253 struct ptlrpc_request *req = NULL;
1254 struct md_op_data *op_data;
1257 rc = ll_get_max_mdsize(sbi, &lmmsize);
1261 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1262 strlen(filename), lmmsize,
1263 LUSTRE_OPC_ANY, NULL);
1264 if (IS_ERR(op_data))
1265 RETURN(PTR_ERR(op_data));
1267 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1268 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1269 ll_finish_md_op_data(op_data);
1271 CDEBUG(D_INFO, "md_getattr_name failed "
1272 "on %s: rc %d\n", filename, rc);
1276 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1277 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1279 lmmsize = body->eadatasize;
1281 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1283 GOTO(out, rc = -ENODATA);
1286 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1287 LASSERT(lmm != NULL);
1289 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1290 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1291 GOTO(out, rc = -EPROTO);
1295 * This is coming from the MDS, so is probably in
1296 * little endian. We convert it to host endian before
1297 * passing it to userspace.
1299 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1300 /* if function called for directory - we should
1301 * avoid swab not existent lsm objects */
1302 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1303 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1304 if (S_ISREG(body->mode))
1305 lustre_swab_lov_user_md_objects(
1306 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1307 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1308 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1309 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1310 if (S_ISREG(body->mode))
1311 lustre_swab_lov_user_md_objects(
1312 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1313 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1319 *lmm_size = lmmsize;
/**
 * Set a file's layout from a user-supplied descriptor that already names
 * its object.
 *
 * The caller is checked for CFS_CAP_SYS_ADMIN. A buffer large enough for a
 * struct lov_user_md followed by one struct lov_user_ost_data is allocated,
 * filled from user space and passed to ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS | FMODE_WRITE. The buffer is freed both when the copy
 * from user space fails and after the call.
 *
 * \param inode  inode whose layout is being set
 * \param file   open file passed through to ll_lov_setstripe_ea_info()
 */
1324 static int ll_lov_setea(struct inode *inode, struct file *file,
1327 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1328 struct lov_user_md *lump;
1329 int lum_size = sizeof(struct lov_user_md) +
1330 sizeof(struct lov_user_ost_data);
1334 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1337 OBD_ALLOC_LARGE(lump, lum_size);
1341 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1342 OBD_FREE_LARGE(lump, lum_size);
1346 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1348 OBD_FREE_LARGE(lump, lum_size);
/**
 * Handle a user request to set striping on a file.
 *
 * The user buffer is read as a v1 descriptor first, since v1 is smaller
 * than v3; if its magic turns out to be LOV_USER_MAGIC_V3 the full v3
 * structure is read instead. The descriptor is then applied with
 * ll_lov_setstripe_ea_info() using FMODE_WRITE.
 *
 * Afterwards the user's lmm_stripe_count is zeroed, the layout is
 * refreshed, and the layout is reported back to the same user buffer
 * through obd_iocontrol(LL_IOC_LOV_GETSTRIPE).
 * NOTE(review): the condition guarding this follow-up is not shown here -
 * presumably it runs only when the set succeeded; confirm. The result of
 * put_user() is not checked on the line shown.
 *
 * \param inode  inode whose layout is being set
 * \param file   open file passed through to ll_lov_setstripe_ea_info()
 */
1352 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1355 struct lov_user_md_v3 lumv3;
1356 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1357 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1358 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1360 int flags = FMODE_WRITE;
1363 /* first try with v1 which is smaller than v3 */
1364 lum_size = sizeof(struct lov_user_md_v1);
1365 if (copy_from_user(lumv1, lumv1p, lum_size))
1368 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1369 lum_size = sizeof(struct lov_user_md_v3);
1370 if (copy_from_user(&lumv3, lumv3p, lum_size))
1374 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1376 struct lov_stripe_md *lsm;
1379 put_user(0, &lumv1p->lmm_stripe_count);
1381 ll_layout_refresh(inode, &gen);
1382 lsm = ccc_inode_lsm_get(inode);
1383 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1384 0, lsm, (void *)arg);
1385 ccc_inode_lsm_put(inode, lsm);
/**
 * Report a file's striping to user space.
 *
 * Takes a reference on the inode's lsm, forwards LL_IOC_LOV_GETSTRIPE to
 * the data export with obd_iocontrol(), and drops the reference again.
 *
 * \param inode  inode whose layout is queried
 * \param arg    user-space buffer address (NOTE(review): presumably passed
 *               as the last obd_iocontrol() argument - confirm)
 */
1390 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1392 struct lov_stripe_md *lsm;
1396 lsm = ccc_inode_lsm_get(inode);
1398 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1400 ccc_inode_lsm_put(inode, lsm);
/**
 * Take a group lock with group id \a arg on behalf of an open file.
 *
 * Files opened without locking (ll_file_nolock()) get -EOPNOTSUPP. If the
 * file descriptor already has LL_FILE_GROUP_LOCKED set, a warning is
 * printed and the request is refused (NOTE(review): error code not shown
 * here - confirm). Otherwise the lock is requested with
 * cl_get_grouplock(), non-blocking when the file has O_NONBLOCK set, and
 * on success it is recorded in fd->fd_grouplock together with the
 * LL_FILE_GROUP_LOCKED flag.
 *
 * \param inode  inode to lock
 * \param file   open file that will own the group lock
 * \param arg    group id
 */
1404 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1406 struct ll_inode_info *lli = ll_i2info(inode);
1407 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1408 struct ccc_grouplock grouplock;
1412 if (ll_file_nolock(file))
1413 RETURN(-EOPNOTSUPP);
1415 spin_lock(&lli->lli_lock);
1416 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1417 CWARN("group lock already existed with gid %lu\n",
1418 fd->fd_grouplock.cg_gid);
1419 spin_unlock(&lli->lli_lock);
1422 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1423 spin_unlock(&lli->lli_lock);
1425 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1426 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* lli_lock was released around cl_get_grouplock(), so the flag is
 * tested again; a lock obtained after losing the race is given back. */
1430 spin_lock(&lli->lli_lock);
1431 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1432 spin_unlock(&lli->lli_lock);
1433 CERROR("another thread just won the race\n");
1434 cl_put_grouplock(&grouplock);
1438 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1439 fd->fd_grouplock = grouplock;
1440 spin_unlock(&lli->lli_lock);
1442 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/**
 * Release the group lock held through an open file.
 *
 * Under lli_lock, the request is refused with a warning when the file
 * descriptor holds no group lock or when \a arg differs from the recorded
 * group id (NOTE(review): the error codes are not shown here - confirm).
 * Otherwise the lock record is copied out, fd->fd_grouplock is zeroed and
 * LL_FILE_GROUP_LOCKED is cleared before the spinlock is dropped; the lock
 * itself is released afterwards with cl_put_grouplock().
 *
 * \param inode  inode the lock was taken on
 * \param file   open file that owns the group lock
 * \param arg    group id, which must match the one recorded at lock time
 */
1446 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1448 struct ll_inode_info *lli = ll_i2info(inode);
1449 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1450 struct ccc_grouplock grouplock;
1453 spin_lock(&lli->lli_lock);
1454 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1455 spin_unlock(&lli->lli_lock);
1456 CWARN("no group lock held\n");
1459 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1461 if (fd->fd_grouplock.cg_gid != arg) {
1462 CWARN("group lock %lu doesn't match current id %lu\n",
1463 arg, fd->fd_grouplock.cg_gid);
1464 spin_unlock(&lli->lli_lock);
1468 grouplock = fd->fd_grouplock;
1469 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1470 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1471 spin_unlock(&lli->lli_lock);
1473 cl_put_grouplock(&grouplock);
1474 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1479 * Close inode open handle
1481 * \param dentry [in] dentry which contains the inode
1482 * \param it [in,out] intent which contains open info and result
1485 * \retval <0 failure
/**
 * Close the open handle that an open intent left on \a dentry's inode.
 *
 * Nothing is closed for the filesystem root dentry or when the intent does
 * not carry DISP_OPEN_OPEN. Otherwise a temporary obd_client_handle is
 * allocated (-ENOMEM on failure), filled from the intent with
 * ll_och_fill() and closed with ll_close_inode_openhandle(). Finally, if
 * the intent holds DISP_ENQ_OPEN_REF, the request reference that
 * ll_file_open would otherwise have consumed is dropped and the
 * disposition bit cleared.
 *
 * \param dentry  dentry whose inode was opened by the intent
 * \param it      open intent; its DISP_ENQ_OPEN_REF disposition may be
 *                cleared
 */
1487 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1489 struct inode *inode = dentry->d_inode;
1490 struct obd_client_handle *och;
1496 /* Root ? Do nothing. */
1497 if (dentry->d_inode->i_sb->s_root == dentry)
1500 /* No open handle to close? Move away */
1501 if (!it_disposition(it, DISP_OPEN_OPEN))
1504 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1506 OBD_ALLOC(och, sizeof(*och));
1508 GOTO(out, rc = -ENOMEM);
1510 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1511 ll_i2info(inode), it, och);
1513 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1516 /* this one is in place of ll_file_open */
1517 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1518 ptlrpc_req_finished(it->d.lustre.it_data);
1519 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1525 * Get size for inode for which FIEMAP mapping is requested.
1526 * Make the FIEMAP get_info call and returns the result.
/**
 * Run a FIEMAP request for \a inode through obd_get_info() on the data
 * export.
 *
 * Steps visible in the code:
 *  - flags outside LUSTRE_FIEMAP_FLAGS_COMPAT are rejected, after
 *    fm_flags has been reduced to just the offending bits
 *    (NOTE(review): the error code is not shown here - confirm);
 *  - FIEMAP_FLAG_SYNC triggers filemap_fdatawrite() on the mapping;
 *  - a file with more than one stripe requires FIEMAP_FLAG_DEVICE_ORDER,
 *    otherwise -EOPNOTSUPP;
 *  - the key's obdo gets the object id from the lsm, the size from the
 *    inode and the parent FID; a size of 0 yields zero mapped extents;
 *  - the request is copied into the key and sent with obd_get_info();
 *    a failure is logged with CERROR.
 * The lsm reference is dropped in the exit path.
 *
 * \param inode   inode being mapped
 * \param fiemap  request; fm_flags and fm_mapped_extents may be modified
 *
 * The remaining parameter, num_bytes, initialises the value length handed
 * to obd_get_info() (NOTE(review): presumably the size of the \a fiemap
 * buffer in bytes - confirm with the caller).
 */
1528 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1531 struct obd_export *exp = ll_i2dtexp(inode);
1532 struct lov_stripe_md *lsm = NULL;
1533 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1534 int vallen = num_bytes;
1538 /* Checks for fiemap flags */
1539 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1540 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1544 /* Check for FIEMAP_FLAG_SYNC */
1545 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1546 rc = filemap_fdatawrite(inode->i_mapping);
1551 lsm = ccc_inode_lsm_get(inode);
1555 /* If the stripe_count > 1 and the application does not understand
1556 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1558 if (lsm->lsm_stripe_count > 1 &&
1559 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1560 GOTO(out, rc = -EOPNOTSUPP);
1562 fm_key.oa.o_oi = lsm->lsm_oi;
1563 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1565 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1566 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1567 /* If filesize is 0, then there would be no objects for mapping */
1568 if (fm_key.oa.o_size == 0) {
1569 fiemap->fm_mapped_extents = 0;
1573 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1575 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1578 CERROR("obd_get_info failed: rc = %d\n", rc);
1581 ccc_inode_lsm_put(inode, lsm);
/**
 * Resolve a FID to a path on behalf of user space (OBD_IOC_FID2PATH).
 *
 * Allowed for callers with CFS_CAP_DAC_READ_SEARCH, or for anyone when the
 * superblock has LL_SBI_USER_FID2PATH set. The fixed-size request header
 * is copied in first to learn gf_pathlen; an output buffer of
 * sizeof(header) + gf_pathlen bytes is then allocated, seeded with the
 * header, passed to obd_iocontrol() on the metadata export and copied back
 * to \a arg in full.
 *
 * NOTE(review): gf_pathlen comes straight from user space and no upper
 * bound is applied on the lines shown before it sizes the allocation and
 * the copy back - confirm whether a limit is enforced.
 *
 * \param inode  any inode of the filesystem; selects the export and the
 *               superblock flags
 * \param arg    user-space address of a struct getinfo_fid2path followed
 *               by gf_pathlen bytes of path buffer
 */
1585 int ll_fid2path(struct inode *inode, void *arg)
1587 struct obd_export *exp = ll_i2mdexp(inode);
1588 struct getinfo_fid2path *gfout, *gfin;
1592 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1593 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1596 /* Need to get the buflen */
1597 OBD_ALLOC_PTR(gfin);
1600 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1605 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1606 OBD_ALLOC(gfout, outsize);
1607 if (gfout == NULL) {
1611 memcpy(gfout, gfin, sizeof(*gfout));
1614 /* Call mdc_iocontrol */
1615 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1619 if (copy_to_user(arg, gfout, outsize))
1623 OBD_FREE(gfout, outsize);
/**
 * ioctl front end for FIEMAP: marshal the request between user space and
 * ll_do_fiemap().
 *
 * fm_extent_count is read from the user structure to size a kernel buffer
 * holding the header plus that many extents. The header is copied in,
 * together with the first extent when the caller supplied one, because
 * that extent carries the continuation state from a previous call.
 * After ll_do_fiemap() the header is copied back and, when extent_count is
 * non-zero, fm_mapped_extents extents with it. The buffer is freed on exit.
 *
 * NOTE(review): extent_count is user-controlled and no bound is applied on
 * the lines shown before it is multiplied into num_bytes - confirm that
 * the multiplication cannot wrap.
 *
 * \param inode  inode being mapped
 * \param arg    user-space address of a struct ll_user_fiemap
 */
1627 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1629 struct ll_user_fiemap *fiemap_s;
1630 size_t num_bytes, ret_bytes;
1631 unsigned int extent_count;
1634 /* Get the extent count so we can calculate the size of
1635 * required fiemap buffer */
1636 if (get_user(extent_count,
1637 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1639 num_bytes = sizeof(*fiemap_s) + (extent_count *
1640 sizeof(struct ll_fiemap_extent));
1642 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1643 if (fiemap_s == NULL)
1646 /* get the fiemap value */
1647 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1649 GOTO(error, rc = -EFAULT);
1651 /* If fm_extent_count is non-zero, read the first extent since
1652 * it is used to calculate end_offset and device from previous
1655 if (copy_from_user(&fiemap_s->fm_extents[0],
1656 (char __user *)arg + sizeof(*fiemap_s),
1657 sizeof(struct ll_fiemap_extent)))
1658 GOTO(error, rc = -EFAULT);
1661 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1665 ret_bytes = sizeof(struct ll_user_fiemap);
1667 if (extent_count != 0)
1668 ret_bytes += (fiemap_s->fm_mapped_extents *
1669 sizeof(struct ll_fiemap_extent));
1671 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1675 OBD_FREE_LARGE(fiemap_s, num_bytes);
1680 * Read the data_version for inode.
1682 * This value is computed using stripe object version on OST.
1683 * Version is computed using server side locking.
1685 * @param extent_lock Take extent lock. Not needed if a process is already
1686 * holding the OST object group locks.
/**
 * Read the data version of \a inode into \a data_version.
 *
 * The inode's lsm is looked up first; the no-object case is logged and
 * treated as version 0. Otherwise an obdo is allocated and filled by
 * ll_lsm_getattr() on the data export, and o_data_version is returned if
 * the reply has OBD_MD_FLDATAVERSION set (NOTE(review): the error code
 * used when the bit is missing is not shown here - confirm). The lsm
 * reference is dropped on both the allocation-failure path and the normal
 * exit path.
 *
 * \param inode         inode to query
 * \param data_version  receives the version
 *
 * The third parameter, extent_lock, is handed to ll_lsm_getattr() as its
 * final argument.
 */
1688 int ll_data_version(struct inode *inode, __u64 *data_version,
1691 struct lov_stripe_md *lsm = NULL;
1692 struct ll_sb_info *sbi = ll_i2sbi(inode);
1693 struct obdo *obdo = NULL;
1697 /* If no stripe, we consider version is 0. */
1698 lsm = ccc_inode_lsm_get(inode);
1701 CDEBUG(D_INODE, "No object for inode\n");
1705 OBD_ALLOC_PTR(obdo);
1707 ccc_inode_lsm_put(inode, lsm);
1711 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1713 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1716 *data_version = obdo->o_data_version;
1720 ccc_inode_lsm_put(inode, lsm);
/**
 * Working state for ll_swap_layouts(), allocated on the heap by that
 * function.
 *
 * ia1/ia2 hold the saved mtime/atime of the two inodes, inode1/inode2 are
 * the two files in the order they are processed, and check_dv1/check_dv2
 * record whether the corresponding data version must be verified before
 * the swap. Booleans are used rather than flag bits so they can be
 * exchanged with swap() together with the inodes.
 */
1725 struct ll_swap_stack {
1726 struct iattr ia1, ia2;
1728 struct inode *inode1, *inode2;
1729 bool check_dv1, check_dv2;
/**
 * Swap the layouts of two open files (LL_IOC_LOV_SWAP_LAYOUTS).
 *
 * Outline of what the code shows:
 *  - the second file must be a regular file (-EINVAL), both inodes must be
 *    writable for the caller (-EPERM) and live on the same superblock
 *    (-EXDEV);
 *  - the requested data-version checks and expected versions are copied
 *    into a heap-allocated ll_swap_stack;
 *  - the two inodes are ordered by FID comparison, swapping the inodes,
 *    versions and check flags together; comparing equal means both
 *    arguments are the same file and nothing more is done;
 *  - when a group id is given, a group lock is taken on both inodes, the
 *    first being released again if the second cannot be obtained;
 *  - mtime/atime of both inodes are saved if either KEEP flag is set;
 *  - requested data versions are compared with ll_data_version() and a
 *    mismatch gives -EAGAIN;
 *  - the swap itself is sent to the MDT through
 *    obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS) with a struct mdc_swap_layouts
 *    attached to the md_op_data;
 *  - group locks are dropped, and the saved times are written back
 *    crosswise (file1 gets file2's times and vice versa) with ll_setattr()
 *    under each inode's i_mutex, limited to the attributes the caller
 *    asked to keep.
 *
 * \param file1  first open file
 * \param file2  second open file
 * \param lsl    user request: flags and expected data versions
 */
1732 static int ll_swap_layouts(struct file *file1, struct file *file2,
1733 struct lustre_swap_layouts *lsl)
1735 struct mdc_swap_layouts msl;
1736 struct md_op_data *op_data;
1739 struct ll_swap_stack *llss = NULL;
1742 OBD_ALLOC_PTR(llss);
1746 llss->inode1 = file1->f_dentry->d_inode;
1747 llss->inode2 = file2->f_dentry->d_inode;
1749 if (!S_ISREG(llss->inode2->i_mode))
1750 GOTO(free, rc = -EINVAL);
1752 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1753 ll_permission(llss->inode2, MAY_WRITE, NULL))
1754 GOTO(free, rc = -EPERM);
1756 if (llss->inode2->i_sb != llss->inode1->i_sb)
1757 GOTO(free, rc = -EXDEV);
1759 /* we use 2 bool because it is easier to swap than 2 bits */
1760 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1761 llss->check_dv1 = true;
1763 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1764 llss->check_dv2 = true;
1766 /* we cannot use lsl->sl_dvX directly because we may swap them */
1767 llss->dv1 = lsl->sl_dv1;
1768 llss->dv2 = lsl->sl_dv2;
1770 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1771 if (rc == 0) /* same file, done! */
1774 if (rc < 0) { /* sequentialize it */
1775 swap(llss->inode1, llss->inode2);
1777 swap(llss->dv1, llss->dv2);
1778 swap(llss->check_dv1, llss->check_dv2);
1782 if (gid != 0) { /* application asks to flush dirty cache */
1783 rc = ll_get_grouplock(llss->inode1, file1, gid);
1787 rc = ll_get_grouplock(llss->inode2, file2, gid);
1789 ll_put_grouplock(llss->inode1, file1, gid);
1794 /* to be able to restore mtime and atime after swap
1795 * we need to first save them */
1797 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1798 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1799 llss->ia1.ia_atime = llss->inode1->i_atime;
1800 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1801 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1802 llss->ia2.ia_atime = llss->inode2->i_atime;
1803 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1806 /* ultimate check, before swaping the layouts we check if
1807 * dataversion has changed (if requested) */
1808 if (llss->check_dv1) {
1809 rc = ll_data_version(llss->inode1, &dv, 0);
1812 if (dv != llss->dv1)
1813 GOTO(putgl, rc = -EAGAIN);
1816 if (llss->check_dv2) {
1817 rc = ll_data_version(llss->inode2, &dv, 0);
1820 if (dv != llss->dv2)
1821 GOTO(putgl, rc = -EAGAIN);
1824 /* struct md_op_data is used to send the swap args to the mdt
1825 * only flags is missing, so we use struct mdc_swap_layouts
1826 * through the md_op_data->op_data */
1827 /* flags from user space have to be converted before they are send to
1828 * server, no flag is sent today, they are only used on the client */
1831 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1832 0, LUSTRE_OPC_ANY, &msl);
/* NOTE(review): other ll_prep_md_op_data() callers in this file test the
 * result with IS_ERR(); confirm that comparing against NULL is correct
 * here, since an error pointer would pass this test. */
1833 if (op_data != NULL) {
1834 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1835 ll_i2mdexp(llss->inode1),
1836 sizeof(*op_data), op_data, NULL);
1837 ll_finish_md_op_data(op_data);
1842 ll_put_grouplock(llss->inode2, file2, gid);
1843 ll_put_grouplock(llss->inode1, file1, gid);
1846 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1850 /* clear useless flags */
1851 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1852 llss->ia1.ia_valid &= ~ATTR_MTIME;
1853 llss->ia2.ia_valid &= ~ATTR_MTIME;
1856 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1857 llss->ia1.ia_valid &= ~ATTR_ATIME;
1858 llss->ia2.ia_valid &= ~ATTR_ATIME;
1861 /* update time if requested */
1863 if (llss->ia2.ia_valid != 0) {
1864 mutex_lock(&llss->inode1->i_mutex);
1865 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1866 mutex_unlock(&llss->inode1->i_mutex);
1869 if (llss->ia1.ia_valid != 0) {
1872 mutex_lock(&llss->inode2->i_mutex);
1873 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1874 mutex_unlock(&llss->inode2->i_mutex);
/**
 * ioctl entry point for regular Lustre files.
 *
 * The call is traced and counted under LPROC_LL_IOCTL, tty ioctls (type
 * 'T' or 't') are filtered out first, and the remaining commands are
 * dispatched:
 *  - LL_IOC_GETFLAGS / SETFLAGS / CLRFLAGS: read or modify fd->fd_flags;
 *    LL_FILE_IGNORE_LOCK may only be set on an O_DIRECT file;
 *  - LL_IOC_LOV_SETSTRIPE, LL_IOC_LOV_SETEA, LL_IOC_LOV_GETSTRIPE,
 *    LL_IOC_RECREATE_OBJ, LL_IOC_RECREATE_FID: layout helpers in this file;
 *  - LL_IOC_LOV_SWAP_LAYOUTS: looks up the second file with fget() and
 *    calls ll_swap_layouts() when that file is open for writing; the
 *    calling file's access mode is tested as well;
 *  - FSFILT_IOC_FIEMAP, FSFILT_IOC_GETFLAGS / SETFLAGS,
 *    FSFILT_IOC_GETVERSION(_OLD): extent map, inode flags, i_generation;
 *  - LL_IOC_GROUP_LOCK / LL_IOC_GROUP_UNLOCK: group locks;
 *  - IOC_OBD_STATFS, LL_IOC_FLUSHCTX, LL_IOC_PATH2FID, OBD_IOC_FID2PATH,
 *    LL_IOC_DATA_VERSION, LL_IOC_GET_MDTIDX, OBD_IOC_GETDTNAME / MDNAME;
 *  - LL_IOC_HSM_STATE_GET / STATE_SET / ACTION: forwarded to the metadata
 *    export via obd_iocontrol() with the user structure attached to an
 *    md_op_data; setting or clearing HSM flags outside HSM_USER_MASK
 *    requires CFS_CAP_SYS_ADMIN;
 *  - anything else is offered to ll_iocontrol_call() and finally
 *    forwarded to the data export with obd_iocontrol().
 *
 * NOTE(review): the three HSM cases test the result of
 * ll_prep_md_op_data() against NULL, whereas other callers in this file
 * use IS_ERR(); confirm which is correct.
 *
 * \param file  open file the ioctl was issued on
 * \param cmd   ioctl command number
 * \param arg   command argument, usually a user-space address
 */
1886 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1888 struct inode *inode = file->f_dentry->d_inode;
1889 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1893 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1894 inode->i_generation, inode, cmd);
1895 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1897 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1898 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1902 case LL_IOC_GETFLAGS:
1903 /* Get the current value of the file flags */
1904 return put_user(fd->fd_flags, (int *)arg);
1905 case LL_IOC_SETFLAGS:
1906 case LL_IOC_CLRFLAGS:
1907 /* Set or clear specific file flags */
1908 /* XXX This probably needs checks to ensure the flags are
1909 * not abused, and to handle any flag side effects.
1911 if (get_user(flags, (int *) arg))
1914 if (cmd == LL_IOC_SETFLAGS) {
1915 if ((flags & LL_FILE_IGNORE_LOCK) &&
1916 !(file->f_flags & O_DIRECT)) {
1917 CERROR("%s: unable to disable locking on "
1918 "non-O_DIRECT file\n", current->comm);
1922 fd->fd_flags |= flags;
1924 fd->fd_flags &= ~flags;
1927 case LL_IOC_LOV_SETSTRIPE:
1928 RETURN(ll_lov_setstripe(inode, file, arg));
1929 case LL_IOC_LOV_SETEA:
1930 RETURN(ll_lov_setea(inode, file, arg));
1931 case LL_IOC_LOV_SWAP_LAYOUTS: {
1933 struct lustre_swap_layouts lsl;
1935 if (copy_from_user(&lsl, (char *)arg,
1936 sizeof(struct lustre_swap_layouts)))
1939 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
1942 file2 = fget(lsl.sl_fd);
1947 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
1948 rc = ll_swap_layouts(file, file2, &lsl);
1952 case LL_IOC_LOV_GETSTRIPE:
1953 RETURN(ll_lov_getstripe(inode, arg));
1954 case LL_IOC_RECREATE_OBJ:
1955 RETURN(ll_lov_recreate_obj(inode, arg));
1956 case LL_IOC_RECREATE_FID:
1957 RETURN(ll_lov_recreate_fid(inode, arg));
1958 case FSFILT_IOC_FIEMAP:
1959 RETURN(ll_ioctl_fiemap(inode, arg));
1960 case FSFILT_IOC_GETFLAGS:
1961 case FSFILT_IOC_SETFLAGS:
1962 RETURN(ll_iocontrol(inode, file, cmd, arg));
1963 case FSFILT_IOC_GETVERSION_OLD:
1964 case FSFILT_IOC_GETVERSION:
1965 RETURN(put_user(inode->i_generation, (int *)arg));
1966 case LL_IOC_GROUP_LOCK:
1967 RETURN(ll_get_grouplock(inode, file, arg));
1968 case LL_IOC_GROUP_UNLOCK:
1969 RETURN(ll_put_grouplock(inode, file, arg));
1970 case IOC_OBD_STATFS:
1971 RETURN(ll_obd_statfs(inode, (void *)arg));
1973 /* We need to special case any other ioctls we want to handle,
1974 * to send them to the MDS/OST as appropriate and to properly
1975 * network encode the arg field.
1976 case FSFILT_IOC_SETVERSION_OLD:
1977 case FSFILT_IOC_SETVERSION:
1979 case LL_IOC_FLUSHCTX:
1980 RETURN(ll_flush_ctx(inode));
1981 case LL_IOC_PATH2FID: {
1982 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1983 sizeof(struct lu_fid)))
1988 case OBD_IOC_FID2PATH:
1989 RETURN(ll_fid2path(inode, (void *)arg));
1990 case LL_IOC_DATA_VERSION: {
1991 struct ioc_data_version idv;
1994 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
1997 rc = ll_data_version(inode, &idv.idv_version,
1998 !(idv.idv_flags & LL_DV_NOFLUSH));
2000 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2006 case LL_IOC_GET_MDTIDX: {
2009 mdtidx = ll_get_mdt_idx(inode);
2013 if (put_user((int)mdtidx, (int*)arg))
2018 case OBD_IOC_GETDTNAME:
2019 case OBD_IOC_GETMDNAME:
2020 RETURN(ll_get_obd_name(inode, cmd, arg));
2021 case LL_IOC_HSM_STATE_GET: {
2022 struct md_op_data *op_data;
2023 struct hsm_user_state *hus;
2030 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2031 LUSTRE_OPC_ANY, hus);
2032 if (op_data == NULL) {
2037 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2040 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2043 ll_finish_md_op_data(op_data);
2047 case LL_IOC_HSM_STATE_SET: {
2048 struct md_op_data *op_data;
2049 struct hsm_state_set *hss;
2055 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2060 /* Non-root users are forbidden to set or clear flags which are
2061 * NOT defined in HSM_USER_MASK. */
2062 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2063 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2068 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2069 LUSTRE_OPC_ANY, hss);
2070 if (op_data == NULL) {
2075 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2078 ll_finish_md_op_data(op_data);
2083 case LL_IOC_HSM_ACTION: {
2084 struct md_op_data *op_data;
2085 struct hsm_current_action *hca;
2092 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2093 LUSTRE_OPC_ANY, hca);
2094 if (op_data == NULL) {
2099 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2102 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2105 ll_finish_md_op_data(op_data);
2113 ll_iocontrol_call(inode, file, cmd, arg, &err))
2116 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/**
 * llseek implementation for Lustre files.
 *
 * A target offset is computed up front purely for the trace message. For
 * SEEK_END, SEEK_HOLE and SEEK_DATA the file size is refreshed with
 * ll_glimpse_size() and then read with i_size_read() to serve as the
 * end-of-file position; for the other origins eof stays 0. The actual
 * repositioning is done by ll_generic_file_llseek_size(), bounded by
 * ll_file_maxbytes().
 *
 * \param file    open file
 * \param offset  offset relative to \a origin
 * \param origin  one of the SEEK_* constants
 */
2123 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2125 struct inode *inode = file->f_dentry->d_inode;
2126 loff_t retval, eof = 0;
2129 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2130 (origin == SEEK_CUR) ? file->f_pos : 0);
2131 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2132 inode->i_ino, inode->i_generation, inode, retval, retval,
2134 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2136 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2137 retval = ll_glimpse_size(inode);
2140 eof = i_size_read(inode);
2143 retval = ll_generic_file_llseek_size(file, offset, origin,
2144 ll_file_maxbytes(inode), eof);
/**
 * flush file operation: report asynchronous write-back errors.
 *
 * Must not be called on a directory (asserted). The error recorded in
 * lli_async_rc is read and reset, and lov_read_and_clear_async_rc() is
 * consulted as well. If the file descriptor is already marked
 * fd_write_failed, the application has been told about the failure and it
 * is not reported again. Any remaining error is returned as -EIO.
 *
 * \param file  open file being flushed
 * \param id    owner passed in by the VFS (not referenced on the lines
 *              shown)
 *
 * \retval 0     no unreported error
 * \retval -EIO  an asynchronous write error was pending
 */
2148 int ll_flush(struct file *file, fl_owner_t id)
2150 struct inode *inode = file->f_dentry->d_inode;
2151 struct ll_inode_info *lli = ll_i2info(inode);
2152 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2155 LASSERT(!S_ISDIR(inode->i_mode));
2157 /* catch async errors that were recorded back when async writeback
2158 * failed for pages in this mapping. */
2159 rc = lli->lli_async_rc;
2160 lli->lli_async_rc = 0;
2161 err = lov_read_and_clear_async_rc(lli->lli_clob);
2165 /* The application has been told write failure already.
2166 * Do not report failure again. */
2167 if (fd->fd_write_failed)
2169 return rc ? -EIO : 0;
2173 * Called to make sure a portion of file has been written out.
2174 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2176 * Return how many pages have been written.
/**
 * Run a CIT_FSYNC cl_io over a byte range of \a inode.
 *
 * \a mode must be one of CL_FSYNC_NONE, CL_FSYNC_LOCAL, CL_FSYNC_DISCARD
 * or CL_FSYNC_ALL. A nested cl environment is obtained, an OSS write
 * capability is looked up, and the per-thread cl_io is set up with the
 * inode's cl_object, the range, the FID, the mode and a zeroed written
 * counter. If cl_io_init() succeeds the io is run with cl_io_loop(),
 * otherwise io->ci_result is used. The result reported on success is
 * fi_nr_written (NOTE(review): the guarding condition is not shown here -
 * presumably result == 0; confirm).
 *
 * \param inode          inode to sync
 * \param start          first byte of the range
 * \param end            last byte of the range
 * \param mode           sync mode, see above
 * \param ignore_layout  copied to io->ci_ignore_layout
 */
2178 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2179 enum cl_fsync_mode mode, int ignore_layout)
2181 struct cl_env_nest nest;
2184 struct obd_capa *capa = NULL;
2185 struct cl_fsync_io *fio;
2189 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2190 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2193 env = cl_env_nested_get(&nest);
2195 RETURN(PTR_ERR(env));
2197 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2199 io = ccc_env_thread_io(env);
2200 io->ci_obj = cl_i2info(inode)->lli_clob;
2201 io->ci_ignore_layout = ignore_layout;
2203 /* initialize parameters for sync */
2204 fio = &io->u.ci_fsync;
2205 fio->fi_capa = capa;
2206 fio->fi_start = start;
2208 fio->fi_fid = ll_inode2fid(inode);
2209 fio->fi_mode = mode;
2210 fio->fi_nr_written = 0;
2212 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2213 result = cl_io_loop(env, io);
2215 result = io->ci_result;
2217 result = fio->fi_nr_written;
2218 cl_io_fini(env, io);
2219 cl_env_nested_put(&nest, env);
2227 * When dentry is provided (the 'else' case), *file->f_dentry may be
2228 * null and dentry must be used directly rather than pulled from
2229 * *file->f_dentry as is done otherwise.
/**
 * fsync file operation.
 *
 * Dirty pages in [start, end] are written and waited for with
 * filemap_write_and_wait_range(); the rest runs under the inode's i_mutex:
 *  - for non-directories, errors recorded by asynchronous write-back
 *    (lli_async_rc and lov_read_and_clear_async_rc()) are collected and
 *    cleared;
 *  - the metadata is synced with md_sync() and the reply request dropped;
 *  - for a data sync of a regular file the whole object range
 *    (0 .. OBD_OBJECT_EOF) is pushed with cl_sync_file_range(), and
 *    fd_write_failed on the file descriptor is set or cleared according
 *    to the outcome.
 *
 * \param file      open file
 * \param start     first byte of the range to write back locally
 * \param end       last byte of that range
 * \param datasync  non-zero to also sync file data to the OSTs
 */
2232 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2234 struct dentry *dentry = file->f_dentry;
2235 struct inode *inode = dentry->d_inode;
2236 struct ll_inode_info *lli = ll_i2info(inode);
2237 struct ptlrpc_request *req;
2238 struct obd_capa *oc;
2242 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2243 inode->i_generation, inode);
2244 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2246 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2247 mutex_lock(&inode->i_mutex);
2249 /* catch async errors that were recorded back when async writeback
2250 * failed for pages in this mapping. */
2251 if (!S_ISDIR(inode->i_mode)) {
2252 err = lli->lli_async_rc;
2253 lli->lli_async_rc = 0;
2256 err = lov_read_and_clear_async_rc(lli->lli_clob);
2261 oc = ll_mdscapa_get(inode);
2262 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2268 ptlrpc_req_finished(req);
2270 if (datasync && S_ISREG(inode->i_mode)) {
2271 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2273 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2275 if (rc == 0 && err < 0)
2278 fd->fd_write_failed = true;
2280 fd->fd_write_failed = false;
2283 mutex_unlock(&inode->i_mutex);
/**
 * flock()/fcntl() lock handler that takes the lock cluster-wide through an
 * LDLM_FLOCK enqueue on the metadata export.
 *
 * The VFS request is translated into an LDLM flock policy:
 *  - FL_FLOCK locks cover the whole file and are owned by the struct file;
 *  - FL_POSIX locks carry the requested byte range and are owned by
 *    fl_owner;
 *  - when lockd's lm_compare_owner hook is installed, the pid is used as
 *    owner instead.
 * fl_type selects the LDLM mode (LCK_PR, LCK_PW, or LCK_NL for an unlock)
 * and the command selects the enqueue flags (LDLM_FL_BLOCK_NOWAIT,
 * LDLM_FL_TEST_LOCK, or none).
 *
 * After md_enqueue(), the lock is mirrored in the local VFS lock tables
 * with flock_lock_file_wait() or posix_lock_file_wait(); POSIX test
 * requests are not mirrored. If the local step fails for a lock request,
 * the server-side lock is dropped again by enqueueing LCK_NL.
 *
 * \param file       open file
 * \param cmd        fcntl lock command
 * \param file_lock  VFS lock description; fl_type is overwritten with the
 *                   LDLM mode for test requests
 */
2287 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2289 struct inode *inode = file->f_dentry->d_inode;
2290 struct ll_sb_info *sbi = ll_i2sbi(inode);
2291 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2292 .ei_cb_cp =ldlm_flock_completion_ast,
2293 .ei_cbdata = file_lock };
2294 struct md_op_data *op_data;
2295 struct lustre_handle lockh = {0};
2296 ldlm_policy_data_t flock = {{0}};
2302 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2303 inode->i_ino, file_lock);
2305 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2307 if (file_lock->fl_flags & FL_FLOCK) {
2308 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2309 /* flocks are whole-file locks */
2310 flock.l_flock.end = OFFSET_MAX;
2311 /* For flocks owner is determined by the local file desctiptor*/
2312 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2313 } else if (file_lock->fl_flags & FL_POSIX) {
2314 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2315 flock.l_flock.start = file_lock->fl_start;
2316 flock.l_flock.end = file_lock->fl_end;
2320 flock.l_flock.pid = file_lock->fl_pid;
2322 /* Somewhat ugly workaround for svc lockd.
2323 * lockd installs custom fl_lmops->lm_compare_owner that checks
2324 * for the fl_owner to be the same (which it always is on local node
2325 * I guess between lockd processes) and then compares pid.
2326 * As such we assign pid to the owner field to make it all work,
2327 * conflict with normal locks is unlikely since pid space and
2328 * pointer space for current->files are not intersecting */
2329 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2330 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2332 switch (file_lock->fl_type) {
2334 einfo.ei_mode = LCK_PR;
2337 /* An unlock request may or may not have any relation to
2338 * existing locks so we may not be able to pass a lock handle
2339 * via a normal ldlm_lock_cancel() request. The request may even
2340 * unlock a byte range in the middle of an existing lock. In
2341 * order to process an unlock request we need all of the same
2342 * information that is given with a normal read or write record
2343 * lock request. To avoid creating another ldlm unlock (cancel)
2344 * message we'll treat a LCK_NL flock request as an unlock. */
2345 einfo.ei_mode = LCK_NL;
2348 einfo.ei_mode = LCK_PW;
2351 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2352 file_lock->fl_type);
2367 flags = LDLM_FL_BLOCK_NOWAIT;
2373 flags = LDLM_FL_TEST_LOCK;
2374 /* Save the old mode so that if the mode in the lock changes we
2375 * can decrement the appropriate reader or writer refcount. */
2376 file_lock->fl_type = einfo.ei_mode;
2379 CERROR("unknown fcntl lock command: %d\n", cmd);
2383 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2384 LUSTRE_OPC_ANY, NULL);
2385 if (IS_ERR(op_data))
2386 RETURN(PTR_ERR(op_data));
2388 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2389 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2390 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2392 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2393 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2395 if ((file_lock->fl_flags & FL_FLOCK) &&
2396 (rc == 0 || file_lock->fl_type == F_UNLCK))
2397 rc2 = flock_lock_file_wait(file, file_lock);
2398 if ((file_lock->fl_flags & FL_POSIX) &&
2399 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2400 !(flags & LDLM_FL_TEST_LOCK))
2401 rc2 = posix_lock_file_wait(file, file_lock);
2403 if (rc2 && file_lock->fl_type != F_UNLCK) {
2404 einfo.ei_mode = LCK_NL;
2405 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2406 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2410 ll_finish_md_op_data(op_data);
/**
 * Lock handler with the same signature as ll_file_flock().
 *
 * NOTE(review): presumably installed when flock support is disabled and
 * simply rejects the request - confirm against the function body and the
 * file_operations tables that reference it.
 */
2415 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2423 * test if some locks matching bits and l_req_mode are acquired
2424 * - bits can be in different locks
2425 * - if found clear the common lock bits in *bits
2426 * - the bits not found, are kept in *bits
2428 * \param bits [IN] searched lock bits [IN]
2429 * \param l_req_mode [IN] searched lock mode
2430 * \retval boolean, true iff all bits are found
/**
 * Test whether metadata inodebits locks covering \a *bits are held on
 * \a inode.
 *
 * Each set bit is matched individually with md_lock_match() using
 * LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK, so the
 * bits may be spread over several locks. When a match is found, every bit
 * covered by the matching lock is cleared from \a *bits (or just the
 * probed bit if the handle cannot be resolved to a lock). The loop stops
 * once \a *bits is empty or MDS_INODELOCK_MAXSHIFT bits have been probed.
 *
 * \param inode       inode whose FID names the resource
 * \param bits        in: bits wanted; out: bits for which no lock was found
 * \param l_req_mode  lock mode to match; LCK_MINMODE matches any of
 *                    CR, CW, PR and PW
 */
2432 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2434 struct lustre_handle lockh;
2435 ldlm_policy_data_t policy;
2436 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2437 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2446 fid = &ll_i2info(inode)->lli_fid;
2447 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2448 ldlm_lockname[mode]);
2450 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2451 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* NOTE(review): 1 << i is an int shift while *bits is __u64; fine as long
 * as MDS_INODELOCK_MAXSHIFT stays below 31 - confirm. */
2452 policy.l_inodebits.bits = *bits & (1 << i);
2453 if (policy.l_inodebits.bits == 0)
2456 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2457 &policy, mode, &lockh)) {
2458 struct ldlm_lock *lock;
2460 lock = ldlm_handle2lock(&lockh);
2463 ~(lock->l_policy_data.l_inodebits.bits);
2464 LDLM_LOCK_PUT(lock);
2466 *bits &= ~policy.l_inodebits.bits;
/**
 * Match an existing metadata inodebits lock on \a inode.
 *
 * Calls md_lock_match() on the inode's FID for a lock covering \a bits in
 * any of the modes CR, CW, PR or PW, with LDLM_FL_BLOCK_GRANTED added to
 * the caller's \a flags.
 *
 * \param inode  inode whose FID names the resource
 * \param bits   inodebits the lock must cover
 * \param lockh  lock handle passed to md_lock_match()
 * \param flags  extra LDLM match flags
 *
 * \retval       result of md_lock_match() (NOTE(review): presumably the
 *               matched mode, or 0 when nothing matched - confirm)
 */
2473 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2474 struct lustre_handle *lockh, __u64 flags)
2476 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2481 fid = &ll_i2info(inode)->lli_fid;
2482 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2484 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2485 fid, LDLM_IBITS, &policy,
2486 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/**
 * Post-process the result of a revalidation RPC.
 *
 * -ENOENT means the object has already been unlinked; the existing
 * comment says that case updates nlink and reports success, with separate
 * handling for inodes that are neither regular files nor directories
 * (NOTE(review): the statements for both are not shown here - confirm).
 * Any other non-zero \a rc is logged with the filesystem name and the
 * inode's FID.
 *
 * \param inode  inode that was revalidated
 * \param rc     result of the revalidation request
 */
2490 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2492 /* Already unlinked. Just update nlink and return success */
2493 if (rc == -ENOENT) {
2495 /* This path cannot be hit for regular files unless in
2496 * case of obscure races, so no need to to validate
2498 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2500 } else if (rc != 0) {
2501 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2502 ll_get_fsname(inode->i_sb, NULL, 0),
2503 PFID(ll_inode2fid(inode)), rc);
/**
 * Refresh the attributes of \a dentry's inode from the metadata server
 * when the client does not already hold a suitable lock.
 *
 * Two strategies are visible:
 *  - if the export advertises OBD_CONNECT_ATTRFID, an intent lock
 *    (IT_GETATTR, or IT_LOOKUP when only MDS_INODELOCK_LOOKUP is wanted)
 *    is requested by FID with M_CHECK_STALE set for the duration of the
 *    call; the reply goes through ll_inode_revalidate_fini() and
 *    ll_revalidate_it_finish(), a dentry whose inode has no links left is
 *    invalidated, and ll_lookup_finish_locks() completes the intent;
 *  - otherwise, if ll_have_md_lock() does not find the requested bits, a
 *    plain md_getattr() is sent (asking for the EA size as well for
 *    regular files) and the reply is applied with ll_prep_inode().
 * The request reference is dropped at the end.
 *
 * \param dentry  dentry to revalidate; d_inode must not be NULL (asserted)
 * \param it      caller's intent (not referenced on the lines shown)
 *
 * The third parameter, ibits, is the set of inodebits that must be
 * covered.
 */
2509 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2512 struct inode *inode = dentry->d_inode;
2513 struct ptlrpc_request *req = NULL;
2514 struct obd_export *exp;
2518 LASSERT(inode != NULL);
2520 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2521 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2523 exp = ll_i2mdexp(inode);
2525 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2526 * But under CMD case, it caused some lock issues, should be fixed
2527 * with new CMD ibits lock. See bug 12718 */
2528 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2529 struct lookup_intent oit = { .it_op = IT_GETATTR };
2530 struct md_op_data *op_data;
2532 if (ibits == MDS_INODELOCK_LOOKUP)
2533 oit.it_op = IT_LOOKUP;
2535 /* Call getattr by fid, so do not provide name at all. */
2536 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2537 dentry->d_inode, NULL, 0, 0,
2538 LUSTRE_OPC_ANY, NULL);
2539 if (IS_ERR(op_data))
2540 RETURN(PTR_ERR(op_data));
2542 oit.it_create_mode |= M_CHECK_STALE;
2543 rc = md_intent_lock(exp, op_data, NULL, 0,
2544 /* we are not interested in name
2547 ll_md_blocking_ast, 0);
2548 ll_finish_md_op_data(op_data);
2549 oit.it_create_mode &= ~M_CHECK_STALE;
2551 rc = ll_inode_revalidate_fini(inode, rc);
2555 rc = ll_revalidate_it_finish(req, &oit, dentry);
2557 ll_intent_release(&oit);
2561 /* Unlinked? Unhash dentry, so it is not picked up later by
2562 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2563 here to preserve get_cwd functionality on 2.6.
2565 if (!dentry->d_inode->i_nlink)
2566 d_lustre_invalidate(dentry, 0);
2568 ll_lookup_finish_locks(&oit, dentry);
2569 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2570 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2571 obd_valid valid = OBD_MD_FLGETATTR;
2572 struct md_op_data *op_data;
2575 if (S_ISREG(inode->i_mode)) {
2576 rc = ll_get_max_mdsize(sbi, &ealen);
2579 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2582 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2583 0, ealen, LUSTRE_OPC_ANY,
2585 if (IS_ERR(op_data))
2586 RETURN(PTR_ERR(op_data));
2588 op_data->op_valid = valid;
2589 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2590 * capa for this inode. Because we only keep capas of dirs
2592 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2593 ll_finish_md_op_data(op_data);
2595 rc = ll_inode_revalidate_fini(inode, rc);
2599 rc = ll_prep_inode(&inode, req, NULL, NULL);
2602 ptlrpc_req_finished(req);
/**
 * Revalidate an inode's metadata and then bring its size/time attributes
 * up to date.
 *
 * After __ll_inode_revalidate_it(), objects that are not regular files
 * get their atime, mtime and ctime copied from the cached lvb in the
 * ll_inode_info; regular files have their size refreshed with
 * ll_glimpse_size().
 *
 * \param dentry  dentry to revalidate
 * \param it      intent passed through to __ll_inode_revalidate_it()
 *
 * The third parameter, ibits, is passed through unchanged as well.
 */
2606 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2609 struct inode *inode = dentry->d_inode;
2613 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2617 /* if object isn't regular file, don't validate size */
2618 if (!S_ISREG(inode->i_mode)) {
2619 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2620 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2621 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2623 rc = ll_glimpse_size(inode);
/*
 * Fill @stat for the inode behind @de, using the caller-supplied intent @it.
 *
 * The inode is first revalidated with the UPDATE and LOOKUP inodebits and
 * the LPROC_LL_GETATTR counter is bumped.  The kstat fields are then copied
 * straight from the in-core inode.  @mnt is not referenced by the statements
 * shown here.
 */
2628 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2629 struct lookup_intent *it, struct kstat *stat)
2631 struct inode *inode = de->d_inode;
2632 struct ll_sb_info *sbi = ll_i2sbi(inode);
2633 struct ll_inode_info *lli = ll_i2info(inode);
2636 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2637 MDS_INODELOCK_LOOKUP);
2638 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2643 stat->dev = inode->i_sb->s_dev;
/* When the 32-bit API is required, the inode number is rebuilt from the FID
 * with cl_fid_build_ino(); otherwise i_ino is reported (presumably the else
 * branch -- TODO confirm). */
2644 if (ll_need_32bit_api(sbi))
2645 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2647 stat->ino = inode->i_ino;
2648 stat->mode = inode->i_mode;
2649 stat->nlink = inode->i_nlink;
2650 stat->uid = inode->i_uid;
2651 stat->gid = inode->i_gid;
2652 stat->rdev = inode->i_rdev;
2653 stat->atime = inode->i_atime;
2654 stat->mtime = inode->i_mtime;
2655 stat->ctime = inode->i_ctime;
2656 stat->blksize = 1 << inode->i_blkbits;
2658 stat->size = i_size_read(inode);
2659 stat->blocks = inode->i_blocks;
/*
 * getattr entry point: builds a local IT_GETATTR intent and forwards to
 * ll_getattr_it(), returning its result.
 */
2663 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2665 struct lookup_intent it = { .it_op = IT_GETATTR };
2667 return ll_getattr_it(mnt, de, &it, stat);
/*
 * Take a reference on the POSIX ACL cached in the inode's ll_inode_info.
 *
 * The cached pointer (lli_posix_acl) is duplicated with posix_acl_dup()
 * while holding lli_lock.  @type is not used by the statements shown here.
 * Presumably the duplicated ACL is what gets returned -- TODO confirm.
 */
2671 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2673 struct ll_inode_info *lli = ll_i2info(inode);
2674 struct posix_acl *acl = NULL;
2677 spin_lock(&lli->lli_lock);
2678 /* VFS' acl_permission_check->check_acl will release the refcount */
2679 acl = posix_acl_dup(lli->lli_posix_acl);
2680 spin_unlock(&lli->lli_lock);
/*
 * Permission check for @inode against the access @mask.
 *
 * Steps visible here:
 *  - when MAY_NOT_BLOCK is defined, the MAY_NOT_BLOCK bit of @mask is tested
 *    first (NOTE(review): the action taken is not shown; presumably the
 *    non-blocking walk is refused -- confirm);
 *  - the filesystem root inode is revalidated with an IT_LOOKUP intent and
 *    the LOOKUP inodebit before any check;
 *  - on a remote client (LL_SBI_RMT_CLIENT) the decision is made by
 *    lustre_check_remote_perm();
 *  - otherwise the LPROC_LL_INODE_PERM counter is bumped and
 *    ll_generic_permission() is called with ll_check_acl as the ACL callback.
 */
2686 int ll_inode_permission(struct inode *inode, int mask)
2691 #ifdef MAY_NOT_BLOCK
2692 if (mask & MAY_NOT_BLOCK)
2696 /* as root inode are NOT getting validated in lookup operation,
2697 * need to do it before permission check. */
2699 if (inode == inode->i_sb->s_root->d_inode) {
2700 struct lookup_intent it = { .it_op = IT_LOOKUP };
2702 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2703 MDS_INODELOCK_LOOKUP);
2708 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2709 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2711 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2712 return lustre_check_remote_perm(inode, mask);
2714 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2715 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/*
 * Indirection for the asynchronous read/write entries of the
 * file_operations tables: *_METHOD names the struct member and *_FUNCTION
 * names the llite handler, so the tables can be written as
 * ".READ_METHOD = READ_FUNCTION".
 */
2720 #define READ_METHOD aio_read
2721 #define READ_FUNCTION ll_file_aio_read
2722 #define WRITE_METHOD aio_write
2723 #define WRITE_FUNCTION ll_file_aio_write
/*
 * File operations table without Lustre-specific .flock/.lock handlers
 * (none are installed by the initializers shown here); per the comment
 * below it is the table associated with the "localflock" mount option.
 */
2725 /* -o localflock - only provides locally consistent flock locks */
2726 struct file_operations ll_file_operations = {
2727 .read = ll_file_read,
2728 .READ_METHOD = READ_FUNCTION,
2729 .write = ll_file_write,
2730 .WRITE_METHOD = WRITE_FUNCTION,
2731 .unlocked_ioctl = ll_file_ioctl,
2732 .open = ll_file_open,
2733 .release = ll_file_release,
2734 .mmap = ll_file_mmap,
2735 .llseek = ll_file_seek,
2736 .splice_read = ll_file_splice_read,
/*
 * File operations table with the same handlers as ll_file_operations plus
 * .flock and .lock, both routed to ll_file_flock().
 * NOTE(review): presumably selected by the "flock" mount option -- confirm
 * at the place where the table is installed.
 */
2741 struct file_operations ll_file_operations_flock = {
2742 .read = ll_file_read,
2743 .READ_METHOD = READ_FUNCTION,
2744 .write = ll_file_write,
2745 .WRITE_METHOD = WRITE_FUNCTION,
2746 .unlocked_ioctl = ll_file_ioctl,
2747 .open = ll_file_open,
2748 .release = ll_file_release,
2749 .mmap = ll_file_mmap,
2750 .llseek = ll_file_seek,
2751 .splice_read = ll_file_splice_read,
2754 .flock = ll_file_flock,
2755 .lock = ll_file_flock
/*
 * File operations table with the same handlers as ll_file_operations plus
 * .flock and .lock, both routed to ll_file_noflock().
 */
2758 /* These are for -o noflock - to return ENOSYS on flock calls */
2759 struct file_operations ll_file_operations_noflock = {
2760 .read = ll_file_read,
2761 .READ_METHOD = READ_FUNCTION,
2762 .write = ll_file_write,
2763 .WRITE_METHOD = WRITE_FUNCTION,
2764 .unlocked_ioctl = ll_file_ioctl,
2765 .open = ll_file_open,
2766 .release = ll_file_release,
2767 .mmap = ll_file_mmap,
2768 .llseek = ll_file_seek,
2769 .splice_read = ll_file_splice_read,
2772 .flock = ll_file_noflock,
2773 .lock = ll_file_noflock
/*
 * Inode operations table wiring attribute, permission, extended-attribute
 * and ACL callbacks to their llite implementations.
 * NOTE(review): the name suggests it is used for regular-file inodes --
 * confirm at the place where i_op is assigned.
 */
2776 struct inode_operations ll_file_inode_operations = {
2777 .setattr = ll_setattr,
2778 .getattr = ll_getattr,
2779 .permission = ll_inode_permission,
2780 .setxattr = ll_setxattr,
2781 .getxattr = ll_getxattr,
2782 .listxattr = ll_listxattr,
2783 .removexattr = ll_removexattr,
2784 .get_acl = ll_get_acl,
2787 /* dynamic ioctl number support routines */
/*
 * Registry of dynamically registered ioctl handlers: a list head (ioc_head)
 * guarded by a read/write semaphore (ioc_sem).  The single file-local
 * instance, llioc, is initialized statically.
 */
2788 static struct llioc_ctl_data {
2789 struct rw_semaphore ioc_sem;
2790 struct list_head ioc_head;
2792 __RWSEM_INITIALIZER(llioc.ioc_sem),
2793 LIST_HEAD_INIT(llioc.ioc_head)
/* Members of struct llioc_data, one registration record in llioc.ioc_head. */
/* linkage into llioc.ioc_head */
2798 struct list_head iocd_list;
/* total allocation size of this record, reused when it is freed */
2799 unsigned int iocd_size;
/* handler invoked for a matching ioctl command */
2800 llioc_callback_t iocd_cb;
/* number of entries in iocd_cmd[] */
2801 unsigned int iocd_count;
/* trailing array of the ioctl command numbers served by iocd_cb */
2802 unsigned int iocd_cmd[0];
/*
 * Register @cb as the handler for @count ioctl command numbers taken from
 * the array @cmd.
 *
 * The arguments are validated (@cb and @cmd non-NULL, @count within
 * 0..LLIOC_MAX_CMD), then a struct llioc_data with room for @count trailing
 * command numbers is allocated, filled in and appended to llioc.ioc_head
 * under the write side of llioc.ioc_sem.
 *
 * NOTE(review): the return statements are not shown here; presumably the
 * new record is returned as the opaque handle later passed to
 * ll_iocontrol_unregister(), and NULL on bad arguments or allocation
 * failure -- confirm.
 */
2805 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2808 struct llioc_data *in_data = NULL;
2811 if (cb == NULL || cmd == NULL ||
2812 count > LLIOC_MAX_CMD || count < 0)
/* header plus one unsigned int per registered command */
2815 size = sizeof(*in_data) + count * sizeof(unsigned int);
2816 OBD_ALLOC(in_data, size);
2817 if (in_data == NULL)
2820 memset(in_data, 0, sizeof(*in_data));
/* remember the allocation size so the record can be freed later */
2821 in_data->iocd_size = size;
2822 in_data->iocd_cb = cb;
2823 in_data->iocd_count = count;
2824 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2826 down_write(&llioc.ioc_sem);
2827 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2828 up_write(&llioc.ioc_sem);
/*
 * Remove a registration previously added to llioc.ioc_head.
 *
 * The list is walked under the write side of llioc.ioc_sem.  A record is
 * unlinked, the semaphore is dropped and the record is freed using the size
 * stored in iocd_size.  If the walk finishes without that happening, the
 * semaphore is released and a warning mentioning @magic is logged.
 *
 * NOTE(review): the test that selects the record is not shown here;
 * presumably the record address is compared with @magic -- confirm.
 */
2833 void ll_iocontrol_unregister(void *magic)
2835 struct llioc_data *tmp;
2840 down_write(&llioc.ioc_sem);
2841 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* read the size before the record is unlinked and freed */
2843 unsigned int size = tmp->iocd_size;
2845 list_del(&tmp->iocd_list);
2846 up_write(&llioc.ioc_sem);
2848 OBD_FREE(tmp, size);
2852 up_write(&llioc.ioc_sem);
2854 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Export the dynamic ioctl registration interface to other kernel modules. */
2857 EXPORT_SYMBOL(ll_iocontrol_register);
2858 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to the dynamically registered handlers.
 *
 * Every record in llioc.ioc_head is scanned under the read side of
 * llioc.ioc_sem.  When one of a record's command numbers equals @cmd, its
 * callback is invoked with @inode, @file, @cmd, @arg, the record itself and
 * a pointer to the local rc (initially -EINVAL).  A callback result of
 * LLIOC_STOP is tested after the call; presumably it ends the scan.
 *
 * NOTE(review): the tail of the function is not shown here; presumably rc is
 * stored through @rcp and the last callback result (LLIOC_CONT when no
 * handler matched) is returned -- confirm.
 */
2860 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2861 unsigned int cmd, unsigned long arg, int *rcp)
2863 enum llioc_iter ret = LLIOC_CONT;
2864 struct llioc_data *data;
2865 int rc = -EINVAL, i;
2867 down_read(&llioc.ioc_sem);
2868 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2869 for (i = 0; i < data->iocd_count; i++) {
2870 if (cmd != data->iocd_cmd[i])
2873 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2877 if (ret == LLIOC_STOP)
2880 up_read(&llioc.ioc_sem);
/*
 * Apply the configuration request @conf to the cl_object of @inode.
 *
 * The inode's lli_clob is checked for NULL first (the action taken in that
 * case is not shown here).  Otherwise a nested cl environment is obtained,
 * cl_conf_set() is run on lli_clob and the environment is released.  For an
 * OBJECT_CONF_SET request the lock carried in conf->coc_lock must be
 * non-NULL and a layout lock (both asserted); it is then opened up for
 * matching with ldlm_lock_allow_match().
 *
 * Failure to obtain the environment is reported as PTR_ERR(env).
 */
2887 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2889 struct ll_inode_info *lli = ll_i2info(inode);
2890 struct cl_env_nest nest;
2895 if (lli->lli_clob == NULL)
2898 env = cl_env_nested_get(&nest);
2900 RETURN(PTR_ERR(env));
2902 result = cl_conf_set(env, lli->lli_clob, conf);
2903 cl_env_nested_put(&nest, env);
2905 if (conf->coc_opc == OBJECT_CONF_SET) {
2906 struct ldlm_lock *lock = conf->coc_lock;
2908 LASSERT(lock != NULL);
2909 LASSERT(ldlm_has_layout(lock));
2911 /* it can only be allowed to match after layout is
2912 * applied to inode otherwise false layout would be
2913 * seen. Applying layout should happen before dropping
2914 * the intent lock. */
2915 ldlm_lock_allow_match(lock);
/*
 * Make sure @lock carries the layout of @inode in its l_lvb_data.
 *
 * If l_lvb_data is still NULL, the XATTR_NAME_LOV extended attribute is
 * requested with md_getxattr(), sized by ll_get_max_mdsize(), and the
 * returned layout is copied into a freshly allocated buffer that is attached
 * to the lock.
 *
 * Error codes set on the paths shown here:
 *   -EPROTO  the reply has no MDT body, or its eadatasize exceeds the
 *            maximum metadata size obtained above;
 *   -EFAULT  (condition not shown; presumably the EA buffer is missing
 *            from the reply -- TODO confirm);
 *   -ENOMEM  the layout copy could not be allocated.
 */
2921 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
2922 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
2925 struct ll_sb_info *sbi = ll_i2sbi(inode);
2926 struct obd_capa *oc;
2927 struct ptlrpc_request *req;
2928 struct mdt_body *body;
/* layout already attached to the lock (presumably nothing more to do) */
2935 if (lock->l_lvb_data != NULL)
2938 /* if layout lock was granted right away, the layout is returned
2939 * within DLM_LVB of dlm reply; otherwise if the lock was ever
2940 * blocked and then granted via completion ast, we have to fetch
2941 * layout here. Please note that we can't use the LVB buffer in
2942 * completion AST because it doesn't have a large enough buffer */
2943 oc = ll_mdscapa_get(inode);
2944 rc = ll_get_max_mdsize(sbi, &lmmsize);
2946 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
2947 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
2953 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2954 if (body == NULL || body->eadatasize > lmmsize)
2955 GOTO(out, rc = -EPROTO);
/* from here on lmmsize is the actual size of the returned layout */
2957 lmmsize = body->eadatasize;
2958 if (lmmsize == 0) /* empty layout */
2961 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
2963 GOTO(out, rc = -EFAULT);
2965 OBD_ALLOC_LARGE(lvbdata, lmmsize);
2966 if (lvbdata == NULL)
2967 GOTO(out, rc = -ENOMEM);
/* copy the layout out of the reply, then attach it to the lock under the
 * resource lock, but only if no layout has been attached in the meantime */
2969 memcpy(lvbdata, lmm, lmmsize);
2970 lock_res_and_lock(lock);
2971 if (lock->l_lvb_data == NULL) {
2972 lock->l_lvb_data = lvbdata;
2973 lock->l_lvb_len = lmmsize;
2976 unlock_res_and_lock(lock);
/* free the private copy if it is still owned here (presumably lvbdata is
 * cleared once it has been handed to the lock -- TODO confirm) */
2978 if (lvbdata != NULL)
2979 OBD_FREE_LARGE(lvbdata, lmmsize);
2983 ptlrpc_req_finished(req);
2988 /* Apply the layout to the inode. Layout lock is held and will be released in this function. */
/*
 * Apply the layout carried by a held layout lock to @inode.
 *
 * @lockh   handle of the layout lock; must be in use (asserted)
 * @mode    lock mode used when the reference is dropped with
 *          ldlm_lock_decref()
 * @inode   inode to configure
 * @gen     output: layout generation
 * @reconf  whether the layout should be (re)applied when the lock's LVB is
 *          not yet marked ready
 *
 * Outline of the steps shown here: bind the lock to @inode, sample
 * LDLM_FL_LVB_READY, fetch the layout into the lock with ll_layout_fetch(),
 * unpack it with obd_unpackmd(), hand it to ll_layout_conf() as an
 * OBJECT_CONF_SET request, then drop the lock reference.  If setting the
 * layout reported -EBUSY, an OBJECT_CONF_WAIT request is issued afterwards.
 */
2991 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
2992 struct inode *inode, __u32 *gen, bool reconf)
2994 struct ll_inode_info *lli = ll_i2info(inode);
2995 struct ll_sb_info *sbi = ll_i2sbi(inode);
2996 struct ldlm_lock *lock;
2997 struct lustre_md md = { NULL };
2998 struct cl_object_conf conf;
3001 bool wait_layout = false;
3004 LASSERT(lustre_handle_is_used(lockh));
3006 lock = ldlm_handle2lock(lockh);
3007 LASSERT(lock != NULL);
3008 LASSERT(ldlm_has_layout(lock));
3010 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3011 inode, PFID(&lli->lli_fid), reconf);
3013 /* in case this is a caching lock and reinstate with new inode */
3014 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3016 lock_res_and_lock(lock);
3017 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3018 unlock_res_and_lock(lock);
3019 /* checking lvb_ready is racy but this is okay. The worst case is
3020 * that multi processes may configure the file on the same time. */
3021 if (lvb_ready || !reconf) {
3024 /* layout_gen must be valid if layout lock is not
3025 * cancelled and stripe has already set */
3026 *gen = lli->lli_layout_gen;
3032 rc = ll_layout_fetch(inode, lock);
3036 /* for layout lock, lmm is returned in lock's lvb.
3037 * lvb_data is immutable if the lock is held so it's safe to access it
3038 * without res lock. See the description in ldlm_lock_decref_internal()
3039 * for the condition to free lvb_data of layout lock */
3040 if (lock->l_lvb_data != NULL) {
3041 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3042 lock->l_lvb_data, lock->l_lvb_len);
/* NOTE(review): the conditions choosing between the two generation
 * assignments and the error report are not shown here; presumably they
 * depend on the obd_unpackmd() result -- confirm. */
3044 *gen = LL_LAYOUT_GEN_EMPTY;
3046 *gen = md.lsm->lsm_layout_gen;
3049 CERROR("%s: file "DFID" unpackmd error: %d\n",
3050 ll_get_fsname(inode->i_sb, NULL, 0),
3051 PFID(&lli->lli_fid), rc);
3057 /* set layout to file. Unlikely this will fail as old layout was
3058 * surely eliminated */
3059 memset(&conf, 0, sizeof conf);
3060 conf.coc_opc = OBJECT_CONF_SET;
3061 conf.coc_inode = inode;
3062 conf.coc_lock = lock;
3063 conf.u.coc_md = &md;
3064 rc = ll_layout_conf(inode, &conf);
/* the unpacked striping metadata is released after the conf call */
3067 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3069 /* refresh layout failed, need to wait */
3070 wait_layout = rc == -EBUSY;
/* drop the reference taken by ldlm_handle2lock(), then the lock reference
 * held through the handle */
3074 LDLM_LOCK_PUT(lock);
3075 ldlm_lock_decref(lockh, mode);
3077 /* wait for IO to complete if it's still being used. */
3079 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3080 ll_get_fsname(inode->i_sb, NULL, 0),
3081 inode, PFID(&lli->lli_fid));
3083 memset(&conf, 0, sizeof conf);
3084 conf.coc_opc = OBJECT_CONF_WAIT;
3085 conf.coc_inode = inode;
3086 rc = ll_layout_conf(inode, &conf);
3090 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3091 PFID(&lli->lli_fid), rc);
3097 /* This function checks if there exists a LAYOUT lock on the client side,
3098 * or enqueues it if it doesn't have one in cache.
3100 * This function will not hold layout lock so it may be revoked any time after
3101 * this function returns. Any operation that depends on the layout should be redone in that case.
3104 * This function should be called before lov_io_init() to get an uptodate
3105 * layout version, the caller should save the version number and after IO
3106 * is finished, this function should be called again to verify that layout
3107 * is not changed during IO time. */
/*
 * Obtain the current layout generation of @inode into @gen, matching a
 * cached LAYOUT lock or enqueueing a new one.
 *
 * Sequence shown here:
 *  1. *gen is preset from lli_layout_gen; the LL_SBI_LAYOUT_LOCK flag is
 *     tested (presumably the function stops there when layout locks are not
 *     enabled -- TODO confirm).
 *  2. A cached lock is looked up without the mutex; on a hit the layout is
 *     applied through ll_layout_lock_set() with reconf == false.
 *  3. Under lli_layout_mutex the lookup is repeated, this time with
 *     reconf == true.
 *  4. Otherwise an IT_LAYOUT intent is enqueued with md_enqueue() and the
 *     resulting lock is passed to ll_layout_lock_set().
 *
 * Failure to prepare the md_op_data is reported as PTR_ERR(op_data) after
 * releasing the mutex.
 */
3109 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3111 struct ll_inode_info *lli = ll_i2info(inode);
3112 struct ll_sb_info *sbi = ll_i2sbi(inode);
3113 struct md_op_data *op_data;
3114 struct lookup_intent it;
3115 struct lustre_handle lockh;
3117 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3119 .ei_cb_bl = ll_md_blocking_ast,
3120 .ei_cb_cp = ldlm_completion_ast,
3121 .ei_cbdata = NULL };
3125 *gen = lli->lli_layout_gen;
3126 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3130 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3131 LASSERT(S_ISREG(inode->i_mode));
3133 /* mostly layout lock is caching on the local side, so try to match
3134 * it before grabbing layout lock mutex. */
3135 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3136 if (mode != 0) { /* hit cached lock */
3137 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3141 /* better hold lli_layout_mutex to try again otherwise
3142 * it will have starvation problem. */
3145 /* take layout lock mutex to enqueue layout lock exclusively. */
3146 mutex_lock(&lli->lli_layout_mutex);
3149 /* try again. Maybe somebody else has done this. */
3150 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3151 if (mode != 0) { /* hit cached lock */
3152 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3156 mutex_unlock(&lli->lli_layout_mutex);
3160 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3161 0, 0, LUSTRE_OPC_ANY, NULL);
3162 if (IS_ERR(op_data)) {
3163 mutex_unlock(&lli->lli_layout_mutex);
3164 RETURN(PTR_ERR(op_data));
3167 /* have to enqueue one */
3168 memset(&it, 0, sizeof(it));
3169 it.it_op = IT_LAYOUT;
3170 lockh.cookie = 0ULL;
3172 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3173 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3174 PFID(&lli->lli_fid));
3176 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the reply request attached to the intent is released right away */
3178 if (it.d.lustre.it_data != NULL)
3179 ptlrpc_req_finished(it.d.lustre.it_data);
3180 it.d.lustre.it_data = NULL;
3182 ll_finish_md_op_data(op_data);
/* the lock mode is saved and cleared in the intent before
 * ll_intent_drop_lock() is called (presumably so that the lock reference
 * survives for ll_layout_lock_set() below -- TODO confirm) */
3184 mode = it.d.lustre.it_lock_mode;
3185 it.d.lustre.it_lock_mode = 0;
3186 ll_intent_drop_lock(&it);
3189 /* set lock data in case this is a new lock */
3190 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3191 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3195 mutex_unlock(&lli->lli_layout_mutex);