/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
51 #include "../include/cl_object.h"
/*
 * Forward declarations for helpers defined later in this file.
 * NOTE(review): this extracted chunk is missing lines (the embedded
 * original line numbers jump); the ll_lease_close() prototype below is
 * visibly truncated mid-parameter-list -- verify against the full source.
 */
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate the per-open-file private data (struct ll_file_data) from the
 * dedicated slab cache and initialize the write-failure flag.
 * NOTE(review): the NULL-check and return statement are among the lines
 * elided from this extraction -- confirm against the full source.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Return a struct ll_file_data to the slab cache (counterpart of
 * ll_file_data_get()). Elided lines likely include a NULL guard. */
74 static void ll_file_data_put(struct ll_file_data *fd)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the open handle @fh and an MDS capability into @op_data,
 * in preparation for an MDS operation (close/setattr).  If the inode has
 * pending data modifications (LLIF_DATA_MODIFIED) the MDS_DATA_MODIFIED
 * bias is set so the server clears the dirty state.
 * NOTE(review): extraction dropped intermediate lines here.
 */
80 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
94 op_data->op_handle = *fh;
95 op_data->op_capa1 = ll_mdscapa_get(inode);
97 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the subsequent MDS close RPC.  Size/blocks are only sent when SOM
 * (size-on-MDS) is not handled by the server for this regular file.
 * NOTE(review): several lines are elided from this extraction (e.g. the
 * early-return for non-write handles appears truncated).
 */
105 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106 struct obd_client_handle *och)
108 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
110 ATTR_CTIME | ATTR_CTIME_SET;
112 if (!(och->och_flags & FMODE_WRITE))
115 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
118 ll_ioepoch_close(inode, op_data, &och, 0);
121 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122 ll_prep_md_op_data(op_data, inode, NULL, NULL,
123 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a close RPC to the MDS for the open handle @och on @inode.
 * If @data_version is non-NULL this is an HSM release: the bias
 * MDS_HSM_RELEASE, the data version and the lease handle are packed so
 * the MDT can atomically release the file's data.  On success the local
 * LLIF_DATA_MODIFIED flag is cleared if the server acknowledged it, and
 * OST objects referenced by the close reply are destroyed.
 * NOTE(review): chunk is elided -- error paths, 'out' labels and the
 * final return are among the missing lines; verify against full source.
 */
126 static int ll_close_inode_openhandle(struct obd_export *md_exp,
128 struct obd_client_handle *och,
129 const __u64 *data_version)
131 struct obd_export *exp = ll_i2mdexp(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
134 struct obd_device *obd = class_exp2obd(exp);
/* Sanity check on the MDC connection before talking to the server. */
140 * XXX: in case of LMV, is this correct to access
143 CERROR("Invalid MDC connection handle %#llx\n",
144 ll_i2mdexp(inode)->exp_handle.h_cookie);
149 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
151 /* XXX We leak openhandle and request here. */
156 ll_prepare_close(inode, op_data, och);
157 if (data_version != NULL) {
158 /* Pass in data_version implies release. */
159 op_data->op_bias |= MDS_HSM_RELEASE;
160 op_data->op_data_version = *data_version;
161 op_data->op_lease_handle = och->och_lease_handle;
162 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
164 epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
165 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* presumably rc == -EAGAIN branch: SOM attribute update -- elided here */
167 /* This close must have the epoch closed. */
168 LASSERT(epoch_close);
169 /* MDS has instructed us to obtain Size-on-MDS attribute from
170 * OSTs and send setattr to back to MDS. */
171 rc = ll_som_update(inode, op_data);
173 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
178 CERROR("inode %lu mdc close failed: rc = %d\n",
182 /* DATA_MODIFIED flag was successfully sent on close, cancel data
183 * modification flag. */
184 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185 struct ll_inode_info *lli = ll_i2info(inode);
187 spin_lock(&lli->lli_lock);
188 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189 spin_unlock(&lli->lli_lock);
193 rc = ll_objects_destroy(req, inode);
195 CERROR("inode %lu ll_objects destroy: rc = %d\n",
198 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199 struct mdt_body *body;
201 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
202 if (!(body->valid & OBD_MD_FLRELEASED))
206 ll_finish_md_op_data(op_data);
/* If SOM is enabled and the epoch did not close, queue DONE_WRITING. */
209 if (exp_connect_som(exp) && !epoch_close &&
210 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
211 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
213 md_clear_open_replay_data(md_exp, och);
214 /* Free @och if it is not waiting for DONE_WRITING. */
215 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
218 if (req) /* This is close request */
219 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given mode (read/write/exec).
 * Selects the matching och slot and use-count on the inode; if other
 * users still hold the handle the close is skipped, otherwise the handle
 * is detached under lli_och_mutex and closed via
 * ll_close_inode_openhandle().
 * NOTE(review): elided lines include the handle-detach and return paths.
 */
223 int ll_md_real_close(struct inode *inode, fmode_t fmode)
225 struct ll_inode_info *lli = ll_i2info(inode);
226 struct obd_client_handle **och_p;
227 struct obd_client_handle *och;
/* Map fmode to the per-inode open-handle slot and its use counter. */
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: drop a group lock if held, release a
 * leftover lease, close a private open handle (fd_och), decrement the
 * per-mode open counters, and -- only if no cached OPEN DLM lock lets us
 * skip the RPC -- perform the real MDS close.  Finally detach and free
 * the ll_file_data and close the inode capability.
 * NOTE(review): extraction dropped lines (braces/else arms); verify.
 */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
271 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
272 struct lustre_handle lockh;
273 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}};
276 /* clear group lock, if present */
277 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
278 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
280 if (fd->fd_lease_och != NULL) {
283 /* Usually the lease is not released when the
284 * application crashed, we need to release here. */
285 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
286 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
287 PFID(&lli->lli_fid), rc, lease_broken);
289 fd->fd_lease_och = NULL;
292 if (fd->fd_och != NULL) {
293 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
298 /* Let's see if we have good enough OPEN lock on the file and if
299 we can skip talking to MDS */
301 mutex_lock(&lli->lli_och_mutex);
302 if (fd->fd_omode & FMODE_WRITE) {
304 LASSERT(lli->lli_open_fd_write_count);
305 lli->lli_open_fd_write_count--;
306 } else if (fd->fd_omode & FMODE_EXEC) {
308 LASSERT(lli->lli_open_fd_exec_count);
309 lli->lli_open_fd_exec_count--;
312 LASSERT(lli->lli_open_fd_read_count);
313 lli->lli_open_fd_read_count--;
315 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock of the right mode -> must close on MDS. */
317 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
318 LDLM_IBITS, &policy, lockmode, &lockh))
319 rc = ll_md_real_close(inode, fd->fd_omode);
322 LUSTRE_FPRIVATE(file) = NULL;
323 ll_file_data_put(fd);
324 ll_capa_close(inode);
329 /* While this returns an error code, fput() the caller does not, so we need
330 * to make every effort to clean up all of our state here. Also, applications
331 * rarely check close errors and even if an error is returned they will not
332 * re-try the close call.
/*
 * VFS ->release() for Lustre files: tear down remote-ACL bookkeeping on
 * the root inode, stop a statahead thread this fd owns, skip the MDS
 * close entirely for the root inode, clear async write errors on regular
 * files, and finally call ll_md_close().
 * NOTE(review): several lines elided (returns, brace closes); verify.
 */
334 int ll_file_release(struct inode *inode, struct file *file)
336 struct ll_file_data *fd;
337 struct ll_sb_info *sbi = ll_i2sbi(inode);
338 struct ll_inode_info *lli = ll_i2info(inode);
341 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
342 inode->i_generation, inode);
344 #ifdef CONFIG_FS_POSIX_ACL
345 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
346 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
349 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
350 fd->fd_flags &= ~LL_FILE_RMTACL;
351 rct_del(&sbi->ll_rct, current_pid());
352 et_search_free(&sbi->ll_et, current_pid());
357 if (!is_root_inode(inode))
358 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
359 fd = LUSTRE_FPRIVATE(file);
362 /* The last ref on @file, maybe not the owner pid of statahead.
363 * Different processes can open the same dir, "ll_opendir_key" means:
364 * it is me that should stop the statahead thread. */
365 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
366 lli->lli_opendir_pid != 0)
367 ll_stop_statahead(inode, lli->lli_opendir_key);
369 if (is_root_inode(inode)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
375 if (!S_ISDIR(inode->i_mode)) {
376 lov_read_and_clear_async_rc(lli->lli_clob);
377 lli->lli_async_rc = 0;
380 rc = ll_md_close(sbi->ll_md_exp, inode, file);
382 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
383 libcfs_debug_dumplog();
/*
 * Perform an MDS intent-open on @dentry (used e.g. by the NFS export
 * path and by setstripe).  When no striping parameters are passed
 * (lmm == NULL && lmmsize == 0) an OPEN DLM lock is requested as well.
 * On success the intent carries the open reply; the inode is refreshed
 * from it and the returned lock data is attached.
 * NOTE(review): error-path labels/returns are elided in this extraction.
 */
388 static int ll_intent_file_open(struct dentry *dentry, void *lmm,
389 int lmmsize, struct lookup_intent *itp)
391 struct inode *inode = dentry->d_inode;
392 struct ll_sb_info *sbi = ll_i2sbi(inode);
393 struct dentry *parent = dentry->d_parent;
394 const char *name = dentry->d_name.name;
395 const int len = dentry->d_name.len;
396 struct md_op_data *op_data;
397 struct ptlrpc_request *req;
398 __u32 opc = LUSTRE_OPC_ANY;
401 /* Usually we come here only for NFSD, and we want open lock.
402 But we can also get here with pre 2.6.15 patchless kernels, and in
403 that case that lock is also ok */
404 /* We can also get here if there was cached open handle in revalidate_it
405 * but it disappeared while we were getting from there to ll_file_open.
406 * But this means this file was closed and immediately opened which
407 * makes a good candidate for using OPEN lock */
408 /* If lmmsize & lmm are not 0, we are just setting stripe info
409 * parameters. No need for the open lock */
410 if (lmm == NULL && lmmsize == 0) {
411 itp->it_flags |= MDS_OPEN_LOCK;
412 if (itp->it_flags & FMODE_WRITE)
413 opc = LUSTRE_OPC_CREATE;
416 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
420 return PTR_ERR(op_data);
422 itp->it_flags |= MDS_OPEN_BY_FID;
423 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
424 0 /*unused */, &req, ll_md_blocking_ast, 0);
425 ll_finish_md_op_data(op_data);
427 /* reason for keep own exit path - don`t flood log
428 * with messages with -ESTALE errors.
/* ESTALE handling: drop the unneeded open handle and retry -- elided. */
430 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
431 it_open_error(DISP_OPEN_OPEN, itp))
433 ll_release_openhandle(inode, itp);
437 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
442 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
443 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
444 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
448 rc = ll_prep_inode(&inode, req, NULL, itp);
449 if (!rc && itp->d.lustre.it_lock_mode)
450 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
453 ptlrpc_req_finished(req);
454 ll_intent_drop_lock(itp);
460 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
461 * not believe attributes if a few ioepoch holders exist. Attributes for
462 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly granted (non-zero, different) IO epoch on the inode. */
464 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
466 if (ioepoch && lli->lli_ioepoch != ioepoch) {
467 lli->lli_ioepoch = ioepoch;
468 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
469 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT open reply carried in the
 * intent: server file handle, FID, lease lock handle, magic and the
 * open flags; then register the handle for open replay.
 */
473 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
474 struct obd_client_handle *och)
476 struct ptlrpc_request *req = it->d.lustre.it_data;
477 struct mdt_body *body;
479 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
480 och->och_fh = body->handle;
481 och->och_fid = body->fid1;
482 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
483 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
484 och->och_flags = it->it_flags;
486 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish an open locally: optionally fill @och from the intent reply and
 * open the reply's IO epoch, then attach @fd as the file's private data,
 * init readahead state and record the effective open mode.
 * NOTE(review): the branch structure around the och!=NULL case is partly
 * elided in this extraction.
 */
489 static int ll_local_open(struct file *file, struct lookup_intent *it,
490 struct ll_file_data *fd, struct obd_client_handle *och)
492 struct inode *inode = file_inode(file);
493 struct ll_inode_info *lli = ll_i2info(inode);
495 LASSERT(!LUSTRE_FPRIVATE(file));
500 struct ptlrpc_request *req = it->d.lustre.it_data;
501 struct mdt_body *body;
504 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
508 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
509 ll_ioepoch_open(lli, body->ioepoch);
512 LUSTRE_FPRIVATE(file) = fd;
513 ll_readahead_init(inode, &fd->fd_ras);
514 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
518 /* Open a file, and (for the very first open) create objects on the OSTs at
519 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
520 * creation or open until ll_lov_setstripe() ioctl is called.
522 * If we already have the stripe MD locally then we don't request it in
523 * md_open(), by passing a lmm_size = 0.
525 * It is up to the application to ensure no other processes open this file
526 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
527 * used. We might be able to avoid races of that sort by getting lli_open_sem
528 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
529 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point.  Reuses a cached MDS open handle when one of
 * the matching mode exists; otherwise builds an IT_OPEN intent from
 * f_flags and performs the intent-open, then finishes with
 * ll_local_open().  Directory opens may claim statahead ownership.
 * NOTE(review): this extraction is missing many lines (restart loop,
 * error labels, returns); comments below hedge accordingly.
 */
531 int ll_file_open(struct inode *inode, struct file *file)
533 struct ll_inode_info *lli = ll_i2info(inode);
534 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
535 .it_flags = file->f_flags };
536 struct obd_client_handle **och_p = NULL;
537 __u64 *och_usecount = NULL;
538 struct ll_file_data *fd;
539 int rc = 0, opendir_set = 0;
541 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
542 inode->i_generation, inode, file->f_flags);
544 it = file->private_data; /* XXX: compat macro */
545 file->private_data = NULL; /* prevent ll_local_open assertion */
547 fd = ll_file_data_get();
/* First opener of a directory becomes the statahead owner. */
554 if (S_ISDIR(inode->i_mode)) {
555 spin_lock(&lli->lli_sa_lock);
556 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
557 lli->lli_opendir_pid == 0) {
558 lli->lli_opendir_key = fd;
559 lli->lli_opendir_pid = current_pid();
562 spin_unlock(&lli->lli_sa_lock);
565 if (is_root_inode(inode)) {
566 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: synthesize one from f_flags. */
570 if (!it || !it->d.lustre.it_disposition) {
571 /* Convert f_flags into access mode. We cannot use file->f_mode,
572 * because everything but O_ACCMODE mask was stripped from
574 if ((oit.it_flags + 1) & O_ACCMODE)
576 if (file->f_flags & O_TRUNC)
577 oit.it_flags |= FMODE_WRITE;
579 /* kernel only call f_op->open in dentry_open. filp_open calls
580 * dentry_open after call to open_namei that checks permissions.
581 * Only nfsd_open call dentry_open directly without checking
582 * permissions and because of that this code below is safe. */
583 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
584 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
586 /* We do not want O_EXCL here, presumably we opened the file
587 * already? XXX - NFS implications? */
588 oit.it_flags &= ~O_EXCL;
590 /* bug20584, if "it_flags" contains O_CREAT, the file will be
591 * created if necessary, then "IT_CREAT" should be set to keep
592 * consistent with it */
593 if (oit.it_flags & O_CREAT)
594 oit.it_op |= IT_CREAT;
600 /* Let's see if we have file open on MDS already. */
601 if (it->it_flags & FMODE_WRITE) {
602 och_p = &lli->lli_mds_write_och;
603 och_usecount = &lli->lli_open_fd_write_count;
604 } else if (it->it_flags & FMODE_EXEC) {
605 och_p = &lli->lli_mds_exec_och;
606 och_usecount = &lli->lli_open_fd_exec_count;
608 och_p = &lli->lli_mds_read_och;
609 och_usecount = &lli->lli_open_fd_read_count;
612 mutex_lock(&lli->lli_och_mutex);
613 if (*och_p) { /* Open handle is present */
614 if (it_disposition(it, DISP_OPEN_OPEN)) {
615 /* Well, there's extra open request that we do not need,
616 let's close it somehow. This will decref request. */
617 rc = it_open_error(DISP_OPEN_OPEN, it);
619 mutex_unlock(&lli->lli_och_mutex);
623 ll_release_openhandle(inode, it);
627 rc = ll_local_open(file, it, fd, NULL);
630 mutex_unlock(&lli->lli_och_mutex);
/* No cached handle: obtain a fresh open from the MDS. */
634 LASSERT(*och_usecount == 0);
635 if (!it->d.lustre.it_disposition) {
636 /* We cannot just request lock handle now, new ELC code
637 means that one of other OPEN locks for this file
638 could be cancelled, and since blocking ast handler
639 would attempt to grab och_mutex as well, that would
640 result in a deadlock */
641 mutex_unlock(&lli->lli_och_mutex);
642 it->it_create_mode |= M_CHECK_STALE;
643 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
644 it->it_create_mode &= ~M_CHECK_STALE;
650 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
658 /* md_intent_lock() didn't get a request ref if there was an
659 * open error, so don't do cleanup on the request here
661 /* XXX (green): Should not we bail out on any error here, not
662 * just open error? */
663 rc = it_open_error(DISP_OPEN_OPEN, it);
667 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
669 rc = ll_local_open(file, it, fd, *och_p);
673 mutex_unlock(&lli->lli_och_mutex);
676 /* Must do this outside lli_och_mutex lock to prevent deadlock where
677 different kind of OPEN lock for this same inode gets cancelled
678 by ldlm_cancel_lru */
679 if (!S_ISREG(inode->i_mode))
/* Delayed OST object creation: nothing to do yet for this open. */
684 if (!lli->lli_has_smd &&
685 (cl_is_lov_delay_create(file->f_flags) ||
686 (file->f_mode & FMODE_WRITE) == 0)) {
687 CDEBUG(D_INODE, "object creation was delayed\n");
690 cl_lov_delay_create_clear(&file->f_flags);
/* Error path: free a handle allocated above and undo statahead claim. */
695 if (och_p && *och_p) {
696 OBD_FREE(*och_p, sizeof(struct obd_client_handle));
697 *och_p = NULL; /* OBD_FREE writes some magic there */
700 mutex_unlock(&lli->lli_och_mutex);
703 if (opendir_set != 0)
704 ll_stop_statahead(inode, lli->lli_opendir_key);
706 ll_file_data_put(fd);
708 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
711 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
712 ptlrpc_req_finished(it->d.lustre.it_data);
713 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously; the CANCELING arm is elided in this extraction.
 */
719 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
720 struct ldlm_lock_desc *desc, void *data, int flag)
723 struct lustre_handle lockh;
726 case LDLM_CB_BLOCKING:
727 ldlm_lock2handle(lock, &lockh);
728 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
730 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
734 case LDLM_CB_CANCELING:
742 * Acquire a lease and open the file.
/*
 * Open @inode with a lease of mode @fmode (FMODE_READ or FMODE_WRITE
 * only).  If @file is given, the existing open handle is reused (the
 * caller must be the sole opener) and passed to the MDT via
 * op_data->op_handle so the server treats it as the same owner.  The
 * lease lock is requested with LDLM_FL_NO_LRU | LDLM_FL_EXCL so it is
 * neither LRU-cancelled nor matched by normal opens.
 * NOTE(review): numerous lines (returns, labels, brace closes) are
 * elided from this extraction; verify control flow in the full source.
 */
744 static struct obd_client_handle *
745 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
748 struct lookup_intent it = { .it_op = IT_OPEN };
749 struct ll_sb_info *sbi = ll_i2sbi(inode);
750 struct md_op_data *op_data;
751 struct ptlrpc_request *req;
752 struct lustre_handle old_handle = { 0 };
753 struct obd_client_handle *och = NULL;
757 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
758 return ERR_PTR(-EINVAL);
/* Reuse path: borrow the openhandle already held by @file. */
761 struct ll_inode_info *lli = ll_i2info(inode);
762 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
763 struct obd_client_handle **och_p;
766 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
767 return ERR_PTR(-EPERM);
769 /* Get the openhandle of the file */
771 mutex_lock(&lli->lli_och_mutex);
772 if (fd->fd_lease_och != NULL) {
773 mutex_unlock(&lli->lli_och_mutex);
777 if (fd->fd_och == NULL) {
778 if (file->f_mode & FMODE_WRITE) {
779 LASSERT(lli->lli_mds_write_och != NULL);
780 och_p = &lli->lli_mds_write_och;
781 och_usecount = &lli->lli_open_fd_write_count;
783 LASSERT(lli->lli_mds_read_och != NULL);
784 och_p = &lli->lli_mds_read_och;
785 och_usecount = &lli->lli_open_fd_read_count;
787 if (*och_usecount == 1) {
794 mutex_unlock(&lli->lli_och_mutex);
795 if (rc < 0) /* more than 1 opener */
798 LASSERT(fd->fd_och != NULL);
799 old_handle = fd->fd_och->och_fh;
802 och = kzalloc(sizeof(*och), GFP_NOFS);
804 return ERR_PTR(-ENOMEM);
806 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
807 LUSTRE_OPC_ANY, NULL);
808 if (IS_ERR(op_data)) {
809 rc = PTR_ERR(op_data);
813 /* To tell the MDT this openhandle is from the same owner */
814 op_data->op_handle = old_handle;
816 it.it_flags = fmode | open_flags;
817 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
818 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
819 ll_md_blocking_lease_ast,
820 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
821 * it can be cancelled which may mislead applications that the lease is
823 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
824 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
825 * doesn't deal with openhandle, so normal openhandle will be leaked. */
826 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
827 ll_finish_md_op_data(op_data);
828 ptlrpc_req_finished(req);
832 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
837 rc = it_open_error(DISP_OPEN_OPEN, &it);
841 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
842 ll_och_fill(sbi->ll_md_exp, &it, och);
844 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
849 /* already get lease, handle lease lock */
850 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
851 if (it.d.lustre.it_lock_mode == 0 ||
852 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
853 /* open lock must return for lease */
854 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
855 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
856 it.d.lustre.it_lock_bits);
861 ll_intent_release(&it);
/* Failure cleanup: close the handle and cancel the lease lock. */
865 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
867 CERROR("Close openhandle returned %d\n", rc2);
869 /* cancel open lock */
870 if (it.d.lustre.it_lock_mode != 0) {
871 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
872 it.d.lustre.it_lock_mode);
873 it.d.lustre.it_lock_mode = 0;
876 ll_intent_release(&it);
883 * Release lease and close the file.
884 * It will check if the lease has ever broken.
/*
 * Cancel the lease lock (noting via *lease_broken whether the server had
 * already broken it) and close the associated open handle on the MDS.
 */
886 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
889 struct ldlm_lock *lock;
890 bool cancelled = true;
893 lock = ldlm_handle2lock(&och->och_lease_handle);
895 lock_res_and_lock(lock);
896 cancelled = ldlm_is_cancel(lock);
897 unlock_res_and_lock(lock);
901 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
902 PFID(&ll_i2info(inode)->lli_fid), cancelled);
905 ldlm_cli_cancel(&och->och_lease_handle, 0);
906 if (lease_broken != NULL)
907 *lease_broken = cancelled;
909 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
914 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch OST attributes for the stripes described by @lsm via an async
 * getattr set; when @sync is set the server-side lock flag
 * (OBD_FL_SRVLOCK) is requested.  On success only the OST-authoritative
 * valid bits are kept in the obdo.
 * NOTE(review): obdo allocation and error returns are elided here.
 */
915 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
916 struct obd_capa *capa, struct obdo *obdo,
917 __u64 ioepoch, int sync)
919 struct ptlrpc_request_set *set;
920 struct obd_info oinfo = { { { 0 } } };
923 LASSERT(lsm != NULL);
927 oinfo.oi_oa->o_oi = lsm->lsm_oi;
928 oinfo.oi_oa->o_mode = S_IFREG;
929 oinfo.oi_oa->o_ioepoch = ioepoch;
930 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
931 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
932 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
933 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
934 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
935 OBD_MD_FLDATAVERSION;
936 oinfo.oi_capa = capa;
938 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
939 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
942 set = ptlrpc_prep_set();
944 CERROR("can't allocate ptlrpc set\n");
947 rc = obd_getattr_async(exp, &oinfo, set);
949 rc = ptlrpc_set_wait(set);
950 ptlrpc_set_destroy(set);
953 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
954 OBD_MD_FLATIME | OBD_MD_FLMTIME |
955 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
956 OBD_MD_FLDATAVERSION);
961 * Performs the getattr on the inode and updates its fields.
962 * If @sync != 0, perform the getattr under the server-side lock.
964 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
965 __u64 ioepoch, int sync)
967 struct obd_capa *capa = ll_mdscapa_get(inode);
968 struct lov_stripe_md *lsm;
971 lsm = ccc_inode_lsm_get(inode);
972 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
973 capa, obdo, ioepoch, sync);
/* On success refresh the VFS inode from the returned obdo. */
976 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
978 obdo_refresh_inode(inode, obdo, obdo->o_valid);
979 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
980 POSTID(oi), i_size_read(inode),
981 (unsigned long long)inode->i_blocks,
982 1UL << inode->i_blkbits);
984 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with the OST-side attributes
 * obtained from the cl_object, taking the most recent of each timestamp,
 * and update the inode's size/blocks under the inode size lock.
 * NOTE(review): local 'lvb' declaration and the error branch around
 * cl_object_attr_get() are elided in this extraction.
 */
990 struct ll_inode_info *lli = ll_i2info(inode);
991 struct cl_object *obj = lli->lli_clob;
992 struct cl_attr *attr = ccc_env_thread_attr(env);
996 ll_inode_size_lock(inode);
997 /* merge timestamps the most recently obtained from mds with
998 timestamps obtained from osts */
999 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1000 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1001 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1003 lvb.lvb_size = i_size_read(inode);
1004 lvb.lvb_blocks = inode->i_blocks;
1005 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1006 lvb.lvb_atime = LTIME_S(inode->i_atime);
1007 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1009 cl_object_attr_lock(obj);
1010 rc = cl_object_attr_get(env, obj, attr);
1011 cl_object_attr_unlock(obj);
/* Keep the newest of each timestamp between MDS and OST views. */
1014 if (lvb.lvb_atime < attr->cat_atime)
1015 lvb.lvb_atime = attr->cat_atime;
1016 if (lvb.lvb_ctime < attr->cat_ctime)
1017 lvb.lvb_ctime = attr->cat_ctime;
1018 if (lvb.lvb_mtime < attr->cat_mtime)
1019 lvb.lvb_mtime = attr->cat_mtime;
1021 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1022 PFID(&lli->lli_fid), attr->cat_size);
1023 cl_isize_write_nolock(inode, attr->cat_size);
1025 inode->i_blocks = attr->cat_blocks;
1027 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1028 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1029 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1031 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch current OST attributes for @lsm and
 * copy size/blocks/times into the caller-supplied stat structure.
 */
1036 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1039 struct obdo obdo = { 0 };
1042 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1044 st->st_size = obdo.o_size;
1045 st->st_blocks = obdo.o_blocks;
1046 st->st_mtime = obdo.o_mtime;
1047 st->st_atime = obdo.o_atime;
1048 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: O_NOATIME,
 * S_NOATIME, IS_NOATIME(), mount flags, and the nodiratime cases for
 * directories.  (The 'return true/false' lines are elided here.)
 */
1053 static bool file_is_noatime(const struct file *file)
1055 const struct vfsmount *mnt = file->f_path.mnt;
1056 const struct inode *inode = file_inode(file);
1058 /* Adapted from file_accessed() and touch_atime().*/
1059 if (file->f_flags & O_NOATIME)
1062 if (inode->i_flags & S_NOATIME)
1065 if (IS_NOATIME(inode))
1068 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1071 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1074 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behaviour, the target cl_object, the lock-request policy (never for
 * nolock files, mandatory for O_APPEND, maybe otherwise) and the
 * noatime decision.
 */
1080 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1082 struct inode *inode = file_inode(file);
1084 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1086 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1087 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1088 file->f_flags & O_DIRECT ||
1091 io->ci_obj = ll_i2info(inode)->lli_clob;
1092 io->ci_lockreq = CILR_MAYBE;
1093 if (ll_file_nolock(file)) {
1094 io->ci_lockreq = CILR_NEVER;
1095 io->ci_no_srvlock = 1;
1096 } else if (file->f_flags & O_APPEND) {
1097 io->ci_lockreq = CILR_MANDATORY;
1100 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine shared by read_iter/write_iter/splice_read.
 * Sets up a cl_io for @iot over [*ppos, *ppos+count), takes
 * lli_write_mutex for non-grouplock writes (lli_trunc_sem for reads),
 * runs cl_io_loop(), advances *ppos by the bytes transferred, and
 * tallies read/write byte stats.  A restart is attempted when the IO
 * needs it and nothing was transferred yet.
 * NOTE(review): the 'restart:' label, out paths and final return are
 * elided in this extraction.
 */
1104 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1105 struct file *file, enum cl_io_type iot,
1106 loff_t *ppos, size_t count)
1108 struct ll_inode_info *lli = ll_i2info(file_inode(file));
1109 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1114 io = ccc_env_thread_io(env);
1115 ll_io_init(io, file, iot == CIT_WRITE);
1117 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1118 struct vvp_io *vio = vvp_env_io(env);
1119 struct ccc_io *cio = ccc_env_io(env);
1120 int write_mutex_locked = 0;
1122 cio->cui_fd = LUSTRE_FPRIVATE(file);
1123 vio->cui_io_subtype = args->via_io_subtype;
1125 switch (vio->cui_io_subtype) {
/* IO_NORMAL case label elided by extraction. */
1127 cio->cui_iter = args->u.normal.via_iter;
1128 cio->cui_iocb = args->u.normal.via_iocb;
1129 if ((iot == CIT_WRITE) &&
1130 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1131 if (mutex_lock_interruptible(&lli->
1133 result = -ERESTARTSYS;
1136 write_mutex_locked = 1;
1137 } else if (iot == CIT_READ) {
1138 down_read(&lli->lli_trunc_sem);
/* IO_SPLICE case label elided by extraction. */
1142 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1143 vio->u.splice.cui_flags = args->u.splice.via_flags;
1146 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1149 result = cl_io_loop(env, io);
1150 if (write_mutex_locked)
1151 mutex_unlock(&lli->lli_write_mutex);
1152 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1153 up_read(&lli->lli_trunc_sem);
1155 /* cl_io_rw_init() handled IO */
1156 result = io->ci_result;
1159 if (io->ci_nob > 0) {
1160 result = io->ci_nob;
1161 *ppos = io->u.ci_wr.wr.crw_pos;
1165 cl_io_fini(env, io);
1166 /* If any bit been read/written (result != 0), we just return
1167 * short read/write instead of restart io. */
1168 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1169 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1170 iot == CIT_READ ? "read" : "write",
1171 file, *ppos, count);
1172 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1176 if (iot == CIT_READ) {
1178 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1179 LPROC_LL_READ_BYTES, result);
1180 } else if (iot == CIT_WRITE) {
1182 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1183 LPROC_LL_WRITE_BYTES, result);
1184 fd->fd_write_failed = false;
1185 } else if (result != -ERESTARTSYS) {
1186 fd->fd_write_failed = true;
/*
 * ->read_iter(): wrap the iov_iter/kiocb into vvp_io_args and delegate
 * to ll_file_io_generic() with CIT_READ.
 */
1193 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1196 struct vvp_io_args *args;
1200 env = cl_env_get(&refcheck);
1202 return PTR_ERR(env);
1204 args = vvp_env_args(env, IO_NORMAL);
1205 args->u.normal.via_iter = to;
1206 args->u.normal.via_iocb = iocb;
1208 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1209 &iocb->ki_pos, iov_iter_count(to));
1210 cl_env_put(env, &refcheck);
1215 * Write to a file (through the page cache).
/* ->write_iter(): same shape as read_iter, with CIT_WRITE. */
1217 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1220 struct vvp_io_args *args;
1224 env = cl_env_get(&refcheck);
1226 return PTR_ERR(env);
1228 args = vvp_env_args(env, IO_NORMAL);
1229 args->u.normal.via_iter = from;
1230 args->u.normal.via_iocb = iocb;
1232 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1233 &iocb->ki_pos, iov_iter_count(from));
1234 cl_env_put(env, &refcheck);
1239 * Send file content (through pagecache) somewhere with helper
/* splice-read path: CIT_READ with the IO_SPLICE argument variant. */
1241 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1242 struct pipe_inode_info *pipe, size_t count,
1246 struct vvp_io_args *args;
1250 env = cl_env_get(&refcheck);
1252 return PTR_ERR(env);
1254 args = vvp_env_args(env, IO_SPLICE);
1255 args->u.splice.via_pipe = pipe;
1256 args->u.splice.via_flags = flags;
1258 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1259 cl_env_put(env, &refcheck);
/*
 * Re-create OST objects for a file's stripes (admin recovery path).
 * Copies the in-memory LSM, marks the obdo with OBD_FL_RECREATE_OBJS and
 * asks the data export to obd_create() under the inode size lock.
 * NOTE(review): elided listing — allocation of 'oa', error checks and the
 * return path are not visible; o_nlink is (ab)used to carry the OST index.
 */
1263 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1265 struct obd_export *exp = ll_i2dtexp(inode);
1266 struct obd_trans_info oti = { 0 };
1267 struct obdo *oa = NULL;
1270 struct lov_stripe_md *lsm = NULL, *lsm2;
1276 lsm = ccc_inode_lsm_get(inode);
/* no objects striped on OSTs -> nothing to recreate (branch body elided) */
1277 if (!lsm_has_objects(lsm)) {
1282 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1283 (lsm->lsm_stripe_count));
1285 OBD_ALLOC_LARGE(lsm2, lsm_size);
1292 oa->o_nlink = ost_idx;
1293 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1294 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1295 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1296 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1297 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1298 memcpy(lsm2, lsm, lsm_size);
/* size lock serializes against concurrent size updates during recreate */
1299 ll_inode_size_lock(inode);
1300 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1301 ll_inode_size_unlock(inode);
1303 OBD_FREE_LARGE(lsm2, lsm_size);
1306 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: admin-only; copies a ll_recreate_obj request
 * from userspace, builds an MDT0 ost_id from it and calls ll_lov_recreate().
 * NOTE(review): the -EPERM / -EFAULT returns are elided in this listing.
 */
1311 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1313 struct ll_recreate_obj ucreat;
1316 if (!capable(CFS_CAP_SYS_ADMIN))
1319 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1323 ostid_set_seq_mdt0(&oi);
1324 ostid_set_id(&oi, ucreat.lrc_id);
1325 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID handler: admin-only; converts a user-supplied lu_fid
 * to an ost_id and derives the OST index from bits 16..31 of the sequence.
 */
1328 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1334 if (!capable(CFS_CAP_SYS_ADMIN))
1337 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1340 fid_to_ostid(&fid, &oi);
1341 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1342 return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Set the striping EA on a file by replaying an IT_OPEN intent that carries
 * the lov_user_md. Fails early if a layout already exists on the inode.
 * NOTE(review): elided listing — error labels between the intent open, the
 * openhandle release and the request cleanup are not visible here.
 */
1345 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1346 int flags, struct lov_user_md *lum, int lum_size)
1348 struct lov_stripe_md *lsm = NULL;
1349 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1352 lsm = ccc_inode_lsm_get(inode);
/* a stripe EA can only be set once; existing lsm means -EEXIST path */
1354 ccc_inode_lsm_put(inode, lsm);
1355 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1361 ll_inode_size_lock(inode);
1362 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1365 rc = oit.d.lustre.it_status;
/* close the open handle the intent created; we only wanted the setstripe */
1369 ll_release_openhandle(inode, &oit);
1372 ll_inode_size_unlock(inode);
1373 ll_intent_release(&oit);
1374 ccc_inode_lsm_put(inode, lsm);
1378 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) for 'filename' from the MDS via
 * md_getattr_name() and byte-swap it for userspace on big-endian hosts.
 * Returns the lmm buffer (owned by *request) and its size to the caller.
 * NOTE(review): elided listing — early returns between validation steps
 * (-ENODATA for no EA, -EPROTO for bad magic) are not visible here.
 */
1382 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1383 struct lov_mds_md **lmmp, int *lmm_size,
1384 struct ptlrpc_request **request)
1386 struct ll_sb_info *sbi = ll_i2sbi(inode);
1387 struct mdt_body *body;
1388 struct lov_mds_md *lmm = NULL;
1389 struct ptlrpc_request *req = NULL;
1390 struct md_op_data *op_data;
1393 rc = ll_get_default_mdsize(sbi, &lmmsize);
1397 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1398 strlen(filename), lmmsize,
1399 LUSTRE_OPC_ANY, NULL);
1400 if (IS_ERR(op_data))
1401 return PTR_ERR(op_data);
1403 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1404 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1405 ll_finish_md_op_data(op_data);
1407 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1412 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1413 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1415 lmmsize = body->eadatasize;
1417 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1423 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1424 LASSERT(lmm != NULL);
1426 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1427 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1433 * This is coming from the MDS, so is probably in
1434 * little endian. We convert it to host endian before
1435 * passing it to userspace.
/* only swab on big-endian hosts (LOV_MAGIC != its LE representation) */
1437 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1440 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1441 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1444 /* if function called for directory - we should
1445 * avoid swab not existent lsm objects */
1446 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1447 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1448 if (S_ISREG(body->mode))
1449 lustre_swab_lov_user_md_objects(
1450 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1452 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1453 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1454 if (S_ISREG(body->mode))
1455 lustre_swab_lov_user_md_objects(
1456 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1463 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only; copies a lov_user_md (with one
 * ost_data slot) from userspace and applies it via ll_lov_setstripe_ea_info()
 * with MDS_OPEN_HAS_OBJS, i.e. the objects are supplied, not allocated.
 */
1468 static int ll_lov_setea(struct inode *inode, struct file *file,
1471 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1472 struct lov_user_md *lump;
1473 int lum_size = sizeof(struct lov_user_md) +
1474 sizeof(struct lov_user_ost_data);
1477 if (!capable(CFS_CAP_SYS_ADMIN))
1480 OBD_ALLOC_LARGE(lump, lum_size);
1484 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1485 OBD_FREE_LARGE(lump, lum_size);
1489 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
/* clear O_LOV_DELAY_CREATE now that the layout has been (attempted) set */
1491 cl_lov_delay_create_clear(&file->f_flags);
1493 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: reads a v1 lov_user_md first (smaller),
 * upgrades to v3 if the magic says so, sets the layout, then echoes the
 * server-chosen striping back to userspace via LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): elided listing — -EFAULT returns after copy_from_user and
 * the success-branch brace structure are not visible.
 */
1497 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1500 struct lov_user_md_v3 lumv3;
1501 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1502 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1503 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1505 int flags = FMODE_WRITE;
1507 /* first try with v1 which is smaller than v3 */
1508 lum_size = sizeof(struct lov_user_md_v1);
1509 if (copy_from_user(lumv1, lumv1p, lum_size))
1512 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1513 lum_size = sizeof(struct lov_user_md_v3);
1514 if (copy_from_user(&lumv3, lumv3p, lum_size))
1518 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1520 cl_lov_delay_create_clear(&file->f_flags);
1522 struct lov_stripe_md *lsm;
/* zero the user's stripe_count so GETSTRIPE below fills in the real one */
1525 put_user(0, &lumv1p->lmm_stripe_count);
1527 ll_layout_refresh(inode, &gen);
1528 lsm = ccc_inode_lsm_get(inode);
1529 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1530 0, lsm, (void *)arg);
1531 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: grab the inode's LSM reference and let the
 * LOV layer serialize it to the userspace buffer at 'arg'.
 */
1536 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1538 struct lov_stripe_md *lsm;
1541 lsm = ccc_inode_lsm_get(inode);
1543 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1545 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group lock with group id
 * 'arg' on behalf of this open file. Only one group lock per fd; the flag
 * and the grouplock handle are published under lli->lli_lock, and a race
 * between two threads acquiring for the same fd is detected after enqueue.
 * NOTE(review): elided listing — the -EINVAL/-EOPNOTSUPP/-EINVAL returns
 * after the early checks are not visible in this view.
 */
1550 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1552 struct ll_inode_info *lli = ll_i2info(inode);
1553 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1554 struct ccc_grouplock grouplock;
1558 CWARN("group id for group lock must not be 0\n");
1562 if (ll_file_nolock(file))
1565 spin_lock(&lli->lli_lock);
1566 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1567 CWARN("group lock already existed with gid %lu\n",
1568 fd->fd_grouplock.cg_gid);
1569 spin_unlock(&lli->lli_lock);
1572 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1573 spin_unlock(&lli->lli_lock);
/* enqueue outside the spinlock; cl_get_grouplock() may block */
1575 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1576 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1580 spin_lock(&lli->lli_lock);
1581 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1582 spin_unlock(&lli->lli_lock);
1583 CERROR("another thread just won the race\n");
1584 cl_put_grouplock(&grouplock);
1588 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1589 fd->fd_grouplock = grouplock;
1590 spin_unlock(&lli->lli_lock);
1592 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this fd,
 * verifying the caller-supplied gid matches the one recorded at lock time.
 * State is cleared under lli->lli_lock; the actual lock release happens
 * outside the spinlock on a local copy of the grouplock handle.
 */
1596 static int ll_put_grouplock(struct inode *inode, struct file *file,
1599 struct ll_inode_info *lli = ll_i2info(inode);
1600 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1601 struct ccc_grouplock grouplock;
1603 spin_lock(&lli->lli_lock);
1604 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1605 spin_unlock(&lli->lli_lock);
1606 CWARN("no group lock held\n");
1609 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1611 if (fd->fd_grouplock.cg_gid != arg) {
1612 CWARN("group lock %lu doesn't match current id %lu\n",
1613 arg, fd->fd_grouplock.cg_gid);
1614 spin_unlock(&lli->lli_lock);
1618 grouplock = fd->fd_grouplock;
1619 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1620 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1621 spin_unlock(&lli->lli_lock);
1623 cl_put_grouplock(&grouplock);
1624 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1629 * Close inode open handle
1631 * \param inode [in] inode in question
1632 * \param it [in,out] intent which contains open info and result
1635 * \retval <0 failure
/*
 * Closes the MDS open handle carried by a lookup intent (used when an intent
 * open succeeded but no struct file will keep the handle, e.g. setstripe).
 * NOTE(review): elided listing — och allocation failure handling and the
 * final return are not visible here.
 */
1637 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1639 struct obd_client_handle *och;
1644 /* Root ? Do nothing. */
1645 if (is_root_inode(inode))
1648 /* No open handle to close? Move away */
1649 if (!it_disposition(it, DISP_OPEN_OPEN))
1652 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1654 och = kzalloc(sizeof(*och), GFP_NOFS);
1660 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1662 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1665 /* this one is in place of ll_file_open */
1666 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1667 ptlrpc_req_finished(it->d.lustre.it_data);
1668 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1674 * Get size for inode for which FIEMAP mapping is requested.
1675 * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: validates flags, optionally flushes dirty
 * pages (FIEMAP_FLAG_SYNC), glimpses the size if unknown, then queries the
 * OSTs through obd_get_info(KEY_FIEMAP). Short-circuits to zero extents
 * for a zero-length file.
 */
1677 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1680 struct obd_export *exp = ll_i2dtexp(inode);
1681 struct lov_stripe_md *lsm = NULL;
1682 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1683 __u32 vallen = num_bytes;
1686 /* Checks for fiemap flags */
1687 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support before failing */
1688 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1692 /* Check for FIEMAP_FLAG_SYNC */
1693 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1694 rc = filemap_fdatawrite(inode->i_mapping);
1699 lsm = ccc_inode_lsm_get(inode);
1703 /* If the stripe_count > 1 and the application does not understand
1704 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1706 if (lsm->lsm_stripe_count > 1 &&
1707 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1712 fm_key.oa.o_oi = lsm->lsm_oi;
1713 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1715 if (i_size_read(inode) == 0) {
1716 rc = ll_glimpse_size(inode);
1721 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1722 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1723 /* If filesize is 0, then there would be no objects for mapping */
1724 if (fm_key.oa.o_size == 0) {
1725 fiemap->fm_mapped_extents = 0;
1730 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1732 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1735 CERROR("obd_get_info failed: rc = %d\n", rc);
1738 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Permission: CAP_DAC_READ_SEARCH or the user_fid2path mount flag. The
 * output buffer is sized by the user-supplied gf_pathlen (capped at
 * PATH_MAX) appended to the fixed getinfo_fid2path header.
 */
1742 int ll_fid2path(struct inode *inode, void __user *arg)
1744 struct obd_export *exp = ll_i2mdexp(inode);
1745 const struct getinfo_fid2path __user *gfin = arg;
1746 struct getinfo_fid2path *gfout;
1751 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1752 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1755 /* Only need to get the buflen */
1756 if (get_user(pathlen, &gfin->gf_pathlen))
1759 if (pathlen > PATH_MAX)
1762 outsize = sizeof(*gfout) + pathlen;
1764 gfout = kzalloc(outsize, GFP_NOFS);
1768 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1773 /* Call mdc_iocontrol */
1774 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1778 if (copy_to_user(arg, gfout, outsize))
1782 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: userspace marshalling around ll_do_fiemap().
 * Reads fm_extent_count first to size the kernel buffer (with an explicit
 * SIZE_MAX overflow guard), copies the header (and first extent, used to
 * continue a previous mapping), runs the mapping, and copies back the
 * header plus mapped extents.
 */
1786 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1788 struct ll_user_fiemap *fiemap_s;
1789 size_t num_bytes, ret_bytes;
1790 unsigned int extent_count;
1793 /* Get the extent count so we can calculate the size of
1794 * required fiemap buffer */
1795 if (get_user(extent_count,
1796 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1800 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1802 num_bytes = sizeof(*fiemap_s) + (extent_count *
1803 sizeof(struct ll_fiemap_extent));
1805 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1806 if (fiemap_s == NULL)
1809 /* get the fiemap value */
1810 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1811 sizeof(*fiemap_s))) {
1816 /* If fm_extent_count is non-zero, read the first extent since
1817 * it is used to calculate end_offset and device from previous
1820 if (copy_from_user(&fiemap_s->fm_extents[0],
1821 (char __user *)arg + sizeof(*fiemap_s),
1822 sizeof(struct ll_fiemap_extent))) {
1828 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1832 ret_bytes = sizeof(struct ll_user_fiemap);
1834 if (extent_count != 0)
1835 ret_bytes += (fiemap_s->fm_mapped_extents *
1836 sizeof(struct ll_fiemap_extent));
1838 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1842 OBD_FREE_LARGE(fiemap_s, num_bytes);
1847 * Read the data_version for inode.
1849 * This value is computed using stripe object version on OST.
1850 * Version is computed using server side locking.
1852 * @param extent_lock Take extent lock. Not needed if a process is already
1853 * holding the OST object group locks.
/*
 * Fetches the OST-side data version via ll_lsm_getattr(). A file with no
 * striped objects reports version 0; -EOPNOTSUPP presumably results when
 * the OST does not return OBD_MD_FLDATAVERSION (elided branch).
 */
1855 int ll_data_version(struct inode *inode, __u64 *data_version,
1858 struct lov_stripe_md *lsm = NULL;
1859 struct ll_sb_info *sbi = ll_i2sbi(inode);
1860 struct obdo *obdo = NULL;
1863 /* If no stripe, we consider version is 0. */
1864 lsm = ccc_inode_lsm_get(inode);
1865 if (!lsm_has_objects(lsm)) {
1867 CDEBUG(D_INODE, "No object for inode\n");
1872 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1878 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1880 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1883 *data_version = obdo->o_data_version;
1888 ccc_inode_lsm_put(inode, lsm);
1893 * Trigger a HSM release request for the provided inode.
/*
 * HSM release: take a write lease (MDS_OPEN_RELEASE), capture the latest
 * data_version and merged [mac]times, then close the open handle with the
 * release flag so the MDS can drop the OST objects atomically. The lease
 * lock handle is intentionally kept until mdc_hsm_release_pack().
 */
1895 int ll_hsm_release(struct inode *inode)
1897 struct cl_env_nest nest;
1899 struct obd_client_handle *och = NULL;
1900 __u64 data_version = 0;
1904 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1905 ll_get_fsname(inode->i_sb, NULL, 0),
1906 PFID(&ll_i2info(inode)->lli_fid));
1908 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1914 /* Grab latest data_version and [am]time values */
1915 rc = ll_data_version(inode, &data_version, 1);
1919 env = cl_env_nested_get(&nest);
1925 ll_merge_lvb(env, inode);
1926 cl_env_nested_put(&nest, env);
1928 /* Release the file.
1929 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1930 * we still need it to pack l_remote_handle to MDT. */
1931 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1937 if (och != NULL && !IS_ERR(och)) /* close the file */
1938 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): both inodes, saved iattrs for
 * restoring [ma]times, data versions and their check flags. Kept in one
 * heap struct because the fields may be pairwise swapped to sequentialize
 * the two files by FID.
 */
1943 struct ll_swap_stack {
1944 struct iattr ia1, ia2;
1946 struct inode *inode1, *inode2;
1947 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the layouts
 * of two regular files on the same filesystem. Ordering by FID prevents
 * deadlock; optional group locks flush dirty cache; optional data-version
 * checks abort if either file changed; [ma]times are saved and restored if
 * the KEEP_* flags ask for it. The actual swap is an MDC iocontrol with a
 * mdc_swap_layouts payload in md_op_data->op_data.
 * NOTE(review): elided listing — several error returns/gotos between the
 * visible checks are missing from this view.
 */
1950 static int ll_swap_layouts(struct file *file1, struct file *file2,
1951 struct lustre_swap_layouts *lsl)
1953 struct mdc_swap_layouts msl;
1954 struct md_op_data *op_data;
1957 struct ll_swap_stack *llss = NULL;
1960 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1964 llss->inode1 = file_inode(file1);
1965 llss->inode2 = file_inode(file2);
1967 if (!S_ISREG(llss->inode2->i_mode)) {
1972 if (inode_permission(llss->inode1, MAY_WRITE) ||
1973 inode_permission(llss->inode2, MAY_WRITE)) {
1978 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1983 /* we use 2 bool because it is easier to swap than 2 bits */
1984 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1985 llss->check_dv1 = true;
1987 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1988 llss->check_dv2 = true;
1990 /* we cannot use lsl->sl_dvX directly because we may swap them */
1991 llss->dv1 = lsl->sl_dv1;
1992 llss->dv2 = lsl->sl_dv2;
1994 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1995 if (rc == 0) /* same file, done! */ {
/* order by FID so concurrent swaps of the same pair cannot deadlock */
2000 if (rc < 0) { /* sequentialize it */
2001 swap(llss->inode1, llss->inode2);
2003 swap(llss->dv1, llss->dv2);
2004 swap(llss->check_dv1, llss->check_dv2);
2008 if (gid != 0) { /* application asks to flush dirty cache */
2009 rc = ll_get_grouplock(llss->inode1, file1, gid);
2013 rc = ll_get_grouplock(llss->inode2, file2, gid);
2015 ll_put_grouplock(llss->inode1, file1, gid);
2020 /* to be able to restore mtime and atime after swap
2021 * we need to first save them */
2023 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2024 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2025 llss->ia1.ia_atime = llss->inode1->i_atime;
2026 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2027 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2028 llss->ia2.ia_atime = llss->inode2->i_atime;
2029 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2032 /* ultimate check, before swapping the layouts we check if
2033 * dataversion has changed (if requested) */
2034 if (llss->check_dv1) {
2035 rc = ll_data_version(llss->inode1, &dv, 0);
2038 if (dv != llss->dv1) {
2044 if (llss->check_dv2) {
2045 rc = ll_data_version(llss->inode2, &dv, 0);
2048 if (dv != llss->dv2) {
2054 /* struct md_op_data is used to send the swap args to the mdt
2055 * only flags is missing, so we use struct mdc_swap_layouts
2056 * through the md_op_data->op_data */
2057 /* flags from user space have to be converted before they are send to
2058 * server, no flag is sent today, they are only used on the client */
2061 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2062 0, LUSTRE_OPC_ANY, &msl);
2063 if (IS_ERR(op_data)) {
2064 rc = PTR_ERR(op_data);
2068 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2069 sizeof(*op_data), op_data, NULL);
2070 ll_finish_md_op_data(op_data);
2074 ll_put_grouplock(llss->inode2, file2, gid);
2075 ll_put_grouplock(llss->inode1, file1, gid);
2078 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2082 /* clear useless flags */
2083 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2084 llss->ia1.ia_valid &= ~ATTR_MTIME;
2085 llss->ia2.ia_valid &= ~ATTR_MTIME;
2088 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2089 llss->ia1.ia_valid &= ~ATTR_ATIME;
2090 llss->ia2.ia_valid &= ~ATTR_ATIME;
2093 /* update time if requested */
/* note ia2 applies to file1 and ia1 to file2: times follow the layouts */
2095 if (llss->ia2.ia_valid != 0) {
2096 mutex_lock(&llss->inode1->i_mutex);
2097 rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2098 mutex_unlock(&llss->inode1->i_mutex);
2101 if (llss->ia1.ia_valid != 0) {
2104 mutex_lock(&llss->inode2->i_mutex);
2105 rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2106 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on an inode through the MDC. Unprivileged
 * callers may only touch bits inside HSM_USER_MASK; anything else needs
 * CAP_SYS_ADMIN. The hsm_state_set payload rides in md_op_data.
 */
2118 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2120 struct md_op_data *op_data;
2123 /* Non-root users are forbidden to set or clear flags which are
2124 * NOT defined in HSM_USER_MASK. */
2125 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2126 !capable(CFS_CAP_SYS_ADMIN))
2129 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2130 LUSTRE_OPC_ANY, hss);
2131 if (IS_ERR(op_data))
2132 return PTR_ERR(op_data);
2134 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2135 sizeof(*op_data), op_data, NULL);
2137 ll_finish_md_op_data(op_data);
/*
 * HSM import: register an already-archived file. Marks the file
 * ARCHIVED|EXISTS|RELEASED via ll_hsm_state_set(), then forces the
 * user-supplied mode/uid/gid/size/times onto the inode with
 * ll_setattr_raw(..., hsm_import=true) under i_mutex. Regular files only.
 */
2142 static int ll_hsm_import(struct inode *inode, struct file *file,
2143 struct hsm_user_import *hui)
2145 struct hsm_state_set *hss = NULL;
2146 struct iattr *attr = NULL;
2150 if (!S_ISREG(inode->i_mode))
2154 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2160 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2161 hss->hss_archive_id = hui->hui_archive_id;
2162 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2163 rc = ll_hsm_state_set(inode, hss);
2167 attr = kzalloc(sizeof(*attr), GFP_NOFS);
/* mask mode to permission bits only, then force regular-file type */
2173 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2174 attr->ia_mode |= S_IFREG;
2175 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2176 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2177 attr->ia_size = hui->hui_size;
2178 attr->ia_mtime.tv_sec = hui->hui_mtime;
2179 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2180 attr->ia_atime.tv_sec = hui->hui_atime;
2181 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2183 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2184 ATTR_UID | ATTR_GID |
2185 ATTR_MTIME | ATTR_MTIME_SET |
2186 ATTR_ATIME | ATTR_ATIME_SET;
2188 mutex_lock(&inode->i_mutex);
2190 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2194 mutex_unlock(&inode->i_mutex);
/*
 * Main ioctl dispatcher for Lustre regular files (->unlocked_ioctl).
 * Routes each LL_IOC_* / FSFILT_IOC_* / OBD_IOC_* command to the matching
 * helper in this file; unknown commands fall through to the dynamic
 * ll_iocontrol_call() registry and finally to the data-device iocontrol.
 * NOTE(review): elided listing — the switch statement opening, many error
 * returns and 'break's between cases are missing from this view.
 */
2207 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2209 struct inode *inode = file_inode(file);
2210 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2213 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2214 inode->i_generation, inode, cmd);
2215 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2217 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2218 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2222 case LL_IOC_GETFLAGS:
2223 /* Get the current value of the file flags */
2224 return put_user(fd->fd_flags, (int *)arg);
2225 case LL_IOC_SETFLAGS:
2226 case LL_IOC_CLRFLAGS:
2227 /* Set or clear specific file flags */
2228 /* XXX This probably needs checks to ensure the flags are
2229 * not abused, and to handle any flag side effects.
2231 if (get_user(flags, (int *) arg))
2234 if (cmd == LL_IOC_SETFLAGS) {
2235 if ((flags & LL_FILE_IGNORE_LOCK) &&
2236 !(file->f_flags & O_DIRECT)) {
2237 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2242 fd->fd_flags |= flags;
2244 fd->fd_flags &= ~flags;
2247 case LL_IOC_LOV_SETSTRIPE:
2248 return ll_lov_setstripe(inode, file, arg);
2249 case LL_IOC_LOV_SETEA:
2250 return ll_lov_setea(inode, file, arg);
2251 case LL_IOC_LOV_SWAP_LAYOUTS: {
2253 struct lustre_swap_layouts lsl;
2255 if (copy_from_user(&lsl, (char *)arg,
2256 sizeof(struct lustre_swap_layouts)))
/* both fds must be writable for a layout swap */
2259 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2262 file2 = fget(lsl.sl_fd);
2267 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2268 rc = ll_swap_layouts(file, file2, &lsl);
2272 case LL_IOC_LOV_GETSTRIPE:
2273 return ll_lov_getstripe(inode, arg);
2274 case LL_IOC_RECREATE_OBJ:
2275 return ll_lov_recreate_obj(inode, arg);
2276 case LL_IOC_RECREATE_FID:
2277 return ll_lov_recreate_fid(inode, arg);
2278 case FSFILT_IOC_FIEMAP:
2279 return ll_ioctl_fiemap(inode, arg);
2280 case FSFILT_IOC_GETFLAGS:
2281 case FSFILT_IOC_SETFLAGS:
2282 return ll_iocontrol(inode, file, cmd, arg);
2283 case FSFILT_IOC_GETVERSION_OLD:
2284 case FSFILT_IOC_GETVERSION:
2285 return put_user(inode->i_generation, (int *)arg);
2286 case LL_IOC_GROUP_LOCK:
2287 return ll_get_grouplock(inode, file, arg);
2288 case LL_IOC_GROUP_UNLOCK:
2289 return ll_put_grouplock(inode, file, arg);
2290 case IOC_OBD_STATFS:
2291 return ll_obd_statfs(inode, (void *)arg);
2293 /* We need to special case any other ioctls we want to handle,
2294 * to send them to the MDS/OST as appropriate and to properly
2295 * network encode the arg field.
2296 case FSFILT_IOC_SETVERSION_OLD:
2297 case FSFILT_IOC_SETVERSION:
2299 case LL_IOC_FLUSHCTX:
2300 return ll_flush_ctx(inode);
2301 case LL_IOC_PATH2FID: {
2302 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2303 sizeof(struct lu_fid)))
2308 case OBD_IOC_FID2PATH:
2309 return ll_fid2path(inode, (void *)arg);
2310 case LL_IOC_DATA_VERSION: {
2311 struct ioc_data_version idv;
2314 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2317 rc = ll_data_version(inode, &idv.idv_version,
2318 !(idv.idv_flags & LL_DV_NOFLUSH));
2320 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2326 case LL_IOC_GET_MDTIDX: {
2329 mdtidx = ll_get_mdt_idx(inode);
2333 if (put_user((int)mdtidx, (int *)arg))
2338 case OBD_IOC_GETDTNAME:
2339 case OBD_IOC_GETMDNAME:
2340 return ll_get_obd_name(inode, cmd, arg);
2341 case LL_IOC_HSM_STATE_GET: {
2342 struct md_op_data *op_data;
2343 struct hsm_user_state *hus;
2346 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2350 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2351 LUSTRE_OPC_ANY, hus);
2352 if (IS_ERR(op_data)) {
2354 return PTR_ERR(op_data);
2357 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2360 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2363 ll_finish_md_op_data(op_data);
2367 case LL_IOC_HSM_STATE_SET: {
2368 struct hsm_state_set *hss;
2371 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2375 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2380 rc = ll_hsm_state_set(inode, hss);
2385 case LL_IOC_HSM_ACTION: {
2386 struct md_op_data *op_data;
2387 struct hsm_current_action *hca;
2390 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2394 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2395 LUSTRE_OPC_ANY, hca);
2396 if (IS_ERR(op_data)) {
2398 return PTR_ERR(op_data);
2401 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2404 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2407 ll_finish_md_op_data(op_data);
2411 case LL_IOC_SET_LEASE: {
2412 struct ll_inode_info *lli = ll_i2info(inode);
2413 struct obd_client_handle *och = NULL;
/* lease mode must be compatible with how the fd was opened */
2419 if (!(file->f_mode & FMODE_WRITE))
2424 if (!(file->f_mode & FMODE_READ))
2429 mutex_lock(&lli->lli_och_mutex);
2430 if (fd->fd_lease_och != NULL) {
2431 och = fd->fd_lease_och;
2432 fd->fd_lease_och = NULL;
2434 mutex_unlock(&lli->lli_och_mutex);
2437 mode = och->och_flags &
2438 (FMODE_READ|FMODE_WRITE);
2439 rc = ll_lease_close(och, inode, &lease_broken);
2440 if (rc == 0 && lease_broken)
2446 /* return the type of lease or error */
2447 return rc < 0 ? rc : (int)mode;
2452 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2454 /* apply for lease */
2455 och = ll_lease_open(inode, file, mode, 0);
2457 return PTR_ERR(och);
2460 mutex_lock(&lli->lli_och_mutex);
2461 if (fd->fd_lease_och == NULL) {
2462 fd->fd_lease_och = och;
2465 mutex_unlock(&lli->lli_och_mutex);
2467 /* impossible now that only excl is supported for now */
2468 ll_lease_close(och, inode, &lease_broken);
2473 case LL_IOC_GET_LEASE: {
2474 struct ll_inode_info *lli = ll_i2info(inode);
2475 struct ldlm_lock *lock = NULL;
2478 mutex_lock(&lli->lli_och_mutex);
2479 if (fd->fd_lease_och != NULL) {
2480 struct obd_client_handle *och = fd->fd_lease_och;
2482 lock = ldlm_handle2lock(&och->och_lease_handle);
2484 lock_res_and_lock(lock);
2485 if (!ldlm_is_cancel(lock))
2486 rc = och->och_flags &
2487 (FMODE_READ | FMODE_WRITE);
2488 unlock_res_and_lock(lock);
2489 ldlm_lock_put(lock);
2492 mutex_unlock(&lli->lli_och_mutex);
2495 case LL_IOC_HSM_IMPORT: {
2496 struct hsm_user_import *hui;
2498 hui = kzalloc(sizeof(*hui), GFP_NOFS);
2502 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2507 rc = ll_hsm_import(inode, file, hui);
/* fallthrough: consult dynamically registered ioctl handlers last */
2516 ll_iocontrol_call(inode, file, cmd, arg, &err))
2519 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * ->llseek hook: for SEEK_END/SEEK_HOLE/SEEK_DATA a glimpse RPC fetches
 * the authoritative size first; generic_file_llseek_size() then does the
 * actual arithmetic bounded by Lustre's max file size.
 */
2526 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2528 struct inode *inode = file_inode(file);
2529 loff_t retval, eof = 0;
/* 'retval' here is only the target offset for the trace message below */
2531 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2532 (origin == SEEK_CUR) ? file->f_pos : 0);
2533 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2534 inode->i_ino, inode->i_generation, inode, retval, retval,
2536 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2538 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2539 retval = ll_glimpse_size(inode);
2542 eof = i_size_read(inode);
2545 retval = generic_file_llseek_size(file, offset, origin,
2546 ll_file_maxbytes(inode), eof);
/*
 * ->flush hook (called on close()): report — and clear — any async
 * writeback errors recorded on the inode or the cl_object, but avoid
 * reporting the same failure twice when the fd already saw it.
 */
2550 static int ll_flush(struct file *file, fl_owner_t id)
2552 struct inode *inode = file_inode(file);
2553 struct ll_inode_info *lli = ll_i2info(inode);
2554 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2557 LASSERT(!S_ISDIR(inode->i_mode));
2559 /* catch async errors that were recorded back when async writeback
2560 * failed for pages in this mapping. */
2561 rc = lli->lli_async_rc;
2562 lli->lli_async_rc = 0;
2563 err = lov_read_and_clear_async_rc(lli->lli_clob);
2567 /* The application has been told write failure already.
2568 * Do not report failure again. */
2569 if (fd->fd_write_failed)
2571 return rc ? -EIO : 0;
2575 * Called to make sure a portion of file has been written out.
2576 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2578 * Return how many pages have been written.
/*
 * Builds and runs a CIT_FSYNC cl_io over [start, end] with the requested
 * fsync mode; on success returns fio->fi_nr_written (pages written).
 * NOTE(review): elided listing — fio->fi_end assignment, the result
 * handling after cl_io_loop and capa release are not visible here.
 */
2580 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2581 enum cl_fsync_mode mode, int ignore_layout)
2583 struct cl_env_nest nest;
2586 struct obd_capa *capa = NULL;
2587 struct cl_fsync_io *fio;
2590 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2591 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2594 env = cl_env_nested_get(&nest);
2596 return PTR_ERR(env);
2598 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2600 io = ccc_env_thread_io(env);
2601 io->ci_obj = cl_i2info(inode)->lli_clob;
2602 io->ci_ignore_layout = ignore_layout;
2604 /* initialize parameters for sync */
2605 fio = &io->u.ci_fsync;
2606 fio->fi_capa = capa;
2607 fio->fi_start = start;
2609 fio->fi_fid = ll_inode2fid(inode);
2610 fio->fi_mode = mode;
2611 fio->fi_nr_written = 0;
2613 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2614 result = cl_io_loop(env, io);
2616 result = io->ci_result;
2618 result = fio->fi_nr_written;
2619 cl_io_fini(env, io);
2620 cl_env_nested_put(&nest, env);
/*
 * ->fsync hook: flush the page cache range, surface any recorded async
 * writeback errors, md_sync() the metadata to the MDS, then (for regular
 * files) force OST_SYNC via cl_sync_file_range(CL_FSYNC_ALL) and update
 * the fd's write_failed state accordingly. All under i_mutex.
 */
2627 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2629 struct inode *inode = file_inode(file);
2630 struct ll_inode_info *lli = ll_i2info(inode);
2631 struct ptlrpc_request *req;
2632 struct obd_capa *oc;
2635 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2636 inode->i_generation, inode);
2637 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2639 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2640 mutex_lock(&inode->i_mutex);
2642 /* catch async errors that were recorded back when async writeback
2643 * failed for pages in this mapping. */
2644 if (!S_ISDIR(inode->i_mode)) {
2645 err = lli->lli_async_rc;
2646 lli->lli_async_rc = 0;
2649 err = lov_read_and_clear_async_rc(lli->lli_clob);
2654 oc = ll_mdscapa_get(inode);
2655 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2661 ptlrpc_req_finished(req);
2663 if (S_ISREG(inode->i_mode)) {
2664 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2666 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2667 if (rc == 0 && err < 0)
2670 fd->fd_write_failed = true;
2672 fd->fd_write_failed = false;
2675 mutex_unlock(&inode->i_mutex);
/*
 * fcntl()/flock() lock handler: translate the VFS file_lock into an
 * LDLM_FLOCK enqueue on the MDT, then mirror the result into the local
 * VFS lock tables so this client's bookkeeping matches the server.
 * NOTE(review): listing has gaps (missing lines); return paths and some
 * case labels are not visible here.
 */
2680 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2682 struct inode *inode = file_inode(file);
2683 struct ll_sb_info *sbi = ll_i2sbi(inode);
2684 struct ldlm_enqueue_info einfo = {
2685 .ei_type = LDLM_FLOCK,
2686 .ei_cb_cp = ldlm_flock_completion_ast,
2687 .ei_cbdata = file_lock,
2689 struct md_op_data *op_data;
2690 struct lustre_handle lockh = {0};
2691 ldlm_policy_data_t flock = {{0}};
2696 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2697 inode->i_ino, file_lock);
2699 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* Only F_SETLK/F_SETLKW are expected for BSD flock; anything that is
 * neither FL_FLOCK nor FL_POSIX is rejected (return in a missing line). */
2701 if (file_lock->fl_flags & FL_FLOCK)
2702 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2703 else if (!(file_lock->fl_flags & FL_POSIX))
/* Fill the LDLM flock policy from the VFS lock description. */
2706 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2707 flock.l_flock.pid = file_lock->fl_pid;
2708 flock.l_flock.start = file_lock->fl_start;
2709 flock.l_flock.end = file_lock->fl_end;
2711 /* Somewhat ugly workaround for svc lockd.
2712 * lockd installs custom fl_lmops->lm_compare_owner that checks
2713 * for the fl_owner to be the same (which it always is on local node
2714 * I guess between lockd processes) and then compares pid.
2715 * As such we assign pid to the owner field to make it all work,
2716 * conflict with normal locks is unlikely since pid space and
2717 * pointer space for current->files are not intersecting */
2718 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2719 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the VFS lock type (F_RDLCK/F_UNLCK/F_WRLCK) onto an LDLM mode. */
2721 switch (file_lock->fl_type) {
2723 einfo.ei_mode = LCK_PR;
2726 /* An unlock request may or may not have any relation to
2727 * existing locks so we may not be able to pass a lock handle
2728 * via a normal ldlm_lock_cancel() request. The request may even
2729 * unlock a byte range in the middle of an existing lock. In
2730 * order to process an unlock request we need all of the same
2731 * information that is given with a normal read or write record
2732 * lock request. To avoid creating another ldlm unlock (cancel)
2733 * message we'll treat a LCK_NL flock request as an unlock. */
2734 einfo.ei_mode = LCK_NL;
2737 einfo.ei_mode = LCK_PW;
2740 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2741 file_lock->fl_type);
/* cmd selects enqueue flags: non-blocking set for F_SETLK-style
 * commands, LDLM_FL_TEST_LOCK for F_GETLK-style queries. */
2756 flags = LDLM_FL_BLOCK_NOWAIT;
2762 flags = LDLM_FL_TEST_LOCK;
2763 /* Save the old mode so that if the mode in the lock changes we
2764 * can decrement the appropriate reader or writer refcount. */
2765 file_lock->fl_type = einfo.ei_mode;
2768 CERROR("unknown fcntl lock command: %d\n", cmd);
2772 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2773 LUSTRE_OPC_ANY, NULL);
2774 if (IS_ERR(op_data))
2775 return PTR_ERR(op_data);
2777 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2778 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2779 flock.l_flock.start, flock.l_flock.end);
/* Take (or test/release) the flock on the MDS. */
2781 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2782 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the server result into the local VFS lock lists. */
2784 if ((file_lock->fl_flags & FL_FLOCK) &&
2785 (rc == 0 || file_lock->fl_type == F_UNLCK))
2786 rc2 = flock_lock_file_wait(file, file_lock);
2787 if ((file_lock->fl_flags & FL_POSIX) &&
2788 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2789 !(flags & LDLM_FL_TEST_LOCK))
2790 rc2 = posix_lock_file_wait(file, file_lock);
/* Server granted the lock but local bookkeeping failed: release the
 * server-side lock by re-enqueueing it as LCK_NL (treated as unlock). */
2792 if (rc2 && file_lock->fl_type != F_UNLCK) {
2793 einfo.ei_mode = LCK_NL;
2794 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2795 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2799 ll_finish_md_op_data(op_data);
/* Lock handler installed for "-o noflock" mounts: file locking is not
 * supported (body not visible in this listing). */
2805 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2811 * test if some locks matching bits and l_req_mode are acquired
2812 * - bits can be in different locks
2813 * - if found clear the common lock bits in *bits
2814 * - the bits not found, are kept in *bits
2816 * \param bits [IN] searched lock bits [IN]
2817 * \param l_req_mode [IN] searched lock mode
2818 * \retval boolean, true iff all bits are found
2820 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2822 struct lustre_handle lockh;
2823 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four normal modes. */
2824 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2825 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2833 fid = &ll_i2info(inode)->lli_fid;
2834 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2835 ldlm_lockname[mode]);
/* LDLM_FL_TEST_LOCK: only probe for a matching lock, never take a new
 * reference on it here; CBPENDING locks are still acceptable matches. */
2837 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested inodebit separately until all are accounted for. */
2838 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2839 policy.l_inodebits.bits = *bits & (1 << i);
2840 if (policy.l_inodebits.bits == 0)
2843 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2844 &policy, mode, &lockh)) {
2845 struct ldlm_lock *lock;
2847 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one
 * (a single IBITS lock can carry several bits). */
2850 ~(lock->l_policy_data.l_inodebits.bits);
2851 LDLM_LOCK_PUT(lock);
2853 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) a cached MDT inode-bits lock covering
 * \a bits in one of the \a mode modes.  On success the matched mode is
 * returned and \a lockh holds the lock; presumably the caller must drop
 * the reference with ldlm_lock_decref() — confirm against callers.
 */
2860 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2861 struct lustre_handle *lockh, __u64 flags,
2864 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2868 fid = &ll_i2info(inode)->lli_fid;
2869 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2871 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2872 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidate RPC: -ENOENT from the MDS
 * means the object was unlinked and is treated as success (after
 * adjusting local state); other errors are logged and propagated.
 */
2877 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2879 /* Already unlinked. Just update nlink and return success */
2880 if (rc == -ENOENT) {
2882 /* This path cannot be hit for regular files unless in
2883 * case of obscure races, so no need to validate size.
2885 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2887 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected in normal operation, so log them at
 * D_INFO instead of D_ERROR. */
2888 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2889 "%s: revalidate FID "DFID" error: rc = %d\n",
2890 ll_get_fsname(inode->i_sb, NULL, 0),
2891 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh inode metadata from the MDS for the lock bits in \a ibits.
 * Two paths: with OBD_CONNECT_ATTRFID an intent getattr/lookup by FID
 * is used (which may also invalidate an unlinked dentry); otherwise a
 * plain md_getattr RPC is issued, but only when no cached MD lock
 * already covers the requested bits.
 */
2897 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2899 struct inode *inode = dentry->d_inode;
2900 struct ptlrpc_request *req = NULL;
2901 struct obd_export *exp;
2904 LASSERT(inode != NULL);
2906 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2907 inode->i_ino, inode->i_generation, inode, dentry);
2909 exp = ll_i2mdexp(inode);
2911 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2912 * But under CMD case, it caused some lock issues, should be fixed
2913 * with new CMD ibits lock. See bug 12718 */
2914 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2915 struct lookup_intent oit = { .it_op = IT_GETATTR };
2916 struct md_op_data *op_data;
/* A pure LOOKUP revalidate does not need attributes. */
2918 if (ibits == MDS_INODELOCK_LOOKUP)
2919 oit.it_op = IT_LOOKUP;
2921 /* Call getattr by fid, so do not provide name at all. */
2922 op_data = ll_prep_md_op_data(NULL, inode,
2924 LUSTRE_OPC_ANY, NULL);
2925 if (IS_ERR(op_data))
2926 return PTR_ERR(op_data);
2928 oit.it_create_mode |= M_CHECK_STALE;
2929 rc = md_intent_lock(exp, op_data, NULL, 0,
2930 /* we are not interested in name
2933 ll_md_blocking_ast, 0);
2934 ll_finish_md_op_data(op_data);
2935 oit.it_create_mode &= ~M_CHECK_STALE;
2937 rc = ll_inode_revalidate_fini(inode, rc);
2941 rc = ll_revalidate_it_finish(req, &oit, inode);
2943 ll_intent_release(&oit);
2947 /* Unlinked? Unhash dentry, so it is not picked up later by
2948 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2949 here to preserve get_cwd functionality on 2.6.
2951 if (!dentry->d_inode->i_nlink)
2952 d_lustre_invalidate(dentry, 0);
2954 ll_lookup_finish_locks(&oit, inode);
2955 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2956 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2957 u64 valid = OBD_MD_FLGETATTR;
2958 struct md_op_data *op_data;
/* Regular files also need striping (EA) data; size the reply buffer
 * for the default MDS EA size. */
2961 if (S_ISREG(inode->i_mode)) {
2962 rc = ll_get_default_mdsize(sbi, &ealen);
2965 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2968 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2969 0, ealen, LUSTRE_OPC_ANY,
2971 if (IS_ERR(op_data))
2972 return PTR_ERR(op_data);
2974 op_data->op_valid = valid;
2975 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2976 * capa for this inode. Because we only keep capas of dirs
2978 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2979 ll_finish_md_op_data(op_data);
2981 rc = ll_inode_revalidate_fini(inode, rc);
/* Apply the returned attributes/EA to the inode. */
2985 rc = ll_prep_inode(&inode, req, NULL, NULL);
2988 ptlrpc_req_finished(req);
/*
 * Revalidate metadata and, for regular files, the size: after the MDS
 * revalidate, glimpse the OSTs unless an HSM restore is in progress
 * (in which case the MDT already supplied the correct size and a
 * glimpse would block until the restore completes).
 */
2992 static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2994 struct inode *inode = dentry->d_inode;
2997 rc = __ll_inode_revalidate(dentry, ibits);
3001 /* if object isn't regular file, don't validate size */
3002 if (!S_ISREG(inode->i_mode)) {
/* Non-regular files: take timestamps straight from the cached LVB. */
3003 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3004 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3005 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3007 /* In case of restore, the MDT has the right size and has
3008 * already send it back without granting the layout lock,
3009 * inode is up-to-date so glimpse is useless.
3010 * Also to glimpse we need the layout, in case of a running
3011 * restore the MDT holds the layout lock so the glimpse will
3012 * block up to the end of restore (getattr will block)
3014 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3015 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr(): revalidate UPDATE|LOOKUP metadata from the MDS,
 * then fill *stat from the (now fresh) inode fields.
 */
3020 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3022 struct inode *inode = de->d_inode;
3023 struct ll_sb_info *sbi = ll_i2sbi(inode);
3024 struct ll_inode_info *lli = ll_i2info(inode);
3027 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3028 MDS_INODELOCK_LOOKUP);
3029 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3034 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an ino that fits in 32 bits; build it from
 * the FID instead of using the native 64-bit i_ino. */
3035 if (ll_need_32bit_api(sbi))
3036 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3038 stat->ino = inode->i_ino;
3039 stat->mode = inode->i_mode;
3040 stat->nlink = inode->i_nlink;
3041 stat->uid = inode->i_uid;
3042 stat->gid = inode->i_gid;
3043 stat->rdev = inode->i_rdev;
3044 stat->atime = inode->i_atime;
3045 stat->mtime = inode->i_mtime;
3046 stat->ctime = inode->i_ctime;
3047 stat->blksize = 1 << inode->i_blkbits;
3049 stat->size = i_size_read(inode);
3050 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap() hook: marshal the kernel fiemap_extent_info into a
 * Lustre ll_user_fiemap buffer (header + extent array), run the mapping
 * via ll_do_fiemap(), and copy the resulting extents back to the caller.
 */
3055 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3056 __u64 start, __u64 len)
3060 struct ll_user_fiemap *fiemap;
3061 unsigned int extent_count = fieinfo->fi_extents_max;
3063 num_bytes = sizeof(*fiemap) + (extent_count *
3064 sizeof(struct ll_fiemap_extent));
3065 OBD_ALLOC_LARGE(fiemap, num_bytes);
3070 fiemap->fm_flags = fieinfo->fi_flags;
3071 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3072 fiemap->fm_start = start;
3073 fiemap->fm_length = len;
/* Only the first extent is copied in: it carries the caller-supplied
 * continuation state, not a full extent array. */
3074 if (extent_count > 0)
3075 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3076 sizeof(struct ll_fiemap_extent));
3078 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy the mapped extents and updated flags back out. */
3080 fieinfo->fi_flags = fiemap->fm_flags;
3081 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3082 if (extent_count > 0)
3083 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3084 fiemap->fm_mapped_extents *
3085 sizeof(struct ll_fiemap_extent));
3087 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl(): return a referenced copy of the POSIX ACL cached in
 * the Lustre inode info, taken under lli_lock.
 */
3091 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3093 struct ll_inode_info *lli = ll_i2info(inode);
3094 struct posix_acl *acl = NULL;
3096 spin_lock(&lli->lli_lock);
3097 /* VFS' acl_permission_check->check_acl will release the refcount */
3098 acl = posix_acl_dup(lli->lli_posix_acl);
3099 spin_unlock(&lli->lli_lock);
/*
 * VFS ->permission(): cannot be answered from RCU-walk context
 * (MAY_NOT_BLOCK) because revalidation may issue RPCs.  The root inode
 * is revalidated first since lookup never validates it; remote-client
 * mounts delegate the check to the MDS, everything else uses
 * generic_permission() on the cached mode/ACL.
 */
3105 int ll_inode_permission(struct inode *inode, int mask)
3109 #ifdef MAY_NOT_BLOCK
3110 if (mask & MAY_NOT_BLOCK)
3114 /* as root inode are NOT getting validated in lookup operation,
3115 * need to do it before permission check. */
3117 if (is_root_inode(inode)) {
3118 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3119 MDS_INODELOCK_LOOKUP);
3124 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3125 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3127 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3128 return lustre_check_remote_perm(inode, mask);
3130 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3131 rc = generic_permission(inode, mask);
/* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no ->flock/->lock methods, so the VFS falls
 * back to purely local (single-client) lock semantics. */
3137 struct file_operations ll_file_operations = {
3138 .read_iter = ll_file_read_iter,
3139 .write_iter = ll_file_write_iter,
3140 .unlocked_ioctl = ll_file_ioctl,
3141 .open = ll_file_open,
3142 .release = ll_file_release,
3143 .mmap = ll_file_mmap,
3144 .llseek = ll_file_seek,
3145 .splice_read = ll_file_splice_read,
/* File operations with cluster-coherent locking: both ->flock and
 * ->lock route through ll_file_flock(), which enqueues on the MDS. */
3150 struct file_operations ll_file_operations_flock = {
3151 .read_iter = ll_file_read_iter,
3152 .write_iter = ll_file_write_iter,
3153 .unlocked_ioctl = ll_file_ioctl,
3154 .open = ll_file_open,
3155 .release = ll_file_release,
3156 .mmap = ll_file_mmap,
3157 .llseek = ll_file_seek,
3158 .splice_read = ll_file_splice_read,
3161 .flock = ll_file_flock,
3162 .lock = ll_file_flock
3165 /* These are for -o noflock - to return ENOSYS on flock calls */
3166 struct file_operations ll_file_operations_noflock = {
3167 .read_iter = ll_file_read_iter,
3168 .write_iter = ll_file_write_iter,
3169 .unlocked_ioctl = ll_file_ioctl,
3170 .open = ll_file_open,
3171 .release = ll_file_release,
3172 .mmap = ll_file_mmap,
3173 .llseek = ll_file_seek,
3174 .splice_read = ll_file_splice_read,
3177 .flock = ll_file_noflock,
3178 .lock = ll_file_noflock
/* Inode operations for regular files. */
3181 struct inode_operations ll_file_inode_operations = {
3182 .setattr = ll_setattr,
3183 .getattr = ll_getattr,
3184 .permission = ll_inode_permission,
3185 .setxattr = ll_setxattr,
3186 .getxattr = ll_getxattr,
3187 .listxattr = ll_listxattr,
3188 .removexattr = ll_removexattr,
3189 .fiemap = ll_fiemap,
3190 .get_acl = ll_get_acl,
3193 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers; the list is
 * protected by the rw_semaphore (readers walk it in ll_iocontrol_call,
 * writers modify it in register/unregister). */
3194 static struct llioc_ctl_data {
3195 struct rw_semaphore ioc_sem;
3196 struct list_head ioc_head;
3198 __RWSEM_INITIALIZER(llioc.ioc_sem),
3199 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it serves
 * (iocd_cmd is a trailing variable-length array, iocd_count entries). */
3204 struct list_head iocd_list;
3205 unsigned int iocd_size;
3206 llioc_callback_t iocd_cb;
3207 unsigned int iocd_count;
3208 unsigned int iocd_cmd[0];
3211 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3214 struct llioc_data *in_data = NULL;
3216 if (cb == NULL || cmd == NULL ||
3217 count > LLIOC_MAX_CMD || count < 0)
3220 size = sizeof(*in_data) + count * sizeof(unsigned int);
3221 in_data = kzalloc(size, GFP_NOFS);
3225 memset(in_data, 0, sizeof(*in_data));
3226 in_data->iocd_size = size;
3227 in_data->iocd_cb = cb;
3228 in_data->iocd_count = count;
3229 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3231 down_write(&llioc.ioc_sem);
3232 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3233 up_write(&llioc.ioc_sem);
3237 EXPORT_SYMBOL(ll_iocontrol_register);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register(); \a magic is the cookie from registration.
 * The entry is unlinked and freed under the write semaphore; if no
 * entry matches, a warning is logged.
 */
3239 void ll_iocontrol_unregister(void *magic)
3241 struct llioc_data *tmp;
3246 down_write(&llioc.ioc_sem);
3247 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before freeing: OBD_FREE needs it. */
3249 unsigned int size = tmp->iocd_size;
3251 list_del(&tmp->iocd_list);
3252 up_write(&llioc.ioc_sem);
3254 OBD_FREE(tmp, size);
3258 up_write(&llioc.ioc_sem);
3260 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/*
 * Dispatch an ioctl to the dynamically registered handlers: walk the
 * registry under the read semaphore, invoke each handler whose command
 * table contains \a cmd, and stop once a handler returns LLIOC_STOP.
 * The handler's status is passed back through \a rcp.
 */
3264 static enum llioc_iter
3265 ll_iocontrol_call(struct inode *inode, struct file *file,
3266 unsigned int cmd, unsigned long arg, int *rcp)
3268 enum llioc_iter ret = LLIOC_CONT;
3269 struct llioc_data *data;
3270 int rc = -EINVAL, i;
3272 down_read(&llioc.ioc_sem);
3273 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3274 for (i = 0; i < data->iocd_count; i++) {
3275 if (cmd != data->iocd_cmd[i])
3278 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3282 if (ret == LLIOC_STOP)
3285 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration change down to the cl_object stack for
 * this inode.  For OBJECT_CONF_SET the layout lock is only allowed to
 * match after the layout has been applied, so a stale layout can never
 * be observed through the lock.
 */
3292 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3294 struct ll_inode_info *lli = ll_i2info(inode);
3295 struct cl_env_nest nest;
3299 if (lli->lli_clob == NULL)
3302 env = cl_env_nested_get(&nest);
3304 return PTR_ERR(env);
3306 result = cl_conf_set(env, lli->lli_clob, conf);
3307 cl_env_nested_put(&nest, env);
3309 if (conf->coc_opc == OBJECT_CONF_SET) {
3310 struct ldlm_lock *lock = conf->coc_lock;
3312 LASSERT(lock != NULL);
3313 LASSERT(ldlm_has_layout(lock));
3315 /* it can only be allowed to match after layout is
3316 * applied to inode otherwise false layout would be
3317 * seen. Applying layout should happen before dropping
3318 * the intent lock. */
3319 ldlm_lock_allow_match(lock);
3325 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3326 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3329 struct ll_sb_info *sbi = ll_i2sbi(inode);
3330 struct obd_capa *oc;
3331 struct ptlrpc_request *req;
3332 struct mdt_body *body;
3338 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3339 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3340 lock->l_lvb_data, lock->l_lvb_len);
/* Nothing to do if the lock already carries a ready LVB (layout). */
3342 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3345 /* if layout lock was granted right away, the layout is returned
3346 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3347 * blocked and then granted via completion ast, we have to fetch
3348 * layout here. Please note that we can't use the LVB buffer in
3349 * completion AST because it doesn't have a large enough buffer */
3350 oc = ll_mdscapa_get(inode);
3351 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA (striping/layout) by FID. */
3353 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3354 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3360 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3366 lmmsize = body->eadatasize;
3367 if (lmmsize == 0) /* empty layout */ {
3372 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
/* Copy the EA into a buffer owned by the lock so it outlives the RPC. */
3378 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3379 if (lvbdata == NULL) {
3384 memcpy(lvbdata, lmm, lmmsize);
3385 lock_res_and_lock(lock);
3386 if (lock->l_lvb_data != NULL)
3387 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3389 lock->l_lvb_data = lvbdata;
3390 lock->l_lvb_len = lmmsize;
3391 unlock_res_and_lock(lock);
3394 ptlrpc_req_finished(req);
3399 * Apply the layout to the inode. Layout lock is held and will be released
3402 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3403 struct inode *inode, __u32 *gen, bool reconf)
3405 struct ll_inode_info *lli = ll_i2info(inode);
3406 struct ll_sb_info *sbi = ll_i2sbi(inode);
3407 struct ldlm_lock *lock;
3408 struct lustre_md md = { NULL };
3409 struct cl_object_conf conf;
3412 bool wait_layout = false;
3414 LASSERT(lustre_handle_is_used(lockh));
3416 lock = ldlm_handle2lock(lockh);
3417 LASSERT(lock != NULL);
3418 LASSERT(ldlm_has_layout(lock));
3420 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3421 inode, PFID(&lli->lli_fid), reconf);
3423 /* in case this is a caching lock and reinstate with new inode */
3424 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3426 lock_res_and_lock(lock);
3427 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3428 unlock_res_and_lock(lock);
3429 /* checking lvb_ready is racy but this is okay. The worst case is
3430 * that multi processes may configure the file on the same time. */
3431 if (lvb_ready || !reconf) {
3434 /* layout_gen must be valid if layout lock is not
3435 * cancelled and stripe has already set */
3436 *gen = ll_layout_version_get(lli);
/* Make sure the lock carries the layout LVB (may RPC to the MDT). */
3442 rc = ll_layout_fetch(inode, lock);
3446 /* for layout lock, lmm is returned in lock's lvb.
3447 * lvb_data is immutable if the lock is held so it's safe to access it
3448 * without res lock. See the description in ldlm_lock_decref_internal()
3449 * for the condition to free lvb_data of layout lock */
3450 if (lock->l_lvb_data != NULL) {
3451 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3452 lock->l_lvb_data, lock->l_lvb_len);
3454 *gen = LL_LAYOUT_GEN_EMPTY;
3456 *gen = md.lsm->lsm_layout_gen;
3459 CERROR("%s: file "DFID" unpackmd error: %d\n",
3460 ll_get_fsname(inode->i_sb, NULL, 0),
3461 PFID(&lli->lli_fid), rc);
3467 /* set layout to file. Unlikely this will fail as old layout was
3468 * surely eliminated */
3469 memset(&conf, 0, sizeof(conf));
3470 conf.coc_opc = OBJECT_CONF_SET;
3471 conf.coc_inode = inode;
3472 conf.coc_lock = lock;
3473 conf.u.coc_md = &md;
3474 rc = ll_layout_conf(inode, &conf);
3477 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3479 /* refresh layout failed, need to wait */
3480 wait_layout = rc == -EBUSY;
3483 LDLM_LOCK_PUT(lock);
3484 ldlm_lock_decref(lockh, mode);
3486 /* wait for IO to complete if it's still being used. */
3488 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3489 ll_get_fsname(inode->i_sb, NULL, 0),
3490 inode, PFID(&lli->lli_fid));
/* OBJECT_CONF_WAIT blocks until in-flight users of the old layout
 * are drained, so the reconfiguration can be retried by the caller. */
3492 memset(&conf, 0, sizeof(conf));
3493 conf.coc_opc = OBJECT_CONF_WAIT;
3494 conf.coc_inode = inode;
3495 rc = ll_layout_conf(inode, &conf);
3499 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3500 PFID(&lli->lli_fid), rc);
3506 * This function checks if there exists a LAYOUT lock on the client side,
3507 * or enqueues it if it doesn't have one in cache.
3509 * This function will not hold layout lock so it may be revoked any time after
3510 * this function returns. Any operations depend on layout should be redone
3513 * This function should be called before lov_io_init() to get an uptodate
3514 * layout version, the caller should save the version number and after IO
3515 * is finished, this function should be called again to verify that layout
3516 * is not changed during IO time.
3518 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3520 struct ll_inode_info *lli = ll_i2info(inode);
3521 struct ll_sb_info *sbi = ll_i2sbi(inode);
3522 struct md_op_data *op_data;
3523 struct lookup_intent it;
3524 struct lustre_handle lockh;
3526 struct ldlm_enqueue_info einfo = {
3527 .ei_type = LDLM_IBITS,
3529 .ei_cb_bl = ll_md_blocking_ast,
3530 .ei_cb_cp = ldlm_completion_ast,
/* Fast path: a valid generation means the layout is already current. */
3534 *gen = ll_layout_version_get(lli);
3535 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3539 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3540 LASSERT(S_ISREG(inode->i_mode));
3542 /* take layout lock mutex to enqueue layout lock exclusively. */
3543 mutex_lock(&lli->lli_layout_mutex);
3546 /* mostly layout lock is caching on the local side, so try to match
3547 * it before grabbing layout lock mutex. */
3548 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3549 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3550 if (mode != 0) { /* hit cached lock */
3551 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3555 mutex_unlock(&lli->lli_layout_mutex);
3559 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3560 0, 0, LUSTRE_OPC_ANY, NULL);
3561 if (IS_ERR(op_data)) {
3562 mutex_unlock(&lli->lli_layout_mutex);
3563 return PTR_ERR(op_data);
3566 /* have to enqueue one */
3567 memset(&it, 0, sizeof(it));
3568 it.it_op = IT_LAYOUT;
3569 lockh.cookie = 0ULL;
3571 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3572 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3573 PFID(&lli->lli_fid));
3575 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The intent request payload is no longer needed once enqueue returns. */
3577 if (it.d.lustre.it_data != NULL)
3578 ptlrpc_req_finished(it.d.lustre.it_data);
3579 it.d.lustre.it_data = NULL;
3581 ll_finish_md_op_data(op_data);
/* Take ownership of the granted lock mode out of the intent before
 * releasing the intent's own lock reference. */
3583 mode = it.d.lustre.it_lock_mode;
3584 it.d.lustre.it_lock_mode = 0;
3585 ll_intent_drop_lock(&it);
3588 /* set lock data in case this is a new lock */
3589 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3590 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3594 mutex_unlock(&lli->lli_layout_mutex);
3600 * This function send a restore request to the MDT
3602 int ll_layout_restore(struct inode *inode)
3604 struct hsm_user_request *hur;
3607 len = sizeof(struct hsm_user_request) +
3608 sizeof(struct hsm_user_item);
3609 hur = kzalloc(len, GFP_NOFS);
3613 hur->hur_request.hr_action = HUA_RESTORE;
3614 hur->hur_request.hr_archive_id = 0;
3615 hur->hur_request.hr_flags = 0;
3616 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3617 sizeof(hur->hur_user_item[0].hui_fid));
3618 hur->hur_user_item[0].hui_extent.length = -1;
3619 hur->hur_request.hr_itemcount = 1;
3620 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,