fs/open.c

   1 /*
   2  *  linux/fs/open.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 #include <linux/string.h>
   8 #include <linux/mm.h>
   9 #include <linux/file.h>
  10 #include <linux/fdtable.h>
  11 #include <linux/fsnotify.h>
  12 #include <linux/module.h>
  13 #include <linux/tty.h>
  14 #include <linux/namei.h>
  15 #include <linux/backing-dev.h>
  16 #include <linux/capability.h>
  17 #include <linux/securebits.h>
  18 #include <linux/security.h>
  19 #include <linux/mount.h>
  20 #include <linux/fcntl.h>
  21 #include <linux/slab.h>
  22 #include <asm/uaccess.h>
  23 #include <linux/fs.h>
  24 #include <linux/personality.h>
  25 #include <linux/pagemap.h>
  26 #include <linux/syscalls.h>
  27 #include <linux/rcupdate.h>
  28 #include <linux/audit.h>
  29 #include <linux/falloc.h>
  30 #include <linux/fs_struct.h>
  31 #include <linux/ima.h>
  32 #include <linux/dnotify.h>
  33
  34 #include "internal.h"
  35
  36 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
  37         struct file *filp)
  38 {
  39         int ret;
  40         struct iattr newattrs;
  41
  42         /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
  43         if (length < 0)
  44                 return -EINVAL;
  45
  46         newattrs.ia_size = length;
  47         newattrs.ia_valid = ATTR_SIZE | time_attrs;
  48         if (filp) {
  49                 newattrs.ia_file = filp;
  50                 newattrs.ia_valid |= ATTR_FILE;
  51         }
  52
  53         /* Remove suid/sgid on truncate too */
  54         ret = should_remove_suid(dentry);
  55         if (ret)
  56                 newattrs.ia_valid |= ret | ATTR_FORCE;
  57
  58         mutex_lock(&dentry->d_inode->i_mutex);
  59         ret = notify_change(dentry, &newattrs);
  60         mutex_unlock(&dentry->d_inode->i_mutex);
  61         return ret;
  62 }
  63
  64 long vfs_truncate(struct path *path, loff_t length)
  65 {
  66         struct inode *inode;
  67         long error;
  68
  69         inode = path->dentry->d_inode;
  70
  71         /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
  72         if (S_ISDIR(inode->i_mode))
  73                 return -EISDIR;
  74         if (!S_ISREG(inode->i_mode))
  75                 return -EINVAL;
  76
  77         error = mnt_want_write(path->mnt);
  78         if (error)
  79                 goto out;
  80
  81         error = inode_permission(inode, MAY_WRITE);
  82         if (error)
  83                 goto mnt_drop_write_and_out;
  84
  85         error = -EPERM;
  86         if (IS_APPEND(inode))
  87                 goto mnt_drop_write_and_out;
  88
  89         error = get_write_access(inode);
  90         if (error)
  91                 goto mnt_drop_write_and_out;
  92
  93         /*
  94          * Make sure that there are no leases.  get_write_access() protects
  95          * against the truncate racing with a lease-granting setlease().
  96          */
  97         error = break_lease(inode, O_WRONLY);
  98         if (error)
  99                 goto put_write_and_out;
 100
 101         error = locks_verify_truncate(inode, NULL, length);
 102         if (!error)
 103                 error = security_path_truncate(path);
 104         if (!error)
 105                 error = do_truncate(path->dentry, length, 0, NULL);
 106
 107 put_write_and_out:
 108         put_write_access(inode);
 109 mnt_drop_write_and_out:
 110         mnt_drop_write(path->mnt);
 111 out:
 112         return error;
 113 }
 114 EXPORT_SYMBOL_GPL(vfs_truncate);
 115
 116 static long do_sys_truncate(const char __user *pathname, loff_t length)
 117 {
 118         unsigned int lookup_flags = LOOKUP_FOLLOW;
 119         struct path path;
 120         int error;
 121
 122         if (length < 0) /* sorry, but loff_t says... */
 123                 return -EINVAL;
 124
 125 retry:
 126         error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 127         if (!error) {
 128                 error = vfs_truncate(&path, length);
 129                 path_put(&path);
 130         }
 131         if (retry_estale(error, lookup_flags)) {
 132                 lookup_flags |= LOOKUP_REVAL;
 133                 goto retry;
 134         }
 135         return error;
 136 }
 137
 138 SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 139 {
 140         return do_sys_truncate(path, length);
 141 }
 142
 143 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 144 {
 145         struct inode *inode;
 146         struct dentry *dentry;
 147         struct fd f;
 148         int error;
 149
 150         error = -EINVAL;
 151         if (length < 0)
 152                 goto out;
 153         error = -EBADF;
 154         f = fdget(fd);
 155         if (!f.file)
 156                 goto out;
 157
 158         /* explicitly opened as large or we are on 64-bit box */
 159         if (f.file->f_flags & O_LARGEFILE)
 160                 small = 0;
 161
 162         dentry = f.file->f_path.dentry;
 163         inode = dentry->d_inode;
 164         error = -EINVAL;
 165         if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
 166                 goto out_putf;
 167
 168         error = -EINVAL;
 169         /* Cannot ftruncate over 2^31 bytes without large file support */
 170         if (small && length > MAX_NON_LFS)
 171                 goto out_putf;
 172
 173         error = -EPERM;
 174         if (IS_APPEND(inode))
 175                 goto out_putf;
 176
 177         sb_start_write(inode->i_sb);
 178         error = locks_verify_truncate(inode, f.file, length);
 179         if (!error)
 180                 error = security_path_truncate(&f.file->f_path);
 181         if (!error)
 182                 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
 183         sb_end_write(inode->i_sb);
 184 out_putf:
 185         fdput(f);
 186 out:
 187         return error;
 188 }
 189
 190 SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
 191 {
 192         long ret = do_sys_ftruncate(fd, length, 1);
 193         /* avoid REGPARM breakage on x86: */
 194         asmlinkage_protect(2, ret, fd, length);
 195         return ret;
 196 }
 197
 198 /* LFS versions of truncate are only needed on 32 bit machines */
 199 #if BITS_PER_LONG == 32
 200 SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
 201 {
 202         return do_sys_truncate(path, length);
 203 }
 204 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 205 asmlinkage long SyS_truncate64(long path, loff_t length)
 206 {
 207         return SYSC_truncate64((const char __user *) path, length);
 208 }
 209 SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
 210 #endif
 211
 212 SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
 213 {
 214         long ret = do_sys_ftruncate(fd, length, 0);
 215         /* avoid REGPARM breakage on x86: */
 216         asmlinkage_protect(2, ret, fd, length);
 217         return ret;
 218 }
 219 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 220 asmlinkage long SyS_ftruncate64(long fd, loff_t length)
 221 {
 222         return SYSC_ftruncate64((unsigned int) fd, length);
 223 }
 224 SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
 225 #endif
 226 #endif /* BITS_PER_LONG == 32 */
 227
 228
 229 int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 230 {
 231         struct inode *inode = file->f_path.dentry->d_inode;
 232         long ret;
 233
 234         if (offset < 0 || len <= 0)
 235                 return -EINVAL;
 236
 237         /* Return error if mode is not supported */
 238         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 239                 return -EOPNOTSUPP;
 240
 241         /* Punch hole must have keep size set */
 242         if ((mode & FALLOC_FL_PUNCH_HOLE) &&
 243             !(mode & FALLOC_FL_KEEP_SIZE))
 244                 return -EOPNOTSUPP;
 245
 246         if (!(file->f_mode & FMODE_WRITE))
 247                 return -EBADF;
 248
 249         /* It's not possible punch hole on append only file */
 250         if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
 251                 return -EPERM;
 252
 253         if (IS_IMMUTABLE(inode))
 254                 return -EPERM;
 255
 256         /*
 257          * Revalidate the write permissions, in case security policy has
 258          * changed since the files were opened.
 259          */
 260         ret = security_file_permission(file, MAY_WRITE);
 261         if (ret)
 262                 return ret;
 263
 264         if (S_ISFIFO(inode->i_mode))
 265                 return -ESPIPE;
 266
 267         /*
 268          * Let individual file system decide if it supports preallocation
 269          * for directories or not.
 270          */
 271         if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
 272                 return -ENODEV;
 273
 274         /* Check for wrap through zero too */
 275         if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
 276                 return -EFBIG;
 277
 278         if (!file->f_op->fallocate)
 279                 return -EOPNOTSUPP;
 280
 281         sb_start_write(inode->i_sb);
 282         ret = file->f_op->fallocate(file, mode, offset, len);
 283         sb_end_write(inode->i_sb);
 284         return ret;
 285 }
 286
 287 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 288 {
 289         struct fd f = fdget(fd);
 290         int error = -EBADF;
 291
 292         if (f.file) {
 293                 error = do_fallocate(f.file, mode, offset, len);
 294                 fdput(f);
 295         }
 296         return error;
 297 }
 298
 299 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 300 asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
 301 {
 302         return SYSC_fallocate((int)fd, (int)mode, offset, len);
 303 }
 304 SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
 305 #endif
 306
 307 /*
 308  * access() needs to use the real uid/gid, not the effective uid/gid.
 309  * We do this by temporarily clearing all FS-related capabilities and
 310  * switching the fsuid/fsgid around to the real ones.
 311  */
 312 SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 313 {
 314         const struct cred *old_cred;
 315         struct cred *override_cred;
 316         struct path path;
 317         struct inode *inode;
 318         int res;
 319
 320         if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
 321                 return -EINVAL;
 322
 323         override_cred = prepare_creds();
 324         if (!override_cred)
 325                 return -ENOMEM;
 326
 327         override_cred->fsuid = override_cred->uid;
 328         override_cred->fsgid = override_cred->gid;
 329
 330         if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 331                 /* Clear the capabilities if we switch to a non-root user */
 332                 kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
 333                 if (!uid_eq(override_cred->uid, root_uid))
 334                         cap_clear(override_cred->cap_effective);
 335                 else
 336                         override_cred->cap_effective =
 337                                 override_cred->cap_permitted;
 338         }
 339
 340         old_cred = override_creds(override_cred);
 341
 342         res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
 343         if (res)
 344                 goto out;
 345
 346         inode = path.dentry->d_inode;
 347
 348         if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 349                 /*
 350                  * MAY_EXEC on regular files is denied if the fs is mounted
 351                  * with the "noexec" flag.
 352                  */
 353                 res = -EACCES;
 354                 if (path.mnt->mnt_flags & MNT_NOEXEC)
 355                         goto out_path_release;
 356         }
 357
 358         res = inode_permission(inode, mode | MAY_ACCESS);
 359         /* SuS v2 requires we report a read only fs too */
 360         if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 361                 goto out_path_release;
 362         /*
 363          * This is a rare case where using __mnt_is_readonly()
 364          * is OK without a mnt_want/drop_write() pair.  Since
 365          * no actual write to the fs is performed here, we do
 366          * not need to telegraph to that to anyone.
 367          *
 368          * By doing this, we accept that this access is
 369          * inherently racy and know that the fs may change
 370          * state before we even see this result.
 371          */
 372         if (__mnt_is_readonly(path.mnt))
 373                 res = -EROFS;
 374
 375 out_path_release:
 376         path_put(&path);
 377 out:
 378         revert_creds(old_cred);
 379         put_cred(override_cred);
 380         return res;
 381 }
 382
 383 SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 384 {
 385         return sys_faccessat(AT_FDCWD, filename, mode);
 386 }
 387
 388 SYSCALL_DEFINE1(chdir, const char __user *, filename)
 389 {
 390         struct path path;
 391         int error;
 392
 393         error = user_path_dir(filename, &path);
 394         if (error)
 395                 goto out;
 396
 397         error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 398         if (error)
 399                 goto dput_and_out;
 400
 401         set_fs_pwd(current->fs, &path);
 402
 403 dput_and_out:
 404         path_put(&path);
 405 out:
 406         return error;
 407 }
 408
 409 SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 410 {
 411         struct fd f = fdget_raw(fd);
 412         struct inode *inode;
 413         int error = -EBADF;
 414
 415         error = -EBADF;
 416         if (!f.file)
 417                 goto out;
 418
 419         inode = f.file->f_path.dentry->d_inode;
 420
 421         error = -ENOTDIR;
 422         if (!S_ISDIR(inode->i_mode))
 423                 goto out_putf;
 424
 425         error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 426         if (!error)
 427                 set_fs_pwd(current->fs, &f.file->f_path);
 428 out_putf:
 429         fdput(f);
 430 out:
 431         return error;
 432 }
 433
 434 SYSCALL_DEFINE1(chroot, const char __user *, filename)
 435 {
 436         struct path path;
 437         int error;
 438
 439         error = user_path_dir(filename, &path);
 440         if (error)
 441                 goto out;
 442
 443         error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 444         if (error)
 445                 goto dput_and_out;
 446
 447         error = -EPERM;
 448         if (!nsown_capable(CAP_SYS_CHROOT))
 449                 goto dput_and_out;
 450         error = security_path_chroot(&path);
 451         if (error)
 452                 goto dput_and_out;
 453
 454         set_fs_root(current->fs, &path);
 455         error = 0;
 456 dput_and_out:
 457         path_put(&path);
 458 out:
 459         return error;
 460 }
 461
 462 static int chmod_common(struct path *path, umode_t mode)
 463 {
 464         struct inode *inode = path->dentry->d_inode;
 465         struct iattr newattrs;
 466         int error;
 467
 468         error = mnt_want_write(path->mnt);
 469         if (error)
 470                 return error;
 471         mutex_lock(&inode->i_mutex);
 472         error = security_path_chmod(path, mode);
 473         if (error)
 474                 goto out_unlock;
 475         newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 476         newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 477         error = notify_change(path->dentry, &newattrs);
 478 out_unlock:
 479         mutex_unlock(&inode->i_mutex);
 480         mnt_drop_write(path->mnt);
 481         return error;
 482 }
 483
 484 SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
 485 {
 486         struct file * file;
 487         int err = -EBADF;
 488
 489         file = fget(fd);
 490         if (file) {
 491                 audit_inode(NULL, file->f_path.dentry, 0);
 492                 err = chmod_common(&file->f_path, mode);
 493                 fput(file);
 494         }
 495         return err;
 496 }
 497
 498 SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode)
 499 {
 500         struct path path;
 501         int error;
 502
 503         error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
 504         if (!error) {
 505                 error = chmod_common(&path, mode);
 506                 path_put(&path);
 507         }
 508         return error;
 509 }
 510
 511 SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
 512 {
 513         return sys_fchmodat(AT_FDCWD, filename, mode);
 514 }
 515
 516 static int chown_common(struct path *path, uid_t user, gid_t group)
 517 {
 518         struct inode *inode = path->dentry->d_inode;
 519         int error;
 520         struct iattr newattrs;
 521         kuid_t uid;
 522         kgid_t gid;
 523
 524         uid = make_kuid(current_user_ns(), user);
 525         gid = make_kgid(current_user_ns(), group);
 526
 527         newattrs.ia_valid =  ATTR_CTIME;
 528         if (user != (uid_t) -1) {
 529                 if (!uid_valid(uid))
 530                         return -EINVAL;
 531                 newattrs.ia_valid |= ATTR_UID;
 532                 newattrs.ia_uid = uid;
 533         }
 534         if (group != (gid_t) -1) {
 535                 if (!gid_valid(gid))
 536                         return -EINVAL;
 537                 newattrs.ia_valid |= ATTR_GID;
 538                 newattrs.ia_gid = gid;
 539         }
 540         if (!S_ISDIR(inode->i_mode))
 541                 newattrs.ia_valid |=
 542                         ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
 543         mutex_lock(&inode->i_mutex);
 544         error = security_path_chown(path, uid, gid);
 545         if (!error)
 546                 error = notify_change(path->dentry, &newattrs);
 547         mutex_unlock(&inode->i_mutex);
 548
 549         return error;
 550 }
 551
 552 SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 553                 gid_t, group, int, flag)
 554 {
 555         struct path path;
 556         int error = -EINVAL;
 557         int lookup_flags;
 558
 559         if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 560                 goto out;
 561
 562         lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 563         if (flag & AT_EMPTY_PATH)
 564                 lookup_flags |= LOOKUP_EMPTY;
 565         error = user_path_at(dfd, filename, lookup_flags, &path);
 566         if (error)
 567                 goto out;
 568         error = mnt_want_write(path.mnt);
 569         if (error)
 570                 goto out_release;
 571         error = chown_common(&path, user, group);
 572         mnt_drop_write(path.mnt);
 573 out_release:
 574         path_put(&path);
 575 out:
 576         return error;
 577 }
 578
 579 SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 580 {
 581         return sys_fchownat(AT_FDCWD, filename, user, group, 0);
 582 }
 583
 584 SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 585 {
 586         return sys_fchownat(AT_FDCWD, filename, user, group,
 587                             AT_SYMLINK_NOFOLLOW);
 588 }
 589
 590 SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 591 {
 592         struct fd f = fdget(fd);
 593         int error = -EBADF;
 594
 595         if (!f.file)
 596                 goto out;
 597
 598         error = mnt_want_write_file(f.file);
 599         if (error)
 600                 goto out_fput;
 601         audit_inode(NULL, f.file->f_path.dentry, 0);
 602         error = chown_common(&f.file->f_path, user, group);
 603         mnt_drop_write_file(f.file);
 604 out_fput:
 605         fdput(f);
 606 out:
 607         return error;
 608 }
 609
 610 /*
 611  * You have to be very careful that these write
 612  * counts get cleaned up in error cases and
 613  * upon __fput().  This should probably never
 614  * be called outside of __dentry_open().
 615  */
 616 static inline int __get_file_write_access(struct inode *inode,
 617                                           struct vfsmount *mnt)
 618 {
 619         int error;
 620         error = get_write_access(inode);
 621         if (error)
 622                 return error;
 623         /*
 624          * Do not take mount writer counts on
 625          * special files since no writes to
 626          * the mount itself will occur.
 627          */
 628         if (!special_file(inode->i_mode)) {
 629                 /*
 630                  * Balanced in __fput()
 631                  */
 632                 error = __mnt_want_write(mnt);
 633                 if (error)
 634                         put_write_access(inode);
 635         }
 636         return error;
 637 }
 638
 639 int open_check_o_direct(struct file *f)
 640 {
 641         /* NB: we're sure to have correct a_ops only after f_op->open */
 642         if (f->f_flags & O_DIRECT) {
 643                 if (!f->f_mapping->a_ops ||
 644                     ((!f->f_mapping->a_ops->direct_IO) &&
 645                     (!f->f_mapping->a_ops->get_xip_mem))) {
 646                         return -EINVAL;
 647                 }
 648         }
 649         return 0;
 650 }
 651
 652 static int do_dentry_open(struct file *f,
 653                           int (*open)(struct inode *, struct file *),
 654                           const struct cred *cred)
 655 {
 656         static const struct file_operations empty_fops = {};
 657         struct inode *inode;
 658         int error;
 659
 660         f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 661                                 FMODE_PREAD | FMODE_PWRITE;
 662
 663         if (unlikely(f->f_flags & O_PATH))
 664                 f->f_mode = FMODE_PATH;
 665
 666         path_get(&f->f_path);
 667         inode = f->f_path.dentry->d_inode;
 668         if (f->f_mode & FMODE_WRITE) {
 669                 error = __get_file_write_access(inode, f->f_path.mnt);
 670                 if (error)
 671                         goto cleanup_file;
 672                 if (!special_file(inode->i_mode))
 673                         file_take_write(f);
 674         }
 675
 676         f->f_mapping = inode->i_mapping;
 677         f->f_pos = 0;
 678         file_sb_list_add(f, inode->i_sb);
 679
 680         if (unlikely(f->f_mode & FMODE_PATH)) {
 681                 f->f_op = &empty_fops;
 682                 return 0;
 683         }
 684
 685         f->f_op = fops_get(inode->i_fop);
 686
 687         error = security_file_open(f, cred);
 688         if (error)
 689                 goto cleanup_all;
 690
 691         error = break_lease(inode, f->f_flags);
 692         if (error)
 693                 goto cleanup_all;
 694
 695         if (!open && f->f_op)
 696                 open = f->f_op->open;
 697         if (open) {
 698                 error = open(inode, f);
 699                 if (error)
 700                         goto cleanup_all;
 701         }
 702         if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 703                 i_readcount_inc(inode);
 704
 705         f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 706
 707         file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
 708
 709         return 0;
 710
 711 cleanup_all:
 712         fops_put(f->f_op);
 713         file_sb_list_del(f);
 714         if (f->f_mode & FMODE_WRITE) {
 715                 put_write_access(inode);
 716                 if (!special_file(inode->i_mode)) {
 717                         /*
 718                          * We don't consider this a real
 719                          * mnt_want/drop_write() pair
 720                          * because it all happenend right
 721                          * here, so just reset the state.
 722                          */
 723                         file_reset_write(f);
 724                         __mnt_drop_write(f->f_path.mnt);
 725                 }
 726         }
 727 cleanup_file:
 728         path_put(&f->f_path);
 729         f->f_path.mnt = NULL;
 730         f->f_path.dentry = NULL;
 731         return error;
 732 }
 733
 734 /**
 735  * finish_open - finish opening a file
 736  * @od: opaque open data
 737  * @dentry: pointer to dentry
 738  * @open: open callback
 739  *
 740  * This can be used to finish opening a file passed to i_op->atomic_open().
 741  *
 742  * If the open callback is set to NULL, then the standard f_op->open()
 743  * filesystem callback is substituted.
 744  */
 745 int finish_open(struct file *file, struct dentry *dentry,
 746                 int (*open)(struct inode *, struct file *),
 747                 int *opened)
 748 {
 749         int error;
 750         BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 751
 752         file->f_path.dentry = dentry;
 753         error = do_dentry_open(file, open, current_cred());
 754         if (!error)
 755                 *opened |= FILE_OPENED;
 756
 757         return error;
 758 }
 759 EXPORT_SYMBOL(finish_open);
 760
 761 /**
 762  * finish_no_open - finish ->atomic_open() without opening the file
 763  *
 764  * @od: opaque open data
 765  * @dentry: dentry or NULL (as returned from ->lookup())
 766  *
 767  * This can be used to set the result of a successful lookup in ->atomic_open().
 768  * The filesystem's atomic_open() method shall return NULL after calling this.
 769  */
 770 int finish_no_open(struct file *file, struct dentry *dentry)
 771 {
 772         file->f_path.dentry = dentry;
 773         return 1;
 774 }
 775 EXPORT_SYMBOL(finish_no_open);
 776
 777 struct file *dentry_open(const struct path *path, int flags,
 778                          const struct cred *cred)
 779 {
 780         int error;
 781         struct file *f;
 782
 783         validate_creds(cred);
 784
 785         /* We must always pass in a valid mount pointer. */
 786         BUG_ON(!path->mnt);
 787
 788         error = -ENFILE;
 789         f = get_empty_filp();
 790         if (f == NULL)
 791                 return ERR_PTR(error);
 792
 793         f->f_flags = flags;
 794         f->f_path = *path;
 795         error = do_dentry_open(f, NULL, cred);
 796         if (!error) {
 797                 error = open_check_o_direct(f);
 798                 if (error) {
 799                         fput(f);
 800                         f = ERR_PTR(error);
 801                 }
 802         } else {
 803                 put_filp(f);
 804                 f = ERR_PTR(error);
 805         }
 806         return f;
 807 }
 808 EXPORT_SYMBOL(dentry_open);
 809
 810 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 811 {
 812         int lookup_flags = 0;
 813         int acc_mode;
 814
 815         if (flags & O_CREAT)
 816                 op->mode = (mode & S_IALLUGO) | S_IFREG;
 817         else
 818                 op->mode = 0;
 819
 820         /* Must never be set by userspace */
 821         flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
 822
 823         /*
 824          * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
 825          * check for O_DSYNC if the need any syncing at all we enforce it's
 826          * always set instead of having to deal with possibly weird behaviour
 827          * for malicious applications setting only __O_SYNC.
 828          */
 829         if (flags & __O_SYNC)
 830                 flags |= O_DSYNC;
 831
 832         /*
 833          * If we have O_PATH in the open flag. Then we
 834          * cannot have anything other than the below set of flags
 835          */
 836         if (flags & O_PATH) {
 837                 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 838                 acc_mode = 0;
 839         } else {
 840                 acc_mode = MAY_OPEN | ACC_MODE(flags);
 841         }
 842
 843         op->open_flag = flags;
 844
 845         /* O_TRUNC implies we need access checks for write permissions */
 846         if (flags & O_TRUNC)
 847                 acc_mode |= MAY_WRITE;
 848
 849         /* Allow the LSM permission hook to distinguish append
 850            access from general write access. */
 851         if (flags & O_APPEND)
 852                 acc_mode |= MAY_APPEND;
 853
 854         op->acc_mode = acc_mode;
 855
 856         op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
 857
 858         if (flags & O_CREAT) {
 859                 op->intent |= LOOKUP_CREATE;
 860                 if (flags & O_EXCL)
 861                         op->intent |= LOOKUP_EXCL;
 862         }
 863
 864         if (flags & O_DIRECTORY)
 865                 lookup_flags |= LOOKUP_DIRECTORY;
 866         if (!(flags & O_NOFOLLOW))
 867                 lookup_flags |= LOOKUP_FOLLOW;
 868         return lookup_flags;
 869 }
 870
 871 /**
 872  * file_open_name - open file and return file pointer
 873  *
 874  * @name:       struct filename containing path to open
 875  * @flags:      open flags as per the open(2) second argument
 876  * @mode:       mode for the new file if O_CREAT is set, else ignored
 877  *
 878  * This is the helper to open a file from kernelspace if you really
 879  * have to.  But in generally you should not do this, so please move
 880  * along, nothing to see here..
 881  */
 882 struct file *file_open_name(struct filename *name, int flags, umode_t mode)
 883 {
 884         struct open_flags op;
 885         int lookup = build_open_flags(flags, mode, &op);
 886         return do_filp_open(AT_FDCWD, name, &op, lookup);
 887 }
 888
 889 /**
 890  * filp_open - open file and return file pointer
 891  *
 892  * @filename:   path to open
 893  * @flags:      open flags as per the open(2) second argument
 894  * @mode:       mode for the new file if O_CREAT is set, else ignored
 895  *
 896  * This is the helper to open a file from kernelspace if you really
 897  * have to.  But in generally you should not do this, so please move
 898  * along, nothing to see here..
 899  */
 900 struct file *filp_open(const char *filename, int flags, umode_t mode)
 901 {
 902         struct filename name = {.name = filename};
 903         return file_open_name(&name, flags, mode);
 904 }
 905 EXPORT_SYMBOL(filp_open);
 906
 907 struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 908                             const char *filename, int flags)
 909 {
 910         struct open_flags op;
 911         int lookup = build_open_flags(flags, 0, &op);
 912         if (flags & O_CREAT)
 913                 return ERR_PTR(-EINVAL);
 914         if (!filename && (flags & O_DIRECTORY))
 915                 if (!dentry->d_inode->i_op->lookup)
 916                         return ERR_PTR(-ENOTDIR);
 917         return do_file_open_root(dentry, mnt, filename, &op, lookup);
 918 }
 919 EXPORT_SYMBOL(file_open_root);
 920
 921 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 922 {
 923         struct open_flags op;
 924         int lookup = build_open_flags(flags, mode, &op);
 925         struct filename *tmp = getname(filename);
 926         int fd = PTR_ERR(tmp);
 927
 928         if (!IS_ERR(tmp)) {
 929                 fd = get_unused_fd_flags(flags);
 930                 if (fd >= 0) {
 931                         struct file *f = do_filp_open(dfd, tmp, &op, lookup);
 932                         if (IS_ERR(f)) {
 933                                 put_unused_fd(fd);
 934                                 fd = PTR_ERR(f);
 935                         } else {
 936                                 fsnotify_open(f);
 937                                 fd_install(fd, f);
 938                         }
 939                 }
 940                 putname(tmp);
 941         }
 942         return fd;
 943 }
 944
 945 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
 946 {
 947         long ret;
 948
 949         if (force_o_largefile())
 950                 flags |= O_LARGEFILE;
 951
 952         ret = do_sys_open(AT_FDCWD, filename, flags, mode);
 953         /* avoid REGPARM breakage on x86: */
 954         asmlinkage_protect(3, ret, filename, flags, mode);
 955         return ret;
 956 }
 957
 958 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
 959                 umode_t, mode)
 960 {
 961         long ret;
 962
 963         if (force_o_largefile())
 964                 flags |= O_LARGEFILE;
 965
 966         ret = do_sys_open(dfd, filename, flags, mode);
 967         /* avoid REGPARM breakage on x86: */
 968         asmlinkage_protect(4, ret, dfd, filename, flags, mode);
 969         return ret;
 970 }
 971
 972 #ifndef __alpha__
 973
 974 /*
 975  * For backward compatibility?  Maybe this should be moved
 976  * into arch/i386 instead?
 977  */
 978 SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
 979 {
 980         return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
 981 }
 982
 983 #endif
 984
 985 /*
 986  * "id" is the POSIX thread ID. We use the
 987  * files pointer for this..
 988  */
 989 int filp_close(struct file *filp, fl_owner_t id)
 990 {
 991         int retval = 0;
 992
 993         if (!file_count(filp)) {
 994                 printk(KERN_ERR "VFS: Close: file count is 0\n");
 995                 return 0;
 996         }
 997
 998         if (filp->f_op && filp->f_op->flush)
 999                 retval = filp->f_op->flush(filp, id);
1000
1001         if (likely(!(filp->f_mode & FMODE_PATH))) {
1002                 dnotify_flush(filp, id);
1003                 locks_remove_posix(filp, id);
1004         }
1005         fput(filp);
1006         return retval;
1007 }
1008
1009 EXPORT_SYMBOL(filp_close);
1010
1011 /*
1012  * Careful here! We test whether the file pointer is NULL before
1013  * releasing the fd. This ensures that one clone task can't release
1014  * an fd while another clone is opening it.
1015  */
1016 SYSCALL_DEFINE1(close, unsigned int, fd)
1017 {
1018         int retval = __close_fd(current->files, fd);
1019
1020         /* can't restart close syscall because file table entry was cleared */
1021         if (unlikely(retval == -ERESTARTSYS ||
1022                      retval == -ERESTARTNOINTR ||
1023                      retval == -ERESTARTNOHAND ||
1024                      retval == -ERESTART_RESTARTBLOCK))
1025                 retval = -EINTR;
1026
1027         return retval;
1028 }
1029 EXPORT_SYMBOL(sys_close);
1030
1031 /*
1032  * This routine simulates a hangup on the tty, to arrange that users
1033  * are given clean terminals at login time.
1034  */
1035 SYSCALL_DEFINE0(vhangup)
1036 {
1037         if (capable(CAP_SYS_TTY_CONFIG)) {
1038                 tty_vhangup_self();
1039                 return 0;
1040         }
1041         return -EPERM;
1042 }
1043
1044 /*
1045  * Called when an inode is about to be open.
1046  * We use this to disallow opening large files on 32bit systems if
1047  * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
1048  * on this flag in sys_open.
1049  */
1050 int generic_file_open(struct inode * inode, struct file * filp)
1051 {
1052         if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1053                 return -EOVERFLOW;
1054         return 0;
1055 }
1056
1057 EXPORT_SYMBOL(generic_file_open);
1058
1059 /*
1060  * This is used by subsystems that don't want seekable
1061  * file descriptors. The function is not supposed to ever fail, the only
1062  * reason it returns an 'int' and not 'void' is so that it can be plugged
1063  * directly into file_operations structure.
1064  */
1065 int nonseekable_open(struct inode *inode, struct file *filp)
1066 {
1067         filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1068         return 0;
1069 }
1070
1071 EXPORT_SYMBOL(nonseekable_open);