Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -399,6 +399,7 @@ utimensat; numa_setaffinity; numa_getaffinity; + fallocate; }; FBSDprivate_1.0 { Index: lib/libc/sys/fallocate.2 =================================================================== --- /dev/null +++ lib/libc/sys/fallocate.2 @@ -0,0 +1,201 @@ +.\" Copyright (c) 1980, 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd December 28, 2015 +.Dt FALLOCATE 2 +.Os +.Sh NAME +.Nm fallocate +.Nd manage storage for a range in a file +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In fcntl.h +.Ft int +.Fn fallocate "int fd" "int mode" "off_t offset" "off_t len" +.Sh DESCRIPTION +This call manipulates allocated storage for a range of offsets in a file. +.Pp +Its arguments are: +.Bl -tag -width offset +.It Fa fd +The file descriptor of the file. +.It Fa mode +The operation to be performed on the given range. +.It Fa offset +The start of the range to be manipulated in the file. +.It Fa len +The length of the range to be manipulated in the file. +.El +.Pp +The +.Fa mode +argument specifies operation to be performed. +Supported modes are: +.Bl -tag -width offset +.It Fa 0 +Allocate the disk space. +Required storage for the range +.Fa offset +to +.Fa offset + +.Fa len +in the file referenced by +.Fa fd +is guaranteed to be allocated upon successful return. +That is, if +.Fn fallocate +returns successfully, subsequent writes to the specified file data +will not fail due to lack of free space on the file system storage +media. +Any existing file data in the specified range is unmodified. +If +.Fa offset + +.Fa len +is beyond the current file size, then +.Fn fallocate +will adjust the file size to +.Fa offset + +.Fa len . +Otherwise, the file size will not be changed. 
+.Pp +Space allocated by +.Fn fallocate +will be freed by a successful call to +.Xr creat 2 +or +.Xr open 2 +that truncates the size of the file. +Space allocated via +.Fn fallocate +may be freed by a successful call to +.Xr ftruncate 2 +that reduces the file size to a size smaller than +.Fa offset + +.Fa len . +.It Dv FALLOC_FL_PUNCH_HOLE +De-allocate the disk space, i.e. create a hole. +The file's size is not changed. +Upon successful completion of the operation subsequent reading from the +processed range is guaranteed to return zeroes. +This flag must always be ORed with the +.Dv FALLOC_FL_KEEP_SIZE +flag described below. +.It Dv FALLOC_FL_KEEP_SIZE +Do not extend the file even if the range defined by the +.Fa offset +and +.Fa len +arguments lies partly or entirely outside of the current file size limits. +When the flag is set, only the portion of the range that is within the current +file size limits is processed and the remainder is ignored. +.El +.Sh RETURN VALUES +If successful, +.Fn fallocate +returns zero. +It returns -1 on failure, and sets +.Va errno +to indicate the error. +.Sh ERRORS +Possible failure conditions: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fd +argument is not a valid file descriptor. +.It Bq Er EBADF +The +.Fa fd +argument references a file that was opened without write permission. +.It Bq Er EFBIG +The value of +.Fa offset + +.Fa len +is greater than the maximum file size. +.It Bq Er EINTR +A signal was caught during execution. +.It Bq Er EINVAL +The +.Fa len +argument was less than or equal to zero or the +.Fa offset +argument was less than zero. +.It Bq Er EIO +An I/O error occurred while reading from or writing to a file system. +.It Bq Er ENODEV +The +.Fa fd +argument does not refer to a regular file. +.It Bq Er ENOSPC +There is insufficient free space remaining on the file system storage +media. +.It Bq Er ESPIPE +The +.Fa fd +argument is associated with a pipe or FIFO. 
+.It Bq Er EOPNOTSUPP +The +.Fa mode +argument is +.Dv FALLOC_FL_PUNCH_HOLE +but the block deallocation operation is not supported by the underlying +filesystem. +.It Bq Er EINVAL +The +.Fa mode +argument is +.Dv FALLOC_FL_PUNCH_HOLE +but the +.Dv FALLOC_FL_KEEP_SIZE +is not set. +.El +.Sh SEE ALSO +.Xr creat 2 , +.Xr ftruncate 2 , +.Xr open 2 , +.Xr unlink 2 , +.Xr posix_fallocate 2 +.Sh STANDARDS +The +.Fn fallocate +is non portable. +It is provided for compatibility with Linux. +.Sh HISTORY +The +.Fn fallocate +function appeared in +.Fx 11.0 . +.Sh AUTHORS +.Fn fallocate +and this manual page were initially written by +.An Matthew Fleming Aq Mt mdf@FreeBSD.org for the +.Xr posix_fallocate 2 +and substantially edited by +.An Maxim Sobolev Aq Mt sobomax@FreeBSD.org . Index: share/man/man9/VOP_ALLOCATE.9 =================================================================== --- share/man/man9/VOP_ALLOCATE.9 +++ share/man/man9/VOP_ALLOCATE.9 @@ -32,36 +32,94 @@ .Os .Sh NAME .Nm VOP_ALLOCATE -.Nd allocate storage for a file +.Nd manage storage for a file .Sh SYNOPSIS .In sys/param.h .In sys/vnode.h .Ft int -.Fn VOP_ALLOCATE "struct vnode *vp" "off_t *offset" "off_t *len" +.Fn VOP_ALLOCATE "struct vnode *vp" "struct ucred *cred" "int mode" "off_t *offset" "off_t *len" .Sh DESCRIPTION -This call allocates storage for a range of offsets in a file. +This call manipulates allocated storage for a range of offsets in a file. It is used to implement the -.Xr posix_fallocate -system call. +.Xr posix_fallocate 2 +and +.Xr fallocate 2 +system calls. .Pp Its arguments are: .Bl -tag -width offset .It Fa vp The vnode of the file. +.It Fa cred +The caller's credentials. +.It Fa mode +The operation to be performed on the given range. .It Fa offset -The start of the range to allocate storage for in the file. +The start of the range to be manipulated in the file. .It Fa len -The length of the range to allocate storage for in the file. 
+The length of the range to be manipulated in the file. .El .Pp The +.Fa mode +argument specifies the operation to be performed. +Supported modes are: +.Bl -tag -width offset +.It Dv VNA_OP_ALLOC +Allocate the disk space. +The file's contents are not changed. +Upon successful completion of the operation subsequent writes into the +processed range are guaranteed not to fail due to insufficient disk +space. +A partial allocation is considered a successful operation. +The .Fa offset and .Fa len arguments are updated to reflect the portion of the range that -still needs to be allocated on return. -A partial allocation is considered a successful operation. -The file's contents are not changed. +has not been processed yet on return. +.It Dv VNA_OP_DEALLOC +De-allocate the disk space, i.e. create a hole. +The file's size is not changed. +Upon successful completion of the operation subsequent reading from the +processed range is guaranteed to return zeroes. +Partial completion is only permitted on some fatal error, in which case both +.Fa offset +and +.Fa len +shall be updated to reflect the portion of the original range +that has not been processed and a proper error code is returned by +.Fn VOP_ALLOCATE . +Whether or not disk space is actually reclaimed may depend +on file system implementation details, its configuration, alignment of +the range with respect to the physical block(s), etc. +.El +.Pp +In addition to the primary operation mode +.Po i.e. +.Dv VNA_OP_ALLOC +or +.Dv VNA_OP_DEALLOC +.Pc +extra flags may be ORed into the +.Fa mode +parameter to further refine +.Fn VOP_ALLOCATE +behaviour. +Supported flags are: +.Bl -tag -width offset +.It Dv VNA_KEEP_SIZE +Do not extend the file even if the range defined by the +.Fa offset +and +.Fa len +arguments lies partly or entirely outside of the current file size limits. +Relevant for the +.Dv VNA_OP_ALLOC +mode. 
+When the flag is set, only portion of the range that is within current file +size limits is processed and remainder is ignored. +.El .Sh LOCKS The file should be exclusively locked on entry and will still be locked on exit. .Sh RETURN VALUES @@ -77,6 +135,12 @@ .It Bq Er EPERM An append-only flag is set on the file, but the caller is attempting to write before the current end of file. +.It Bq Er EOPNOTSUPP +The requested operation is not supported by the underlying filesystem +.Po +.Dv VNA_OP_DEALLOC +mode only +.Pc . .El .Sh SEE ALSO .Xr vnode 9 , Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -316,7 +316,8 @@ extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], uint64_t [2], boolean_t); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); -extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); +extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t, + cred_t *); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -1853,7 +1853,7 @@ /* we can't hold any locks when calling zfs_freesp() */ zfs_dirent_unlock(dl); dl = NULL; - error = zfs_freesp(zp, 0, 0, mode, TRUE); + error = zfs_freesp(zp, 0, 0, mode, TRUE, cr); if (error == 0) { vnevent_create(ZTOV(zp), ct); } @@ -3165,7 +3165,7 @@ * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? 
*/ - err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); @@ -5154,19 +5154,16 @@ return (0); } +#endif /* illumos */ + /* - * Free or allocate space in a file. Currently, this function only - * supports the `F_FREESP' command. However, this command is somewhat - * misnamed, as its functionality includes the ability to allocate as - * well as free space. + * Free space in a file. * * IN: vp - vnode of file to free data in. - * cmd - action to take (only F_FREESP supported). - * bfp - section of file to free/alloc. + * off - start of the range to free. + * len - length of the range to free. * flag - current file open mode flags. - * offset - current file offset. - * cr - credentials of caller [UNUSED]. - * ct - caller context. + * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * @@ -5175,22 +5172,15 @@ */ /* ARGSUSED */ static int -zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, - offset_t offset, cred_t *cr, caller_context_t *ct) +zfs_freespace(vnode_t *vp, uint64_t off, uint64_t len, int flag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. 
snapshots) our * callers might not be able to detect properly that we are read-only, @@ -5201,6 +5191,7 @@ return (SET_ERROR(EROFS)); } +#ifdef illumos if (error = convoff(vp, bfp, 0, offset)) { ZFS_EXIT(zfsvfs); return (error); @@ -5213,13 +5204,18 @@ off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ +#else + if (off + len > zp->z_size) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENXIO)); + } +#endif - error = zfs_freesp(zp, off, len, flag, TRUE); + error = zfs_freesp(zp, off, len, flag, TRUE, cr); ZFS_EXIT(zfsvfs); return (error); } -#endif /* illumos */ CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); @@ -6056,6 +6052,26 @@ } static int +zfs_freebsd_allocate(ap) + struct vop_allocate_args *ap; +{ + int error; + uint64_t off, len; + + if ((ap->a_mode & VNA_OP_DEALLOC) == 0) + return (vop_stdallocate(ap)); + + off = *ap->a_offset; + len = *ap->a_len; + error = zfs_freespace(ap->a_vp, off, len, FWRITE | FOFFMAX, ap->a_cred); + if (error == 0) { + *ap->a_offset += len; + *ap->a_len = 0; + } + return (error); +} + +static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; @@ -7209,6 +7225,7 @@ .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, + .vop_allocate = zfs_freebsd_allocate, }; struct vop_vector zfs_fifoops = { Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -1727,7 +1727,8 @@ * RETURN: 0 on success, error code on failure */ int -zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log, + cred_t *cr) { vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; @@ -1784,6 +1785,25 @@ 
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); + + /* + * File content is to be updated, so clear the Set-UID/Set-GID bits + * if not privileged and at least one of the execute bits is set. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(vp, cr, + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); Index: sys/compat/cloudabi/cloudabi_file.c =================================================================== --- sys/compat/cloudabi/cloudabi_file.c +++ sys/compat/cloudabi/cloudabi_file.c @@ -135,7 +135,7 @@ struct cloudabi_sys_file_allocate_args *uap) { - return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len)); + return (kern_fallocate(td, uap->fd, 0, uap->offset, uap->len)); } int Index: sys/compat/freebsd32/freebsd32_misc.c =================================================================== --- sys/compat/freebsd32/freebsd32_misc.c +++ sys/compat/freebsd32/freebsd32_misc.c @@ -2966,11 +2966,27 @@ struct freebsd32_posix_fallocate_args *uap) { - td->td_retval[0] = kern_posix_fallocate(td, uap->fd, + td->td_retval[0] = kern_fallocate(td, uap->fd, 0, PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len)); return (0); } +/* + * Please note that this function has different return value conventions from + * the posix_fallocate() function above. Hence the difference in the error + * handling. 
+ */ +int +freebsd32_fallocate(struct thread *td, + struct freebsd32_fallocate_args *uap) +{ + int error; + + error = kern_fallocate(td, uap->fd, uap->mode, + PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len)); + return (error); +} + int freebsd32_posix_fadvise(struct thread *td, struct freebsd32_posix_fadvise_args *uap) Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master +++ sys/compat/freebsd32/syscalls.master @@ -1080,3 +1080,6 @@ 549 AUE_NULL NOPROTO { int numa_setaffinity(cpuwhich_t which, \ id_t id, \ const struct vm_domain_policy *policy); } +550 AUE_NULL STD { int freebsd32_fallocate(int fd, int mode, \ + uint32_t offset1, uint32_t offset2,\ + uint32_t len1, uint32_t len2); } Index: sys/compat/linux/linux_file.c =================================================================== --- sys/compat/linux/linux_file.c +++ sys/compat/linux/linux_file.c @@ -1647,6 +1647,5 @@ if (args->mode != 0) return (ENOSYS); - return (kern_posix_fallocate(td, args->fd, args->offset, - args->len)); + return (kern_fallocate(td, args->fd, 0, args->offset, args->len)); } Index: sys/dev/md/md.c =================================================================== --- sys/dev/md/md.c +++ sys/dev/md/md.c @@ -231,6 +231,7 @@ struct vnode *vnode; char file[PATH_MAX]; struct ucred *cred; + int cannot_dealloc; /* MD_SWAP related fields */ vm_object_t object; @@ -836,7 +837,7 @@ struct buf *pb; bus_dma_segment_t *vlist; struct thread *td; - off_t len, zerosize; + off_t offset, len, zerosize; int ma_offs; switch (bp->bio_cmd) { @@ -876,6 +877,38 @@ return (error); } + /* + * If it's DELETE, first try it via native VOP_ALLOCATE(VNA_OP_DEALLOC). 
+ */ + if (bp->bio_cmd == BIO_DELETE && sc->cannot_dealloc == 0) { + offset = bp->bio_offset; + len = bp->bio_length; + (void) vn_start_write(vp, &mp, V_WAIT); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_ALLOCATE(vp, sc->cred, VNA_OP_DEALLOC, &offset, + &len); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + if (error != EOPNOTSUPP) { + /* + * Request partially done, should not happen + * for the VNA_OP_DEALLOC. + */ + KASSERT(!(error == 0 && len > 0), + ("VOP_ALLOCATE(mode=VNA_OP_DEALLOC): partially " + "completed, offset + len changed from %jx/%jx to " + "%jx/%jx", bp->bio_offset, bp->bio_length, offset, + len)); + bp->bio_resid = len; + return (error); + } + /* + * VOP_ALLOCATE(mode=VNA_OP_DEALLOC) is not supported, fall + * through to the emulated case. + */ + sc->cannot_dealloc = 1; + } + auio.uio_offset = (vm_ooffset_t)bp->bio_offset; auio.uio_resid = bp->bio_length; auio.uio_segflg = UIO_SYSSPACE; Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -994,6 +994,8 @@ 549 AUE_NULL STD { int numa_setaffinity(cpuwhich_t which, \ id_t id, \ const struct vm_domain_policy_entry *policy); } +550 AUE_NULL STD { int fallocate(int fd, int mode, \ + off_t offset, off_t len); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -917,7 +917,16 @@ struct thread *td; struct vnode *vp; size_t iosize; - int error; + int error, op; + + op = ap->a_mode & VNA_OP_MASK; + KASSERT(((op == VNA_OP_ALLOC) || (op == VNA_OP_DEALLOC)) && + (ap->a_mode & ~VNA_ALL_MFLAGS) == 0, ("a_mode is bogus %d", + ap->a_mode)); + + if (op != VNA_OP_ALLOC) { + return (EOPNOTSUPP); + } buf = NULL; error = 0; @@ -955,19 +964,32 @@ } else #endif if (offset + len > 
vap->va_size) { - /* - * Test offset + len against the filesystem's maxfilesize. - */ - VATTR_NULL(vap); - vap->va_size = offset + len; - error = VOP_SETATTR(vp, vap, td->td_ucred); - if (error != 0) - goto out; - VATTR_NULL(vap); - vap->va_size = fsize; - error = VOP_SETATTR(vp, vap, td->td_ucred); - if (error != 0) - goto out; + if ((ap->a_mode & VNA_KEEP_SIZE) == 0) { + /* + * Test offset + len against the filesystem's maxfilesize. + */ + VATTR_NULL(vap); + vap->va_size = offset + len; + error = VOP_SETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + VATTR_NULL(vap); + vap->va_size = fsize; + error = VOP_SETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + } else { + if (offset >= vap->va_size) { + /* + * Range outside of file limits and + * VNA_KEEP_SIZE is set; not an error, just + * nothing to do. + */ + error = 0; + goto out; + } + len = vap->va_size - offset; + } } for (;;) { Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -4457,14 +4457,14 @@ } int -kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) +kern_fallocate(struct thread *td, int fd, int mode, off_t offset, off_t len) { struct file *fp; struct mount *mp; struct vnode *vp; cap_rights_t rights; off_t olen, ooffset; - int error; + int error, vop_mode; if (offset < 0 || len <= 0) return (EINVAL); @@ -4491,6 +4491,16 @@ error = ENODEV; goto out; } + if (mode == (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) { + vop_mode = VNA_OP_DEALLOC | VNA_KEEP_SIZE; + } else if (mode == FALLOC_FL_KEEP_SIZE) { + vop_mode = VNA_OP_ALLOC | VNA_KEEP_SIZE; + } else if (mode == 0) { + vop_mode = VNA_OP_ALLOC; + } else { + error = EINVAL; + goto out; + } /* Allocating blocks may take a long time, so iterate. 
*/ for (;;) { @@ -4511,7 +4521,8 @@ error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif - error = VOP_ALLOCATE(vp, &offset, &len); + error = VOP_ALLOCATE(vp, td->td_ucred, vop_mode, + &offset, &len); VOP_UNLOCK(vp, 0); vn_finished_write(mp); @@ -4533,12 +4544,27 @@ sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) { - td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset, + td->td_retval[0] = kern_fallocate(td, uap->fd, 0, uap->offset, uap->len); return (0); } /* + * Please note that this function has different return value conventions from + * the posix_fallocate() function above. Hence the difference in the error + * handling. + */ +int +sys_fallocate(struct thread *td, struct fallocate_args *uap) +{ + int error; + + error = kern_fallocate(td, uap->fd, uap->mode, uap->offset, + uap->len); + return (error); +} + +/* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -636,6 +636,8 @@ vop_allocate { IN struct vnode *vp; + IN struct ucred *cred; + IN int mode; INOUT off_t *offset; INOUT off_t *len; }; Index: sys/sys/fcntl.h =================================================================== --- sys/sys/fcntl.h +++ sys/sys/fcntl.h @@ -311,6 +311,7 @@ int fcntl(int, int, ...); #if __BSD_VISIBLE int flock(int, int); +int fallocate(int, int, off_t, off_t); #endif #if __POSIX_VISIBLE >= 200809 int openat(int, const char *, int, ...); @@ -322,4 +323,8 @@ __END_DECLS #endif +#if __BSD_VISIBLE +#define FALLOC_FL_KEEP_SIZE (1 << 0) +#define FALLOC_FL_PUNCH_HOLE (1 << 1) +#endif #endif /* !_SYS_FCNTL_H_ */ Index: sys/sys/syscallsubr.h =================================================================== --- 
sys/sys/syscallsubr.h +++ sys/sys/syscallsubr.h @@ -160,7 +160,7 @@ struct timespec *tsp, sigset_t *uset); int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice); -int kern_posix_fallocate(struct thread *td, int fd, off_t offset, +int kern_fallocate(struct thread *td, int fd, int mode, off_t offset, off_t len); int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com, void *data); Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -340,6 +340,16 @@ #define VVERIFY 001000000000 /* verification required */ /* + * Mode flags for the VOP_ALLOCATE() + */ +#define VNA_OP_ALLOC (1 << 0) /* Allocate space */ +#define VNA_OP_DEALLOC (1 << 1) /* Deallocate space */ +#define VNA_KEEP_SIZE (1 << 2) /* Do not expand file size */ + +#define VNA_OP_MASK (VNA_OP_ALLOC | VNA_OP_DEALLOC) +#define VNA_ALL_MFLAGS (VNA_OP_ALLOC | VNA_OP_DEALLOC | VNA_KEEP_SIZE) + +/* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \