[RFC PATCH v2 13/21] fuse-bpf: Add support for read/write iter
From: Daniel Rosenberg
Date: Mon Nov 21 2022 - 21:18:28 EST
This includes adjustments from Amir Goldstein's patch to FUSE
Passthrough
Signed-off-by: Daniel Rosenberg <drosen@xxxxxxxxxx>
Signed-off-by: Paul Lawrence <paullawrence@xxxxxxxxxx>
---
fs/fuse/backing.c | 377 ++++++++++++++++++++++++++++++++++++++++++++++
fs/fuse/control.c | 2 +-
fs/fuse/file.c | 8 +
fs/fuse/fuse_i.h | 19 ++-
fs/fuse/inode.c | 13 ++
5 files changed, 417 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
index a7505d6887e0..425815d7f5dc 100644
--- a/fs/fuse/backing.c
+++ b/fs/fuse/backing.c
@@ -10,6 +10,7 @@
#include <linux/file.h>
#include <linux/fs_stack.h>
#include <linux/namei.h>
+#include <linux/uio.h>
/*
* expression statement to wrap the backing filter logic
@@ -74,6 +75,89 @@
handled; \
})
+#define FUSE_BPF_IOCB_MASK (IOCB_APPEND | IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC)
+
+struct fuse_bpf_aio_req {
+ struct kiocb iocb;
+ refcount_t ref;
+ struct kiocb *iocb_orig;
+ struct timespec64 pre_atime;
+};
+
+static struct kmem_cache *fuse_bpf_aio_request_cachep;
+
+static void fuse_file_accessed(struct file *dst_file, struct file *src_file)
+{
+ struct inode *dst_inode;
+ struct inode *src_inode;
+
+ if (dst_file->f_flags & O_NOATIME)
+ return;
+
+ dst_inode = file_inode(dst_file);
+ src_inode = file_inode(src_file);
+
+ if ((!timespec64_equal(&dst_inode->i_mtime, &src_inode->i_mtime) ||
+ !timespec64_equal(&dst_inode->i_ctime, &src_inode->i_ctime))) {
+ dst_inode->i_mtime = src_inode->i_mtime;
+ dst_inode->i_ctime = src_inode->i_ctime;
+ }
+
+ touch_atime(&dst_file->f_path);
+}
+
+static void fuse_copyattr(struct file *dst_file, struct file *src_file)
+{
+ struct inode *dst = file_inode(dst_file);
+ struct inode *src = file_inode(src_file);
+
+ dst->i_atime = src->i_atime;
+ dst->i_mtime = src->i_mtime;
+ dst->i_ctime = src->i_ctime;
+ i_size_write(dst, i_size_read(src));
+ fuse_invalidate_attr(dst);
+}
+
+static void fuse_file_start_write(struct file *fuse_file, struct file *backing_file,
+ loff_t pos, size_t count)
+{
+ struct inode *inode = file_inode(fuse_file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ if (inode->i_size < pos + count)
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ file_start_write(backing_file);
+}
+
+static void fuse_file_end_write(struct file *fuse_file, struct file *backing_file,
+ loff_t pos, size_t res)
+{
+ struct inode *inode = file_inode(fuse_file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ file_end_write(backing_file);
+
+ if (res > 0)
+ fuse_write_update_attr(inode, pos, res);
+
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ fuse_invalidate_attr(inode);
+}
+
+static void fuse_file_start_read(struct file *backing_file, struct timespec64 *pre_atime)
+{
+ *pre_atime = file_inode(backing_file)->i_atime;
+}
+
+static void fuse_file_end_read(struct file *fuse_file, struct file *backing_file,
+ struct timespec64 *pre_atime)
+{
+ /* Mimic atime update policy of passthrough inode, not the value */
+ if (!timespec64_equal(&file_inode(backing_file)->i_atime, pre_atime))
+ fuse_invalidate_atime(file_inode(fuse_file));
+}
+
static void fuse_get_backing_path(struct file *file, struct path *path)
{
path_get(&file->f_path);
@@ -656,6 +740,283 @@ int fuse_bpf_lseek(loff_t *out, struct inode *inode, struct file *file, loff_t o
file, offset, whence);
}
+static inline void fuse_bpf_aio_put(struct fuse_bpf_aio_req *aio_req)
+{
+ if (refcount_dec_and_test(&aio_req->ref))
+ kmem_cache_free(fuse_bpf_aio_request_cachep, aio_req);
+}
+
+static void fuse_bpf_aio_cleanup_handler(struct fuse_bpf_aio_req *aio_req, long res)
+{
+ struct kiocb *iocb = &aio_req->iocb;
+ struct kiocb *iocb_orig = aio_req->iocb_orig;
+ struct file *filp = iocb->ki_filp;
+ struct file *fuse_filp = iocb_orig->ki_filp;
+
+ if (iocb->ki_flags & IOCB_WRITE) {
+ __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
+ SB_FREEZE_WRITE);
+ fuse_file_end_write(iocb_orig->ki_filp, iocb->ki_filp, iocb->ki_pos, res);
+ } else {
+ fuse_file_end_read(fuse_filp, filp, &aio_req->pre_atime);
+ }
+ iocb_orig->ki_pos = iocb->ki_pos;
+ fuse_bpf_aio_put(aio_req);
+}
+
+static void fuse_bpf_aio_rw_complete(struct kiocb *iocb, long res)
+{
+ struct fuse_bpf_aio_req *aio_req =
+ container_of(iocb, struct fuse_bpf_aio_req, iocb);
+ struct kiocb *iocb_orig = aio_req->iocb_orig;
+
+ fuse_bpf_aio_cleanup_handler(aio_req, res);
+ iocb_orig->ki_complete(iocb_orig, res);
+}
+
+struct fuse_read_iter_out {
+ uint64_t ret;
+};
+struct fuse_file_read_iter_io {
+ struct fuse_read_in fri;
+ struct fuse_read_iter_out frio;
+};
+
+static int fuse_file_read_iter_initialize_in(struct fuse_args *fa, struct fuse_file_read_iter_io *fri,
+ struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ fri->fri = (struct fuse_read_in) {
+ .fh = ff->fh,
+ .offset = iocb->ki_pos,
+ .size = to->count,
+ };
+
+ /* TODO we can't assume 'to' is a kvec */
+ /* TODO we also can't assume the vector has only one component */
+ *fa = (struct fuse_args) {
+ .opcode = FUSE_READ,
+ .nodeid = ff->nodeid,
+ .in_numargs = 1,
+ .in_args[0].size = sizeof(fri->fri),
+ .in_args[0].value = &fri->fri,
+ /*
+ * TODO Design this properly.
+ * Possible approach: do not pass buf to bpf
+ * If going to userland, do a deep copy
+ * For extra credit, do that to/from the vector, rather than
+ * making an extra copy in the kernel
+ */
+ };
+
+ return 0;
+}
+
+static int fuse_file_read_iter_initialize_out(struct fuse_args *fa, struct fuse_file_read_iter_io *fri,
+ struct kiocb *iocb, struct iov_iter *to)
+{
+ fri->frio = (struct fuse_read_iter_out) {
+ .ret = fri->fri.size,
+ };
+
+ fa->out_numargs = 1;
+ fa->out_args[0].size = sizeof(fri->frio);
+ fa->out_args[0].value = &fri->frio;
+
+ return 0;
+}
+
+static int fuse_file_read_iter_backing(struct fuse_args *fa, ssize_t *out,
+ struct kiocb *iocb, struct iov_iter *to)
+{
+ struct fuse_read_iter_out *frio = fa->out_args[0].value;
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ if (!iov_iter_count(to))
+ return 0;
+
+ if ((iocb->ki_flags & IOCB_DIRECT) &&
+ (!ff->backing_file->f_mapping->a_ops ||
+ !ff->backing_file->f_mapping->a_ops->direct_IO))
+ return -EINVAL;
+
+ /* TODO This just plain ignores any change to fuse_read_in */
+ if (is_sync_kiocb(iocb)) {
+ struct timespec64 pre_atime;
+
+ fuse_file_start_read(ff->backing_file, &pre_atime);
+ *out = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos,
+ iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK));
+ fuse_file_end_read(file, ff->backing_file, &pre_atime);
+ } else {
+ struct fuse_bpf_aio_req *aio_req;
+
+ *out = -ENOMEM;
+ aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ aio_req->iocb_orig = iocb;
+ fuse_file_start_read(ff->backing_file, &aio_req->pre_atime);
+ kiocb_clone(&aio_req->iocb, iocb, ff->backing_file);
+ aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
+ *out = vfs_iocb_iter_read(ff->backing_file, &aio_req->iocb, to);
+ fuse_bpf_aio_put(aio_req);
+ if (*out != -EIOCBQUEUED)
+ fuse_bpf_aio_cleanup_handler(aio_req, *out);
+ }
+
+ frio->ret = *out;
+
+ /* TODO Need to point value at the buffer for post-modification */
+
+out:
+ fuse_file_accessed(file, ff->backing_file);
+
+ return *out;
+}
+
+static int fuse_file_read_iter_finalize(struct fuse_args *fa, ssize_t *out,
+ struct kiocb *iocb, struct iov_iter *to)
+{
+ struct fuse_read_iter_out *frio = fa->out_args[0].value;
+
+ *out = frio->ret;
+
+ return 0;
+}
+
+int fuse_bpf_file_read_iter(ssize_t *out, struct inode *inode, struct kiocb *iocb, struct iov_iter *to)
+{
+ return fuse_bpf_backing(inode, struct fuse_file_read_iter_io, out,
+ fuse_file_read_iter_initialize_in,
+ fuse_file_read_iter_initialize_out,
+ fuse_file_read_iter_backing,
+ fuse_file_read_iter_finalize,
+ iocb, to);
+}
+
+struct fuse_write_iter_out {
+ uint64_t ret;
+};
+struct fuse_file_write_iter_io {
+ struct fuse_write_in fwi;
+ struct fuse_write_out fwo;
+ struct fuse_write_iter_out fwio;
+};
+
+static int fuse_file_write_iter_initialize_in(struct fuse_args *fa,
+ struct fuse_file_write_iter_io *fwio,
+ struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+
+ *fwio = (struct fuse_file_write_iter_io) {
+ .fwi.fh = ff->fh,
+ .fwi.offset = iocb->ki_pos,
+ .fwi.size = from->count,
+ };
+
+ /* TODO we can't assume 'from' is a kvec */
+ *fa = (struct fuse_args) {
+ .opcode = FUSE_WRITE,
+ .nodeid = ff->nodeid,
+ .in_numargs = 2,
+ .in_args[0].size = sizeof(fwio->fwi),
+ .in_args[0].value = &fwio->fwi,
+ .in_args[1].size = fwio->fwi.size,
+ .in_args[1].value = from->kvec->iov_base,
+ };
+
+ return 0;
+}
+
+static int fuse_file_write_iter_initialize_out(struct fuse_args *fa,
+ struct fuse_file_write_iter_io *fwio,
+ struct kiocb *iocb, struct iov_iter *from)
+{
+ /* TODO we can't assume 'from' is a kvec */
+ fa->out_numargs = 1;
+ fa->out_args[0].size = sizeof(fwio->fwio);
+ fa->out_args[0].value = &fwio->fwio;
+
+ return 0;
+}
+
+static int fuse_file_write_iter_backing(struct fuse_args *fa, ssize_t *out,
+ struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_write_iter_out *fwio = fa->out_args[0].value;
+ ssize_t count = iov_iter_count(from);
+
+ if (!count)
+ return 0;
+
+ /* TODO This just plain ignores any change to fuse_write_in */
+ /* TODO uint32_t seems smaller than ssize_t.... right? */
+ inode_lock(file_inode(file));
+
+ fuse_copyattr(file, ff->backing_file);
+
+ if (is_sync_kiocb(iocb)) {
+ fuse_file_start_write(file, ff->backing_file, iocb->ki_pos, count);
+ *out = vfs_iter_write(ff->backing_file, from, &iocb->ki_pos,
+ iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK));
+ fuse_file_end_write(file, ff->backing_file, iocb->ki_pos, *out);
+ } else {
+ struct fuse_bpf_aio_req *aio_req;
+
+ *out = -ENOMEM;
+ aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL);
+ if (!aio_req)
+ goto out;
+
+ fuse_file_start_write(file, ff->backing_file, iocb->ki_pos, count);
+ __sb_writers_release(file_inode(ff->backing_file)->i_sb, SB_FREEZE_WRITE);
+ aio_req->iocb_orig = iocb;
+ kiocb_clone(&aio_req->iocb, iocb, ff->backing_file);
+ aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete;
+ refcount_set(&aio_req->ref, 2);
+ *out = vfs_iocb_iter_write(ff->backing_file, &aio_req->iocb, from);
+ fuse_bpf_aio_put(aio_req);
+ if (*out != -EIOCBQUEUED)
+ fuse_bpf_aio_cleanup_handler(aio_req, *out);
+ }
+
+out:
+ inode_unlock(file_inode(file));
+ fwio->ret = *out;
+ if (*out < 0)
+ return *out;
+ return 0;
+}
+
+static int fuse_file_write_iter_finalize(struct fuse_args *fa, ssize_t *out,
+ struct kiocb *iocb, struct iov_iter *from)
+{
+ struct fuse_write_iter_out *fwio = fa->out_args[0].value;
+
+ *out = fwio->ret;
+ return 0;
+}
+
+int fuse_bpf_file_write_iter(ssize_t *out, struct inode *inode, struct kiocb *iocb, struct iov_iter *from)
+{
+ return fuse_bpf_backing(inode, struct fuse_file_write_iter_io, out,
+ fuse_file_write_iter_initialize_in,
+ fuse_file_write_iter_initialize_out,
+ fuse_file_write_iter_backing,
+ fuse_file_write_iter_finalize,
+ iocb, from);
+}
+
ssize_t fuse_backing_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret;
@@ -1280,3 +1641,19 @@ int fuse_bpf_access(int *out, struct inode *inode, int mask)
fuse_access_initialize_in, fuse_access_initialize_out,
fuse_access_backing, fuse_access_finalize, inode, mask);
}
+
+int __init fuse_bpf_init(void)
+{
+ fuse_bpf_aio_request_cachep = kmem_cache_create("fuse_bpf_aio_req",
+ sizeof(struct fuse_bpf_aio_req),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!fuse_bpf_aio_request_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __exit fuse_bpf_cleanup(void)
+{
+ kmem_cache_destroy(fuse_bpf_aio_request_cachep);
+}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 247ef4f76761..685552453751 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -378,7 +378,7 @@ int __init fuse_ctl_init(void)
return register_filesystem(&fuse_ctl_fs_type);
}
-void __exit fuse_ctl_cleanup(void)
+void fuse_ctl_cleanup(void)
{
unregister_filesystem(&fuse_ctl_fs_type);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 70a5bd5403ca..59f3d85106d3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1604,6 +1604,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
+ ssize_t ret;
if (fuse_is_bad(inode))
return -EIO;
@@ -1611,6 +1612,9 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (FUSE_IS_DAX(inode))
return fuse_dax_read_iter(iocb, to);
+ if (fuse_bpf_file_read_iter(&ret, inode, iocb, to))
+ return ret;
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
@@ -1622,6 +1626,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
+ ssize_t ret = 0;
if (fuse_is_bad(inode))
return -EIO;
@@ -1629,6 +1634,9 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (FUSE_IS_DAX(inode))
return fuse_dax_write_iter(iocb, from);
+ if (fuse_bpf_file_write_iter(&ret, inode, iocb, from))
+ return ret;
+
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_write_iter(iocb, from);
else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index dc5bba2a75ab..25cedaa9014c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1130,7 +1130,7 @@ int fuse_dev_init(void);
void fuse_dev_cleanup(void);
int fuse_ctl_init(void);
-void __exit fuse_ctl_cleanup(void);
+void fuse_ctl_cleanup(void);
/**
* Simple request sending that does request allocation and freeing
@@ -1410,6 +1410,8 @@ int fuse_bpf_unlink(int *out, struct inode *dir, struct dentry *entry);
int fuse_bpf_release(int *out, struct inode *inode, struct file *file);
int fuse_bpf_releasedir(int *out, struct inode *inode, struct file *file);
int fuse_bpf_lseek(loff_t *out, struct inode *inode, struct file *file, loff_t offset, int whence);
+int fuse_bpf_file_read_iter(ssize_t *out, struct inode *inode, struct kiocb *iocb, struct iov_iter *to);
+int fuse_bpf_file_write_iter(ssize_t *out, struct inode *inode, struct kiocb *iocb, struct iov_iter *from);
int fuse_bpf_file_fallocate(int *out, struct inode *inode, struct file *file, int mode, loff_t offset, loff_t length);
int fuse_bpf_lookup(struct dentry **out, struct inode *dir, struct dentry *entry, unsigned int flags);
int fuse_bpf_access(int *out, struct inode *inode, int mask);
@@ -1462,6 +1464,16 @@ static inline int fuse_bpf_lseek(loff_t *out, struct inode *inode, struct file *
return 0;
}
+static inline int fuse_bpf_file_read_iter(ssize_t *out, struct inode *inode, struct kiocb *iocb, struct iov_iter *to)
+{
+ return 0;
+}
+
+static inline int fuse_bpf_file_write_iter(ssize_t *out, struct inode *inode, struct kiocb *iocb, struct iov_iter *from)
+{
+ return 0;
+}
+
static inline int fuse_bpf_file_fallocate(int *out, struct inode *inode, struct file *file, int mode, loff_t offset, loff_t length)
{
return 0;
@@ -1512,4 +1524,9 @@ static inline u64 attr_timeout(struct fuse_attr_out *o)
return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
}
+#ifdef CONFIG_FUSE_BPF
+int __init fuse_bpf_init(void);
+void __exit fuse_bpf_cleanup(void);
+#endif /* CONFIG_FUSE_BPF */
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bafb2832627d..9781faff6df6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -2094,11 +2094,21 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+#ifdef CONFIG_FUSE_BPF
+ res = fuse_bpf_init();
+ if (res)
+ goto err_ctl_cleanup;
+#endif
+
sanitize_global_limit(&max_user_bgreq);
sanitize_global_limit(&max_user_congthresh);
return 0;
+#ifdef CONFIG_FUSE_BPF
+ err_ctl_cleanup:
+ fuse_ctl_cleanup();
+#endif
err_sysfs_cleanup:
fuse_sysfs_cleanup();
err_dev_cleanup:
@@ -2116,6 +2126,9 @@ static void __exit fuse_exit(void)
fuse_ctl_cleanup();
fuse_sysfs_cleanup();
fuse_fs_cleanup();
+#ifdef CONFIG_FUSE_BPF
+ fuse_bpf_cleanup();
+#endif
fuse_dev_cleanup();
}
--
2.38.1.584.g0f3c55d4c2-goog