[PATCH v5 03/10] block: add emulation for copy

From: Nitesh Shetty
Date: Wed Nov 23 2022 - 01:13:54 EST


For devices that do not support copy offload, copy emulation is added.
Copy emulation is implemented by reading from the source ranges into
memory and writing to the corresponding destinations asynchronously.
For zoned devices we maintain a linked list of read submissions and try to
submit the corresponding writes in the same order.
Emulation is also used if copy offload fails or only partially completes.

Signed-off-by: Nitesh Shetty <nj.shetty@xxxxxxxxxxx>
Signed-off-by: Vincent Fu <vincent.fu@xxxxxxxxxxx>
Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
---
block/blk-lib.c | 241 ++++++++++++++++++++++++++++++++++++++++-
block/blk-map.c | 4 +-
include/linux/blkdev.h | 3 +
3 files changed, 245 insertions(+), 3 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2ce3c872ca49..43b1d0ef5732 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -428,6 +428,239 @@ static inline int blk_copy_sanity_check(struct block_device *src_bdev,
return 0;
}

+/*
+ * Allocate a bounce buffer of up to @req_size bytes with kvmalloc().
+ *
+ * The full request is tried first. Each time the allocation fails the size
+ * is halved and rounded down to a whole number of pages, until halving would
+ * drop below one page. Rounding to pages keeps every retried length a
+ * multiple of the sector size, which plain halving does not guarantee
+ * (e.g. 8704 -> 4352).
+ *
+ * A request smaller than one page is attempted exactly once instead of being
+ * rejected outright, so short copies do not fail with a spurious
+ * out-of-memory result.
+ *
+ * @req_size:   number of bytes the caller would like
+ * @alloc_size: set to the number of bytes actually allocated; left untouched
+ *              when NULL is returned
+ * @gfp_mask:   allocation flags passed to kvmalloc()
+ *
+ * Returns the buffer (to be released with kvfree()), or NULL if @req_size is
+ * zero or no attempt succeeded.
+ */
+static void *blk_alloc_buf(sector_t req_size, sector_t *alloc_size,
+		gfp_t gfp_mask)
+{
+	void *buf;
+
+	if (!req_size)
+		return NULL;
+
+	for (;;) {
+		buf = kvmalloc(req_size, gfp_mask);
+		if (buf) {
+			*alloc_size = req_size;
+			return buf;
+		}
+
+		/* stop once the next attempt would be smaller than a page */
+		if ((req_size >> 1) < PAGE_SIZE)
+			break;
+
+		/* retry with half the size, rounded down to whole pages */
+		req_size = (req_size >> 1) & ~((sector_t)PAGE_SIZE - 1);
+	}
+
+	return NULL;
+}
+
+/*
+ * Completion handler for the write half of an emulated copy.
+ *
+ * On a write error the errno is stored in cio->io_err and the range's
+ * comp_len is clamped to the byte offset (relative to the range's dst) taken
+ * from the failed bio. The bounce buffer and the bio are then released, the
+ * per-chunk context reference is dropped, and when the last reference on the
+ * cio goes away either the caller's endio callback is invoked (and the cio
+ * freed) or the waiting task is woken.
+ *
+ * NOTE(review): the buffer comes from kvmalloc() in blk_alloc_buf(); confirm
+ * that page_address() of the first bvec page yields the pointer kvfree()
+ * expects when the buffer is vmalloc-backed.
+ */
+static void blk_copy_emulate_write_end_io(struct bio *bio)
+{
+	struct copy_ctx *ctx = bio->bi_private;
+	struct cio *cio = ctx->cio;
+	int idx = ctx->range_idx;
+
+	if (bio->bi_status) {
+		sector_t done;
+
+		cio->io_err = blk_status_to_errno(bio->bi_status);
+		done = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+			cio->ranges[idx].dst;
+		cio->ranges[idx].comp_len = min_t(sector_t, done,
+				cio->ranges[idx].comp_len);
+	}
+
+	/* the buffer must be looked up before the bio itself is released */
+	kvfree(page_address(bio->bi_io_vec[0].bv_page));
+	bio_map_kern_endio(bio);
+
+	if (atomic_dec_and_test(&ctx->refcount))
+		kfree(ctx);
+
+	/* only the holder of the final cio reference completes the request */
+	if (!atomic_dec_and_test(&cio->refcount))
+		return;
+
+	if (!cio->endio) {
+		blk_wake_io_task(cio->waiter);
+		return;
+	}
+
+	cio->endio(cio->private, cio->io_err);
+	kfree(cio);
+}
+
+/*
+ * Completion handler for the read half of an emulated copy.
+ *
+ * Success: the matching write is handed to ctx->dispatch_work and the read
+ * bio is freed. For a zoned destination the work is only scheduled when this
+ * context is the first entry of cio->list; otherwise the context is merely
+ * marked REQ_COPY_READ_COMPLETE.
+ *
+ * Failure: the errno is stored in cio->io_err, the range's comp_len is
+ * clamped to the byte offset (relative to the range's src) taken from the
+ * failed bio, and every resource of this chunk is released: the bounce
+ * buffer, the read bio, the never-submitted write bio and the context. The
+ * cio reference is then dropped, completing the request if it was the last.
+ *
+ * NOTE(review): for a zoned destination the context is still linked on
+ * cio->list when it is freed in the failure path; confirm how the dispatch
+ * worker copes with that.
+ * NOTE(review): the success path frees the read bio with a bare kfree();
+ * confirm no bio_uninit() is required there.
+ */
+static void blk_copy_emulate_read_end_io(struct bio *read_bio)
+{
+	struct copy_ctx *ctx = read_bio->bi_private;
+	struct cio *cio = ctx->cio;
+	sector_t clen;
+	int ri = ctx->range_idx;
+	unsigned long flags;
+	bool dispatch = true;
+
+	if (read_bio->bi_status) {
+		cio->io_err = blk_status_to_errno(read_bio->bi_status);
+		goto err_rw_bio;
+	}
+
+	/*
+	 * For a zoned device, check whether the completed bio is the first
+	 * entry in the linked list:
+	 * if yes, start the worker to submit write bios;
+	 * if not, just update the status of the bio in ctx. Once the worker
+	 * gets scheduled, it will submit writes for all the consecutive
+	 * REQ_COPY_READ_COMPLETE bios.
+	 */
+	if (bdev_is_zoned(ctx->write_bio->bi_bdev)) {
+		spin_lock_irqsave(&cio->list_lock, flags);
+		ctx->status = REQ_COPY_READ_COMPLETE;
+		dispatch = (ctx == list_first_entry(&cio->list,
+					struct copy_ctx, list));
+		spin_unlock_irqrestore(&cio->list_lock, flags);
+	}
+
+	if (dispatch)
+		schedule_work(&ctx->dispatch_work);
+
+	kfree(read_bio);
+	return;
+
+err_rw_bio:
+	clen = (read_bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+		cio->ranges[ri].src;
+	cio->ranges[ri].comp_len = min_t(sector_t, clen,
+			cio->ranges[ri].comp_len);
+	/*
+	 * The buffer was obtained through blk_alloc_buf() (kvmalloc), so it
+	 * is released the same way the write completion releases it rather
+	 * than with __free_page().
+	 */
+	kvfree(page_address(read_bio->bi_io_vec[0].bv_page));
+	bio_map_kern_endio(read_bio);
+	if (atomic_dec_and_test(&ctx->refcount)) {
+		/* the write is never issued after a failed read: drop it */
+		bio_map_kern_endio(ctx->write_bio);
+		kfree(ctx);
+	}
+	if (atomic_dec_and_test(&cio->refcount)) {
+		if (cio->endio) {
+			cio->endio(cio->private, cio->io_err);
+			kfree(cio);
+		} else {
+			blk_wake_io_task(cio->waiter);
+		}
+	}
+}
+
+/*
+ * Largest number of bytes a single bio sent to @q is allowed to carry here:
+ * the smaller of the queue's max hardware sectors and one page per segment.
+ */
+static sector_t blk_copy_emulate_max_len(struct request_queue *q)
+{
+	sector_t max_sectors = min_t(sector_t, queue_max_hw_sectors(q),
+			(sector_t)queue_max_segments(q) <<
+			(PAGE_SHIFT - SECTOR_SHIFT));
+
+	return max_sectors << SECTOR_SHIFT;
+}
+
+/*
+ * If native copy offload feature is absent, this function tries to emulate,
+ * by copying data from source to a temporary buffer and from buffer to
+ * destination device.
+ *
+ * @src_bdev/@dst_bdev: source and destination block devices
+ * @ranges:   @nr entries; each entry's comp_len is read as the number of
+ *            bytes already copied (the starting offset) and is then set to
+ *            the entry's len, to be truncated if an I/O or allocation fails
+ * @end_io/@private: completion callback and its argument, stored in the cio
+ * @gfp_mask: allocation flags used for every allocation made here
+ *
+ * Each range is split into chunks no larger than both queues accept; for
+ * every chunk a read bio and a write bio sharing one bounce buffer are
+ * built and the read is submitted. The write is issued from the read
+ * completion through ctx->dispatch_work.
+ *
+ * Returns -ENOMEM if the cio cannot be allocated, otherwise the result of
+ * cio_await_completion(). When a per-chunk allocation fails, submission
+ * stops, the error is recorded in cio->io_err and the current range's
+ * comp_len is limited to what had been submitted.
+ */
+static int blk_copy_emulate(struct block_device *src_bdev,
+		struct block_device *dst_bdev, struct range_entry *ranges,
+		int nr, cio_iodone_t end_io, void *private, gfp_t gfp_mask)
+{
+	struct request_queue *sq = bdev_get_queue(src_bdev);
+	struct request_queue *dq = bdev_get_queue(dst_bdev);
+	sector_t max_hw_len = min_t(sector_t, blk_copy_emulate_max_len(sq),
+			blk_copy_emulate_max_len(dq));
+	struct bio *read_bio, *write_bio;
+	struct copy_ctx *ctx;
+	struct cio *cio;
+	void *buf;
+	sector_t src, dst, offset, buf_len, req_len, rem;
+	unsigned long flags;
+	int ri, ret;
+
+	/* honour the caller's allocation context instead of GFP_KERNEL */
+	cio = kzalloc(sizeof(*cio), gfp_mask);
+	if (!cio)
+		return -ENOMEM;
+	cio->ranges = ranges;
+	atomic_set(&cio->refcount, 1);
+	cio->waiter = current;
+	cio->endio = end_io;
+	cio->private = private;
+
+	if (bdev_is_zoned(dst_bdev)) {
+		INIT_LIST_HEAD(&cio->list);
+		spin_lock_init(&cio->list_lock);
+	}
+
+	for (ri = 0; ri < nr; ri++) {
+		offset = ranges[ri].comp_len;
+		src = ranges[ri].src + offset;
+		dst = ranges[ri].dst + offset;
+		/* If IO fails, we truncate comp_len */
+		ranges[ri].comp_len = ranges[ri].len;
+
+		for (rem = ranges[ri].len - offset; rem > 0; rem -= buf_len) {
+			/*
+			 * Compare in sector_t: narrowing rem to int would
+			 * truncate it for ranges of 2 GiB and more.
+			 */
+			req_len = min_t(sector_t, max_hw_len, rem);
+
+			buf = blk_alloc_buf(req_len, &buf_len, gfp_mask);
+			if (!buf) {
+				ret = -ENOMEM;
+				goto err_alloc_buf;
+			}
+
+			ctx = kzalloc(sizeof(*ctx), gfp_mask);
+			if (!ctx) {
+				ret = -ENOMEM;
+				goto err_ctx;
+			}
+
+			read_bio = bio_map_kern(sq, buf, buf_len, gfp_mask);
+			if (IS_ERR(read_bio)) {
+				ret = PTR_ERR(read_bio);
+				goto err_read_bio;
+			}
+
+			write_bio = bio_map_kern(dq, buf, buf_len, gfp_mask);
+			if (IS_ERR(write_bio)) {
+				ret = PTR_ERR(write_bio);
+				goto err_write_bio;
+			}
+
+			ctx->cio = cio;
+			ctx->range_idx = ri;
+			ctx->write_bio = write_bio;
+			atomic_set(&ctx->refcount, 1);
+
+			read_bio->bi_iter.bi_sector = src >> SECTOR_SHIFT;
+			read_bio->bi_iter.bi_size = buf_len;
+			read_bio->bi_opf = REQ_OP_READ | REQ_SYNC;
+			bio_set_dev(read_bio, src_bdev);
+			read_bio->bi_end_io = blk_copy_emulate_read_end_io;
+			read_bio->bi_private = ctx;
+
+			write_bio->bi_iter.bi_size = buf_len;
+			write_bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
+			bio_set_dev(write_bio, dst_bdev);
+			write_bio->bi_end_io = blk_copy_emulate_write_end_io;
+			write_bio->bi_iter.bi_sector = dst >> SECTOR_SHIFT;
+			write_bio->bi_private = ctx;
+
+			if (bdev_is_zoned(dst_bdev)) {
+				INIT_WORK(&ctx->dispatch_work,
+					blk_zoned_copy_dispatch_work_fn);
+				INIT_LIST_HEAD(&ctx->list);
+				/* queue in submission order for the worker */
+				spin_lock_irqsave(&cio->list_lock, flags);
+				ctx->status = REQ_COPY_READ_PROGRESS;
+				list_add_tail(&ctx->list, &cio->list);
+				spin_unlock_irqrestore(&cio->list_lock, flags);
+			} else {
+				INIT_WORK(&ctx->dispatch_work,
+					blk_copy_dispatch_work_fn);
+			}
+
+			/* one cio reference per chunk in flight */
+			atomic_inc(&cio->refcount);
+			submit_bio(read_bio);
+
+			src += buf_len;
+			dst += buf_len;
+		}
+	}
+
+	/* Wait for completion of all IO's */
+	return cio_await_completion(cio);
+
+err_write_bio:
+	/*
+	 * Release the unsubmitted read bio the same way the completion
+	 * handlers release bio_map_kern() bios, rather than with bio_put().
+	 */
+	bio_map_kern_endio(read_bio);
+err_read_bio:
+	kfree(ctx);
+err_ctx:
+	kvfree(buf);
+err_alloc_buf:
+	/*
+	 * Only (len - rem) bytes of this range were submitted; never report
+	 * more than that as completed.
+	 */
+	ranges[ri].comp_len = min_t(sector_t, ranges[ri].comp_len,
+			ranges[ri].len - rem);
+
+	cio->io_err = ret;
+	return cio_await_completion(cio);
+}
+
static inline bool blk_check_copy_offload(struct request_queue *src_q,
struct request_queue *dst_q)
{
@@ -460,15 +693,21 @@ int blkdev_issue_copy(struct block_device *src_bdev,
struct request_queue *src_q = bdev_get_queue(src_bdev);
struct request_queue *dst_q = bdev_get_queue(dst_bdev);
int ret = -EINVAL;
+ bool offload = false;

ret = blk_copy_sanity_check(src_bdev, dst_bdev, ranges, nr);
if (ret)
return ret;

- if (blk_check_copy_offload(src_q, dst_q))
+ offload = blk_check_copy_offload(src_q, dst_q);
+ if (offload)
ret = blk_copy_offload(src_bdev, dst_bdev, ranges, nr,
end_io, private, gfp_mask);

+ if (ret || !offload)
+ ret = blk_copy_emulate(src_bdev, dst_bdev, ranges, nr,
+ end_io, private, gfp_mask);
+
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_issue_copy);
diff --git a/block/blk-map.c b/block/blk-map.c
index 19940c978c73..bcf8db2b75f1 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -363,7 +363,7 @@ static void bio_invalidate_vmalloc_pages(struct bio *bio)
#endif
}

-static void bio_map_kern_endio(struct bio *bio)
+void bio_map_kern_endio(struct bio *bio)
{
bio_invalidate_vmalloc_pages(bio);
bio_uninit(bio);
@@ -380,7 +380,7 @@ static void bio_map_kern_endio(struct bio *bio)
* Map the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
-static struct bio *bio_map_kern(struct request_queue *q, void *data,
+struct bio *bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a3b12ad42ed7..b0b18c30a60b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1068,6 +1068,9 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
int blkdev_issue_copy(struct block_device *src_bdev,
struct block_device *dst_bdev, struct range_entry *ranges,
int nr, cio_iodone_t end_io, void *private, gfp_t gfp_mask);
+struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
+ gfp_t gfp_mask);
+void bio_map_kern_endio(struct bio *bio);

#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
--
2.35.1.500.gb896f729e2