Skip to content

Commit 189f07b

Browse files
committed
zvol: implement platform-independent part of block cloning
In Linux, block devices currently lack support for the `copy_file_range` API because the kernel does not provide the necessary functionality. However, there is an ongoing upstream effort to address this limitation: https://patchwork.kernel.org/project/dm-devel/cover/[email protected]/. We have adopted this upstream kernel patch into the TrueNAS kernel and made some additional modifications to enable block cloning specifically for the zvol block device. This patch implements the platform-independent portions of these changes for inclusion in OpenZFS. This patch does not introduce any new functionality directly into OpenZFS. The `TX_CLONE_RANGE` replay capability is only relevant when zvols are migrated to non-TrueNAS systems that support Clone Range replay in the ZIL. Signed-off-by: Ameer Hamza <[email protected]>
1 parent 1acd246 commit 189f07b

File tree

3 files changed

+289
-2
lines changed

3 files changed

+289
-2
lines changed

include/sys/zvol_impl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
8888
int zvol_init_impl(void);
8989
void zvol_fini_impl(void);
9090
void zvol_wait_close(zvol_state_t *zv);
91+
int zvol_clone_range(zvol_state_handle_t *, uint64_t,
92+
zvol_state_handle_t *, uint64_t, uint64_t);
93+
void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype,
94+
uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
95+
size_t nbps);
9196

9297
/*
9398
* platform dependent functions exported to platform independent code

module/zfs/zfs_vnops.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ int zfs_bclone_enabled = 1;
7171
* a copy of the file and is therefore not the default. However, in certain
7272
* scenarios this behavior may be desirable so a tunable is provided.
7373
*/
74-
static int zfs_bclone_wait_dirty = 0;
74+
int zfs_bclone_wait_dirty = 0;
7575

7676
/*
7777
* Enable Direct I/O. If this setting is 0, then all I/O requests will be

module/zfs/zvol.c

Lines changed: 283 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
9393
struct hlist_head *zvol_htable;
9494
static list_t zvol_state_list;
9595
krwlock_t zvol_state_lock;
96+
extern int zfs_bclone_wait_dirty;
9697

9798
typedef enum {
9899
ZVOL_ASYNC_REMOVE_MINORS,
@@ -516,6 +517,285 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
516517
return (error);
517518
}
518519

520+
/*
521+
* Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
522+
* after a system failure
523+
*/
524+
static int
525+
zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
526+
{
527+
zvol_state_t *zv = arg1;
528+
lr_clone_range_t *lr = arg2;
529+
objset_t *os = zv->zv_objset;
530+
dmu_tx_t *tx;
531+
int error;
532+
uint64_t blksz;
533+
uint64_t off;
534+
uint64_t len;
535+
536+
ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
537+
ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
538+
lr_bps[lr->lr_nbps]));
539+
540+
if (byteswap)
541+
byteswap_uint64_array(lr, sizeof (*lr));
542+
543+
ASSERT(spa_feature_is_enabled(dmu_objset_spa(os),
544+
SPA_FEATURE_BLOCK_CLONING));
545+
546+
off = lr->lr_offset;
547+
len = lr->lr_length;
548+
blksz = lr->lr_blksz;
549+
550+
if ((off % blksz) != 0) {
551+
return (SET_ERROR(EINVAL));
552+
}
553+
554+
error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
555+
if (error != 0 || !zv->zv_dn)
556+
return (error);
557+
tx = dmu_tx_create(os);
558+
dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len);
559+
error = dmu_tx_assign(tx, TXG_WAIT);
560+
if (error != 0) {
561+
dmu_tx_abort(tx);
562+
goto out;
563+
}
564+
error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len,
565+
tx, lr->lr_bps, lr->lr_nbps);
566+
if (error != 0) {
567+
dmu_tx_commit(tx);
568+
goto out;
569+
}
570+
571+
/*
572+
* zil_replaying() not only check if we are replaying ZIL, but also
573+
* updates the ZIL header to record replay progress.
574+
*/
575+
VERIFY(zil_replaying(zv->zv_zilog, tx));
576+
dmu_tx_commit(tx);
577+
578+
out:
579+
dnode_rele(zv->zv_dn, zv);
580+
zv->zv_dn = NULL;
581+
return (error);
582+
}
583+
584+
int
585+
zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
586+
uint64_t outoff, uint64_t len)
587+
{
588+
zilog_t *zilog_dst;
589+
zfs_locked_range_t *inlr, *outlr;
590+
objset_t *inos, *outos;
591+
dmu_tx_t *tx;
592+
blkptr_t *bps;
593+
size_t maxblocks;
594+
int error = EINVAL;
595+
596+
rw_enter(&zv_dst->zv_suspend_lock, RW_READER);
597+
if (zv_dst->zv_zilog == NULL) {
598+
rw_exit(&zv_dst->zv_suspend_lock);
599+
rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER);
600+
if (zv_dst->zv_zilog == NULL) {
601+
zv_dst->zv_zilog = zil_open(zv_dst->zv_objset,
602+
zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums);
603+
zv_dst->zv_flags |= ZVOL_WRITTEN_TO;
604+
VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags &
605+
ZIL_REPLAY_NEEDED));
606+
}
607+
rw_downgrade(&zv_dst->zv_suspend_lock);
608+
}
609+
if (zv_src != zv_dst)
610+
rw_enter(&zv_src->zv_suspend_lock, RW_READER);
611+
612+
inos = zv_src->zv_objset;
613+
outos = zv_dst->zv_objset;
614+
615+
/*
616+
* Sanity checks
617+
*/
618+
if (!spa_feature_is_enabled(dmu_objset_spa(outos),
619+
SPA_FEATURE_BLOCK_CLONING)) {
620+
error = EOPNOTSUPP;
621+
goto out;
622+
}
623+
if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
624+
error = EXDEV;
625+
goto out;
626+
}
627+
if (inos->os_encrypted != outos->os_encrypted) {
628+
error = EXDEV;
629+
goto out;
630+
}
631+
if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) {
632+
error = EINVAL;
633+
goto out;
634+
}
635+
if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) {
636+
error = 0;
637+
goto out;
638+
}
639+
640+
/*
641+
* Do not read beyond boundary
642+
*/
643+
if (len > zv_src->zv_volsize - inoff)
644+
len = zv_src->zv_volsize - inoff;
645+
if (len > zv_dst->zv_volsize - outoff)
646+
len = zv_dst->zv_volsize - outoff;
647+
if (len == 0) {
648+
error = 0;
649+
goto out;
650+
}
651+
652+
/*
653+
* No overlapping if we are cloning within the same file
654+
*/
655+
if (zv_src == zv_dst) {
656+
if (inoff < outoff + len && outoff < inoff + len) {
657+
error = EINVAL;
658+
goto out;
659+
}
660+
}
661+
662+
/*
663+
* Offsets and length must be at block boundaries
664+
*/
665+
if ((inoff % zv_src->zv_volblocksize) != 0 ||
666+
(outoff % zv_dst->zv_volblocksize) != 0) {
667+
error = EINVAL;
668+
goto out;
669+
}
670+
671+
/*
672+
* Length must be multiple of block size
673+
*/
674+
if ((len % zv_src->zv_volblocksize) != 0) {
675+
error = EINVAL;
676+
goto out;
677+
}
678+
679+
zilog_dst = zv_dst->zv_zilog;
680+
maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) /
681+
sizeof (bps[0]);
682+
bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
683+
/*
684+
* Maintain predictable lock order.
685+
*/
686+
if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) {
687+
inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
688+
RL_READER);
689+
outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
690+
RL_WRITER);
691+
} else {
692+
outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
693+
RL_WRITER);
694+
inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
695+
RL_READER);
696+
}
697+
698+
while (len > 0) {
699+
uint64_t size, last_synced_txg;
700+
size_t nbps = maxblocks;
701+
size = MIN(zv_src->zv_volblocksize * maxblocks, len);
702+
last_synced_txg = spa_last_synced_txg(
703+
dmu_objset_spa(zv_src->zv_objset));
704+
error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff,
705+
size, bps, &nbps);
706+
if (error != 0) {
707+
/*
708+
* If we are trying to clone a block that was created
709+
* in the current transaction group, the error will be
710+
* EAGAIN here. Based on zfs_bclone_wait_dirty either
711+
* return a shortened range to the caller so it can
712+
* fallback, or wait for the next TXG and check again.
713+
*/
714+
if (error == EAGAIN && zfs_bclone_wait_dirty) {
715+
txg_wait_synced(dmu_objset_pool
716+
(zv_src->zv_objset), last_synced_txg + 1);
717+
continue;
718+
}
719+
break;
720+
}
721+
722+
tx = dmu_tx_create(zv_dst->zv_objset);
723+
dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size);
724+
error = dmu_tx_assign(tx, TXG_WAIT);
725+
if (error != 0) {
726+
dmu_tx_abort(tx);
727+
break;
728+
}
729+
error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size,
730+
tx, bps, nbps);
731+
if (error != 0) {
732+
dmu_tx_commit(tx);
733+
break;
734+
}
735+
zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff,
736+
size, zv_src->zv_volblocksize, bps, nbps);
737+
dmu_tx_commit(tx);
738+
inoff += size;
739+
outoff += size;
740+
len -= size;
741+
}
742+
vmem_free(bps, sizeof (bps[0]) * maxblocks);
743+
zfs_rangelock_exit(outlr);
744+
zfs_rangelock_exit(inlr);
745+
if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) {
746+
zil_commit(zilog_dst, ZVOL_OBJ);
747+
}
748+
out:
749+
if (zv_src != zv_dst)
750+
rw_exit(&zv_src->zv_suspend_lock);
751+
rw_exit(&zv_dst->zv_suspend_lock);
752+
return (SET_ERROR(error));
753+
}
754+
755+
/*
756+
* Handles TX_CLONE_RANGE transactions.
757+
*/
758+
void
759+
zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off,
760+
uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps)
761+
{
762+
itx_t *itx;
763+
lr_clone_range_t *lr;
764+
uint64_t partlen, max_log_data;
765+
size_t partnbps;
766+
767+
if (zil_replaying(zilog, tx))
768+
return;
769+
770+
max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));
771+
772+
while (nbps > 0) {
773+
partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
774+
partlen = partnbps * blksz;
775+
ASSERT3U(partlen, <, len + blksz);
776+
partlen = MIN(partlen, len);
777+
778+
itx = zil_itx_create(txtype,
779+
sizeof (*lr) + sizeof (bps[0]) * partnbps);
780+
lr = (lr_clone_range_t *)&itx->itx_lr;
781+
lr->lr_foid = ZVOL_OBJ;
782+
lr->lr_offset = off;
783+
lr->lr_length = partlen;
784+
lr->lr_blksz = blksz;
785+
lr->lr_nbps = partnbps;
786+
memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);
787+
788+
zil_itx_assign(zilog, itx, tx);
789+
790+
bps += partnbps;
791+
ASSERT3U(nbps, >=, partnbps);
792+
nbps -= partnbps;
793+
off += partlen;
794+
ASSERT3U(len, >=, partlen);
795+
len -= partlen;
796+
}
797+
}
798+
519799
static int
520800
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
521801
{
@@ -540,7 +820,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
540820
zvol_replay_write, /* TX_WRITE */
541821
zvol_replay_truncate, /* TX_TRUNCATE */
542822
zvol_replay_err, /* TX_SETATTR */
823+
zvol_replay_err, /* TX_ACL_V0 */
543824
zvol_replay_err, /* TX_ACL */
825+
zvol_replay_err, /* TX_CREATE_ACL */
544826
zvol_replay_err, /* TX_CREATE_ATTR */
545827
zvol_replay_err, /* TX_CREATE_ACL_ATTR */
546828
zvol_replay_err, /* TX_MKDIR_ACL */
@@ -550,7 +832,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
550832
zvol_replay_err, /* TX_SETSAXATTR */
551833
zvol_replay_err, /* TX_RENAME_EXCHANGE */
552834
zvol_replay_err, /* TX_RENAME_WHITEOUT */
553-
zvol_replay_err, /* TX_CLONE_RANGE */
835+
zvol_replay_clone_range, /* TX_CLONE_RANGE */
554836
};
555837

556838
/*

0 commit comments

Comments
 (0)