@@ -93,6 +93,7 @@ unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
93
93
struct hlist_head * zvol_htable ;
94
94
static list_t zvol_state_list ;
95
95
krwlock_t zvol_state_lock ;
96
+ extern int zfs_bclone_wait_dirty ;
96
97
97
98
typedef enum {
98
99
ZVOL_ASYNC_REMOVE_MINORS ,
@@ -516,6 +517,285 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
516
517
return (error );
517
518
}
518
519
520
/*
 * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
 * after a system failure.
 *
 * arg1 is the zvol being replayed into, arg2 is the raw log record;
 * byteswap is set when the record was written with the opposite
 * endianness.  Returns 0 on success or an errno-style error.
 */
static int
zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_clone_range_t *lr = arg2;
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	int error;
	uint64_t blksz;
	uint64_t off;
	uint64_t len;

	/* The record must be big enough to hold all lr_nbps block pointers. */
	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
	ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
	    lr_bps[lr->lr_nbps]));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	/* A clone record can only have been logged with the feature active. */
	ASSERT(spa_feature_is_enabled(dmu_objset_spa(os),
	    SPA_FEATURE_BLOCK_CLONING));

	off = lr->lr_offset;
	len = lr->lr_length;
	blksz = lr->lr_blksz;

	/* The cloned range must start on a block boundary. */
	if ((off % blksz) != 0) {
		return (SET_ERROR(EINVAL));
	}

	error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
	if (error != 0 || !zv->zv_dn)
		return (error);
	tx = dmu_tx_create(os);
	dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		goto out;
	}
	/* Re-apply the block pointers captured in the log record. */
	error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len,
	    tx, lr->lr_bps, lr->lr_nbps);
	if (error != 0) {
		dmu_tx_commit(tx);
		goto out;
	}

	/*
	 * zil_replaying() not only checks if we are replaying ZIL, but also
	 * updates the ZIL header to record replay progress.
	 */
	VERIFY(zil_replaying(zv->zv_zilog, tx));
	dmu_tx_commit(tx);

out:
	/* Drop the dnode hold taken above, on both success and error paths. */
	dnode_rele(zv->zv_dn, zv);
	zv->zv_dn = NULL;
	return (error);
}
583
+
584
/*
 * Clone a range of blocks from zv_src (starting at inoff) into zv_dst
 * (starting at outoff) using block cloning, without copying the data.
 * Both volumes must be in the same pool, agree on encryption state and
 * volblocksize, and the offsets and length must be block-aligned.
 *
 * Returns 0 on success or a SET_ERROR()-wrapped errno.  EAGAIN (dirty
 * source blocks with zfs_bclone_wait_dirty unset) is passed back so the
 * caller can fall back to a regular copy.
 */
int
zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
    uint64_t outoff, uint64_t len)
{
	zilog_t *zilog_dst;
	zfs_locked_range_t *inlr, *outlr;
	objset_t *inos, *outos;
	dmu_tx_t *tx;
	blkptr_t *bps;
	size_t maxblocks;
	int error = EINVAL;

	/*
	 * Lazily open the destination ZIL on first use.  Double-checked
	 * under the suspend lock: re-take it as WRITER only if the ZIL is
	 * missing, then downgrade back to READER for the rest of the call.
	 */
	rw_enter(&zv_dst->zv_suspend_lock, RW_READER);
	if (zv_dst->zv_zilog == NULL) {
		rw_exit(&zv_dst->zv_suspend_lock);
		rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER);
		if (zv_dst->zv_zilog == NULL) {
			zv_dst->zv_zilog = zil_open(zv_dst->zv_objset,
			    zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums);
			zv_dst->zv_flags |= ZVOL_WRITTEN_TO;
			/* Replay must be complete before new writes happen. */
			VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED));
		}
		rw_downgrade(&zv_dst->zv_suspend_lock);
	}
	if (zv_src != zv_dst)
		rw_enter(&zv_src->zv_suspend_lock, RW_READER);

	inos = zv_src->zv_objset;
	outos = zv_dst->zv_objset;

	/*
	 * Sanity checks
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		error = EOPNOTSUPP;
		goto out;
	}
	/* Cloning is only possible within a single pool. */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		error = EXDEV;
		goto out;
	}
	if (inos->os_encrypted != outos->os_encrypted) {
		error = EXDEV;
		goto out;
	}
	if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) {
		error = EINVAL;
		goto out;
	}
	/* Nothing to clone when either offset is past the volume end. */
	if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) {
		error = 0;
		goto out;
	}

	/*
	 * Do not read beyond boundary
	 */
	if (len > zv_src->zv_volsize - inoff)
		len = zv_src->zv_volsize - inoff;
	if (len > zv_dst->zv_volsize - outoff)
		len = zv_dst->zv_volsize - outoff;
	if (len == 0) {
		error = 0;
		goto out;
	}

	/*
	 * No overlapping if we are cloning within the same file
	 */
	if (zv_src == zv_dst) {
		if (inoff < outoff + len && outoff < inoff + len) {
			error = EINVAL;
			goto out;
		}
	}

	/*
	 * Offsets and length must be at block boundaries
	 */
	if ((inoff % zv_src->zv_volblocksize) != 0 ||
	    (outoff % zv_dst->zv_volblocksize) != 0) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Length must be multiple of block size
	 */
	if ((len % zv_src->zv_volblocksize) != 0) {
		error = EINVAL;
		goto out;
	}

	zilog_dst = zv_dst->zv_zilog;
	/* Largest number of BPs a single clone log record can describe. */
	maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);
	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
	/*
	 * Maintain predictable lock order.
	 */
	if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
		    RL_READER);
	}

	/* Clone in chunks of at most maxblocks blocks per transaction. */
	while (len > 0) {
		uint64_t size, last_synced_txg;
		size_t nbps = maxblocks;
		size = MIN(zv_src->zv_volblocksize * maxblocks, len);
		last_synced_txg = spa_last_synced_txg(
		    dmu_objset_spa(zv_src->zv_objset));
		error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff,
		    size, bps, &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here.  Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fallback, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_synced(dmu_objset_pool
				    (zv_src->zv_objset), last_synced_txg + 1);
				continue;
			}
			break;
		}

		tx = dmu_tx_create(zv_dst->zv_objset);
		dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size,
		    tx, bps, nbps);
		if (error != 0) {
			/* Assigned tx must be committed even on failure. */
			dmu_tx_commit(tx);
			break;
		}
		/* Record the clone in the destination ZIL for crash replay. */
		zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff,
		    size, zv_src->zv_volblocksize, bps, nbps);
		dmu_tx_commit(tx);
		inoff += size;
		outoff += size;
		len -= size;
	}
	vmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);
	if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) {
		/* sync=always: make the clone durable before returning. */
		zil_commit(zilog_dst, ZVOL_OBJ);
	}
out:
	if (zv_src != zv_dst)
		rw_exit(&zv_src->zv_suspend_lock);
	rw_exit(&zv_dst->zv_suspend_lock);
	return (SET_ERROR(error));
}
754
+
755
+ /*
756
+ * Handles TX_CLONE_RANGE transactions.
757
+ */
758
+ void
759
+ zvol_log_clone_range (zilog_t * zilog , dmu_tx_t * tx , int txtype , uint64_t off ,
760
+ uint64_t len , uint64_t blksz , const blkptr_t * bps , size_t nbps )
761
+ {
762
+ itx_t * itx ;
763
+ lr_clone_range_t * lr ;
764
+ uint64_t partlen , max_log_data ;
765
+ size_t partnbps ;
766
+
767
+ if (zil_replaying (zilog , tx ))
768
+ return ;
769
+
770
+ max_log_data = zil_max_log_data (zilog , sizeof (lr_clone_range_t ));
771
+
772
+ while (nbps > 0 ) {
773
+ partnbps = MIN (nbps , max_log_data / sizeof (bps [0 ]));
774
+ partlen = partnbps * blksz ;
775
+ ASSERT3U (partlen , < , len + blksz );
776
+ partlen = MIN (partlen , len );
777
+
778
+ itx = zil_itx_create (txtype ,
779
+ sizeof (* lr ) + sizeof (bps [0 ]) * partnbps );
780
+ lr = (lr_clone_range_t * )& itx -> itx_lr ;
781
+ lr -> lr_foid = ZVOL_OBJ ;
782
+ lr -> lr_offset = off ;
783
+ lr -> lr_length = partlen ;
784
+ lr -> lr_blksz = blksz ;
785
+ lr -> lr_nbps = partnbps ;
786
+ memcpy (lr -> lr_bps , bps , sizeof (bps [0 ]) * partnbps );
787
+
788
+ zil_itx_assign (zilog , itx , tx );
789
+
790
+ bps += partnbps ;
791
+ ASSERT3U (nbps , >=, partnbps );
792
+ nbps -= partnbps ;
793
+ off += partlen ;
794
+ ASSERT3U (len , >=, partlen );
795
+ len -= partlen ;
796
+ }
797
+ }
798
+
519
799
static int
520
800
zvol_replay_err (void * arg1 , void * arg2 , boolean_t byteswap )
521
801
{
@@ -540,7 +820,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
540
820
zvol_replay_write , /* TX_WRITE */
541
821
zvol_replay_truncate , /* TX_TRUNCATE */
542
822
zvol_replay_err , /* TX_SETATTR */
823
+ zvol_replay_err , /* TX_ACL_V0 */
543
824
zvol_replay_err , /* TX_ACL */
825
+ zvol_replay_err , /* TX_CREATE_ACL */
544
826
zvol_replay_err , /* TX_CREATE_ATTR */
545
827
zvol_replay_err , /* TX_CREATE_ACL_ATTR */
546
828
zvol_replay_err , /* TX_MKDIR_ACL */
@@ -550,7 +832,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
550
832
zvol_replay_err , /* TX_SETSAXATTR */
551
833
zvol_replay_err , /* TX_RENAME_EXCHANGE */
552
834
zvol_replay_err , /* TX_RENAME_WHITEOUT */
553
- zvol_replay_err , /* TX_CLONE_RANGE */
835
+ zvol_replay_clone_range , /* TX_CLONE_RANGE */
554
836
};
555
837
556
838
/*
0 commit comments