Skip to content

Commit d975834

Browse files
authored
Merge pull request #332 from bghira/main
sd2x: num steps remaining fix | vaecache: exit with problematic data backend id
2 parents 5f3dda7 + b5616a7 commit d975834

File tree

2 files changed

+6
-5
lines changed

2 files changed

+6
-5
lines changed

helpers/caching/vae.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ def encode_images(self, images, filepaths, load_from_cache=True):
392392
if len(uncached_image_indices) > 0 and load_from_cache:
393393
# We wanted only uncached images. Something went wrong.
394394
raise Exception(
395-
f"Some images were not correctly cached during the VAE Cache operations. Ensure --skip_file_discovery=vae is not set.\nProblematic images: {uncached_image_paths}"
395+
f"(id={self.id}) Some images were not correctly cached during the VAE Cache operations. Ensure --skip_file_discovery=vae is not set.\nProblematic images: {uncached_image_paths}"
396396
)
397397

398398
if load_from_cache:

train_sd21.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,13 @@ def main():
209209

210210
# Enable TF32 for faster training on Ampere GPUs,
211211
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
212-
if args.allow_tf32:
212+
if args.allow_tf32 and not torch.backends.mps.is_available():
213213
logger.info(
214214
"Enabling tf32 precision boost for NVIDIA devices due to --allow_tf32."
215215
)
216216
torch.backends.cuda.matmul.allow_tf32 = True
217217
torch.backends.cudnn.allow_tf32 = True
218-
else:
218+
elif torch.backends.cuda.is_available():
219219
logger.warning(
220220
"If using an Ada or Ampere NVIDIA device, --allow_tf32 could add a bit more performance."
221221
)
@@ -870,8 +870,9 @@ def main():
870870
total_steps_remaining_at_start = args.max_train_steps
871871
# We store the number of dataset resets that have occurred inside the checkpoint.
872872
if first_epoch > 1:
873-
steps_to_remove = first_epoch * num_update_steps_per_epoch
874-
total_steps_remaining_at_start -= steps_to_remove
873+
total_steps_remaining_at_start = (
874+
total_steps_remaining_at_start - resume_global_step
875+
)
875876
logger.debug(
876877
f"Resuming from epoch {first_epoch}, which leaves us with {total_steps_remaining_at_start}."
877878
)

0 commit comments

Comments (0)