Merge pull request #44 from sail-sg/inv

gasvn · web-flow · commit 572298811b13 · 2023-06-30T00:42:13.000+08:00
Support textual inversion for drag prompt
diff --git a/editany_demo.py b/editany_demo.py
@@ -80,7 +80,7 @@ def create_demo_template(
                     a_prompt = gr.Textbox(
                         label="Positive Prompt",
                         info="Text in the expected things of edited region",
-                        value="best quality, extremely detailed",
+                        value="best quality, extremely detailed,",
                     )
                     n_prompt = gr.Textbox(
                         label="Negative Prompt",
@@ -177,7 +177,7 @@ def create_demo_template(
                         label="SAM Control Scale",
                         minimum=0,
                         maximum=1.0,
-                        value=1.0,
+                        value=0.3,
                         step=0.1,
                     )
                     ref_inpaint_scale = gr.Slider(
@@ -187,6 +187,15 @@ def create_demo_template(
                         value=0.2,
                         step=0.1,
                     )
+                    with gr.Row():
+                        ref_textinv = gr.Checkbox(
+                            label="Use textual inversion token", value=False
+                        )
+                        ref_textinv_path = gr.Textbox(
+                            label="textual inversion token path",
+                            info="Text in the inversion token path",
+                            value=None,
+                        )
 
                 with gr.Accordion("Advanced options", open=False):
                     mask_image = gr.Image(
@@ -277,6 +286,8 @@ def create_demo_template(
             ref_sam_scale,
             ref_inpaint_scale,
             ref_auto_prompt,
+            ref_textinv,
+            ref_textinv_path,
         ]
         run_button.click(
             fn=process,
@@ -321,6 +332,8 @@ def create_demo_template(
             ref_sam_scale,
             ref_inpaint_scale,
             ref_auto_prompt,
+            ref_textinv,
+            ref_textinv_path,
         ]
 
         run_button_click.click(
diff --git a/editany_lora.py b/editany_lora.py
@@ -600,6 +600,8 @@ def process(
         ref_sam_scale=None,
         ref_inpaint_scale=None,
         ref_auto_prompt=False,
+        ref_textinv=True,
+        ref_textinv_path=None,
     ):
 
         if condition_model is None or condition_model == "EditAnything":
@@ -652,7 +654,7 @@ def process(
             )
             self.default_controlnet_path = this_controlnet_path
             torch.cuda.empty_cache()
-        if self.last_ref_infer and ref_image is None:
+        if self.last_ref_infer:
             print("Redefine the model to overwrite the ref mode")
             self.pipe = obtain_generation_model(
                 self.base_model_path,
@@ -661,11 +663,12 @@ def process(
                 enable_all_generate,
                 self.extra_inpaint,
             )
+            self.last_ref_infer = False
 
         if ref_image is not None:
             ref_mask = ref_image["mask"]
             ref_image = ref_image["image"]
-            if ref_auto_prompt:
+            if ref_auto_prompt or ref_textinv:
                 bbox = get_bounding_box(
                     np.array(ref_mask) / 255
                 )  # reverse the mask to make 1 the choosen region
@@ -680,13 +683,27 @@ def process(
                 cropped_ref_image = Image.fromarray(
                     cropped_ref_image.astype("uint8"))
 
+            if ref_auto_prompt:
                 generated_prompt = self.get_blip2_text(cropped_ref_image)
                 ref_prompt += generated_prompt
                 a_prompt += generated_prompt
             print("Generated ref text:", ref_prompt)
             print("Generated input text:", a_prompt)
+            self.last_ref_infer = True
             # ref_image = cropped_ref_image
             # ref_mask = cropped_ref_mask
+            if ref_textinv:
+                try:
+                    self.pipe.load_textual_inversion(ref_textinv_path)
+                    print("Load textinv embedding from:", ref_textinv_path)
+                except:
+                    print("No textinvert embeddings found.")
+                    ref_data_path = "./utils/tmp/textinv/img"
+                    if not os.path.exists(ref_data_path):   
+                        os.makedirs(ref_data_path)
+                    cropped_ref_image.save(os.path.join(ref_data_path, 'ref.png'))
+                    print("Ref image region is save to:", ref_data_path)
+                    print("Plese finetune with run_texutal_inversion.sh in utils folder to get the textinvert embeddings.")
 
         else:
             ref_mask = None
diff --git a/environment.yaml b/environment.yaml
@@ -10,7 +10,7 @@ dependencies:
   - torchvision=0.14.1
   - numpy=1.23.1
   - pip:
-      - gradio==3.16.2
+      - gradio==3.35.2
       - albumentations==1.3.0
       - opencv-contrib-python==4.3.0.36
       - imageio==2.9.0
@@ -32,6 +32,7 @@ dependencies:
       - prettytable==3.6.0
       - safetensors==0.2.7
       - basicsr==1.4.2
-      - diffusers==0.14.0
+      - diffusers==0.17.1
       - accelerate==0.17.0
-      - transformers==4.27.4
+      - transformers==4.30.2
+      - xformers
diff --git a/utils/run_texutal_inversion.sh b/utils/run_texutal_inversion.sh
@@ -0,0 +1,18 @@
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export DATA_DIR="./tmp/textinv/img"
+export OUTPUT_DIR="./tmp/textinv/model"
+
+CUDA_VISIBLE_DEVICES=0 accelerate launch --main_process_port 1111 texutal_inversion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<new-obj>" --initializer_token="mark" \
+  --resolution=512 \
+  --train_batch_size=4 \
+  --gradient_accumulation_steps=1 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_vectors 10
diff --git a/utils/stable_diffusion_reference.py b/utils/stable_diffusion_reference.py
@@ -468,7 +468,7 @@ def hack_CrossAttnDownBlock2D_forward(
                         and len(self.mean_bank0) > 0
                         and len(self.var_bank0) > 0
                     ):
-                        print("hacked_CrossAttnDownBlock2D_forward0")
+                        # print("hacked_CrossAttnDownBlock2D_forward0")
                         scale_ratio = self.inpaint_mask.shape[2] / \
                             hidden_states.shape[2]
                         this_inpaint_mask = F.interpolate(
@@ -548,7 +548,7 @@ def hack_CrossAttnDownBlock2D_forward(
                         and len(self.mean_bank) > 0
                         and len(self.var_bank) > 0
                     ):
-                        print("hack_CrossAttnDownBlock2D_forward")
+                        # print("hack_CrossAttnDownBlock2D_forward")
                         scale_ratio = self.inpaint_mask.shape[2] / \
                             hidden_states.shape[2]
                         this_inpaint_mask = F.interpolate(
@@ -645,7 +645,7 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None):
                         and len(self.mean_bank) > 0
                         and len(self.var_bank) > 0
                     ):
-                        print("hacked_DownBlock2D_forward")
+                        # print("hacked_DownBlock2D_forward")
                         scale_ratio = self.inpaint_mask.shape[2] / \
                             hidden_states.shape[2]
                         this_inpaint_mask = F.interpolate(
@@ -753,7 +753,7 @@ def hacked_CrossAttnUpBlock2D_forward(
                         and len(self.mean_bank0) > 0
                         and len(self.var_bank0) > 0
                     ):
-                        print("hacked_CrossAttnUpBlock2D_forward1")
+                        # print("hacked_CrossAttnUpBlock2D_forward1")
                         scale_ratio = self.inpaint_mask.shape[2] / \
                             hidden_states.shape[2]
                         this_inpaint_mask = F.interpolate(
@@ -835,7 +835,7 @@ def hacked_CrossAttnUpBlock2D_forward(
                         and len(self.mean_bank) > 0
                         and len(self.var_bank) > 0
                     ):
-                        print("hacked_CrossAttnUpBlock2D_forward")
+                        # print("hacked_CrossAttnUpBlock2D_forward")
                         scale_ratio = self.inpaint_mask.shape[2] / \
                             hidden_states.shape[2]
                         this_inpaint_mask = F.interpolate(
@@ -932,7 +932,7 @@ def hacked_UpBlock2D_forward(
                         and len(self.mean_bank) > 0
                         and len(self.var_bank) > 0
                     ):
-                        print("hacked_UpBlock2D_forward")
+                        # print("hacked_UpBlock2D_forward")
                         scale_ratio = self.inpaint_mask.shape[2] / \
                             hidden_states.shape[2]
                         this_inpaint_mask = F.interpolate(
diff --git a/utils/texutal_inversion.py b/utils/texutal_inversion.py
@@ -18,6 +18,7 @@
 import math
 import os
 import random
+import shutil
 import warnings
 from pathlib import Path
 
@@ -77,7 +78,7 @@
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-# check_min_version("0.17.0.dev0")
+# check_min_version("0.18.0.dev0")
 
 logger = get_logger(__name__)
 
@@ -394,11 +395,7 @@ def parse_args():
         "--checkpoints_total_limit",
         type=int,
         default=None,
-        help=(
-            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
-            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
-            " for more docs"
-        ),
+        help=("Max number of checkpoints to store."),
     )
     parser.add_argument(
         "--resume_from_checkpoint",
@@ -423,38 +420,36 @@ def parse_args():
 
     return args
 
+
 imagenet_templates_small = [
     "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
 ]
-# imagenet_templates_small = [
-#     "a photo of a {}",
-#     "a rendering of a {}",
-#     "a cropped photo of the {}",
-#     "the photo of a {}",
-#     "a photo of a clean {}",
-#     "a photo of a dirty {}",
-#     "a dark photo of the {}",
-#     "a photo of my {}",
-#     "a photo of the cool {}",
-#     "a close-up photo of a {}",
-#     "a bright photo of the {}",
-#     "a cropped photo of a {}",
-#     "a photo of the {}",
-#     "a good photo of the {}",
-#     "a photo of one {}",
-#     "a close-up photo of the {}",
-#     "a rendition of the {}",
-#     "a photo of the clean {}",
-#     "a rendition of a {}",
-#     "a photo of a nice {}",
-#     "a good photo of a {}",
-#     "a photo of the nice {}",
-#     "a photo of the small {}",
-#     "a photo of the weird {}",
-#     "a photo of the large {}",
-#     "a photo of a cool {}",
-#     "a photo of a small {}",
-# ]
 
 imagenet_style_templates_small = [
     "a painting in the style of {}",
@@ -568,14 +563,11 @@ def __getitem__(self, i):
 def main():
     args = parse_args()
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
-
-    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit)
-
+    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
-        logging_dir=logging_dir,
         project_config=accelerator_project_config,
     )
 
@@ -755,8 +747,8 @@ def main():
         text_encoder, optimizer, train_dataloader, lr_scheduler
     )
 
-    # For mixed precision training we cast the unet and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
     weight_dtype = torch.float32
     if accelerator.mixed_precision == "fp16":
         weight_dtype = torch.float16
@@ -890,6 +882,26 @@ def main():
 
                 if accelerator.is_main_process:
                     if global_step % args.checkpointing_steps == 0:
+                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
+                        if args.checkpoints_total_limit is not None:
+                            checkpoints = os.listdir(args.output_dir)
+                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
+                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))
+
+                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
+                            if len(checkpoints) >= args.checkpoints_total_limit:
+                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
+                                removing_checkpoints = checkpoints[0:num_to_remove]
+
+                                logger.info(
+                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
+                                )
+                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")
+
+                                for removing_checkpoint in removing_checkpoints:
+                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
+                                    shutil.rmtree(removing_checkpoint)
+
                         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                         accelerator.save_state(save_path)
                         logger.info(f"Saved state to {save_path}")