Commit a32162d

Merge pull request #204 from Dartvauder/dev
Dev
2 parents 33c302e + ab08868 commit a32162d

3 files changed: +99 −10 lines changed
LaunchFiles/appEN.py

Lines changed: 80 additions & 2 deletions
@@ -12,6 +12,9 @@
 cache_dir = os.path.join("cache")
 os.makedirs(cache_dir, exist_ok=True)
 os.environ["XDG_CACHE_HOME"] = cache_dir
+temp_dir = os.path.join("temp")
+os.makedirs(temp_dir, exist_ok=True)
+os.environ["TMPDIR"] = temp_dir
 import gradio as gr
 import langdetect
 from datasets import load_dataset, Audio
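The three added lines route temporary files into a project-local folder. A minimal standalone sketch (not part of the commit) of the mechanism, using only the standard library:

```python
# Minimal sketch: Python's tempfile module resolves its default directory from
# the TMPDIR environment variable, so setting it before temporary files are
# first created keeps scratch data inside the local "temp" folder.
import os

temp_dir = os.path.join("temp")
os.makedirs(temp_dir, exist_ok=True)
os.environ["TMPDIR"] = temp_dir

import tempfile  # used after TMPDIR is set, so the new location is picked up

print(tempfile.gettempdir())               # expected to point at ./temp
with tempfile.NamedTemporaryFile() as tmp:
    print(tmp.name)                        # scratch file created under ./temp
```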
@@ -102,6 +105,7 @@ def wrapper():
 
 # Diffusers import
 diffusers = lazy_import('diffusers', '')
+BlipDiffusionPipeline = lazy_import('diffusers.pipelines', 'BlipDiffusionPipeline')
 StableDiffusionPipeline = lazy_import('diffusers', 'StableDiffusionPipeline')
 StableDiffusion3Pipeline = lazy_import('diffusers', 'StableDiffusion3Pipeline')
 StableDiffusionXLPipeline = lazy_import('diffusers', 'StableDiffusionXLPipeline')
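The `lazy_import` helper itself is defined elsewhere in appEN.py and is not shown in this diff. A hypothetical sketch consistent with how it is called here, where the returned wrapper imports the module on first call and returns it (which is why pipelines are later resolved as `BlipDiffusionPipeline().BlipDiffusionPipeline.from_pretrained(...)`):

```python
# Hypothetical sketch of a lazy_import helper matching the calling convention
# used in this file; not the project's actual implementation.
import importlib

def lazy_import(module_name, attr_name):
    # attr_name is kept only to mirror the call signature used above.
    cached = None

    def wrapper():
        nonlocal cached
        if cached is None:
            cached = importlib.import_module(module_name)  # import on first use
        return cached

    return wrapper

BlipDiffusionPipeline = lazy_import('diffusers.pipelines', 'BlipDiffusionPipeline')
# pipeline_cls = BlipDiffusionPipeline().BlipDiffusionPipeline  # resolved lazily
```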
@@ -3380,6 +3384,54 @@ def generate_image_diffedit(source_prompt, source_negative_prompt, target_prompt
 flush()
 
 
+def generate_image_blip_diffusion(text_prompt_input, negative_prompt, cond_image, cond_subject, tgt_subject,
+                                  num_inference_steps, guidance_scale, height, width, output_format):
+    blip_diffusion_path = os.path.join("inputs", "image", "sd_models", "blip-diff")
+
+    if not os.path.exists(blip_diffusion_path):
+        print("Downloading BlipDiffusion model...")
+        os.makedirs(blip_diffusion_path, exist_ok=True)
+        Repo.clone_from("https://huggingface.co/Salesforce/blipdiffusion", blip_diffusion_path)
+        print("BlipDiffusion model downloaded")
+
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        blip_diffusion_pipe = BlipDiffusionPipeline().BlipDiffusionPipeline.from_pretrained(
+            blip_diffusion_path, torch_dtype=torch.float16
+        ).to(device)
+
+        cond_image = Image.open(cond_image).convert("RGB")
+
+        output = blip_diffusion_pipe(
+            text_prompt_input,
+            cond_image,
+            cond_subject,
+            tgt_subject,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+        ).images
+
+        today = datetime.now().date()
+        image_dir = os.path.join('outputs', f"BlipDiffusion_{today.strftime('%Y%m%d')}")
+        os.makedirs(image_dir, exist_ok=True)
+        image_filename = f"blip_diffusion_{datetime.now().strftime('%Y%m%d_%H%M%S')}.{output_format}"
+        image_path = os.path.join(image_dir, image_filename)
+
+        output[0].save(image_path, format=output_format.upper())
+
+        return image_path, "Image generated successfully."
+
+    except Exception as e:
+        return None, str(e)
+
+    finally:
+        del blip_diffusion_pipe
+        flush()
+
+
 def generate_image_animatediff(prompt, negative_prompt, input_video, strength, model_type, stable_diffusion_model_name, seed, motion_lora_name, num_frames, num_inference_steps,
                                guidance_scale, width, height, clip_skip):
 
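For comparison with the wrapper above, a minimal standalone sketch of calling BLIP-Diffusion through diffusers directly, based on the library's documented example for `Salesforce/blipdiffusion`. The image URL, prompts, and subjects are illustrative, and note that the documented negative-prompt keyword in the pipeline is `neg_prompt`:

```python
# Standalone sketch (not part of the commit), following the diffusers docs for
# Salesforce/blipdiffusion. A CUDA GPU is assumed (use float32 on CPU).
import torch
from diffusers.pipelines import BlipDiffusionPipeline
from diffusers.utils import load_image

pipe = BlipDiffusionPipeline.from_pretrained(
    "Salesforce/blipdiffusion", torch_dtype=torch.float16
).to("cuda")

# Illustrative conditioning image of the source subject.
cond_image = load_image(
    "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg"
)

images = pipe(
    "swimming underwater",   # text prompt describing the target scene
    cond_image,              # conditioning image of the source subject
    "dog",                   # conditioning (source) subject
    "dog",                   # target subject
    guidance_scale=7.5,
    num_inference_steps=25,
    neg_prompt="lowres, cropped, worst quality, low quality",
    height=512,
    width=512,
).images

images[0].save("blip_diffusion_example.png")
```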
@@ -8550,6 +8602,32 @@ def reload_interface():
     submit_btn="Generate"
 )
 
+blip_diffusion_interface = gr.Interface(
+    fn=generate_image_blip_diffusion,
+    inputs=[
+        gr.Textbox(label="Prompt"),
+        gr.Textbox(label="Negative Prompt", value=""),
+        gr.Image(label="Conditioning Image", type="filepath"),
+        gr.Textbox(label="Conditioning Subject"),
+        gr.Textbox(label="Target Subject"),
+        gr.Slider(minimum=1, maximum=100, value=30, step=1, label="Inference Steps"),
+        gr.Slider(minimum=0.1, maximum=30.0, value=8, step=0.1, label="Guidance Scale"),
+        gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Height"),
+        gr.Slider(minimum=64, maximum=2048, value=512, step=64, label="Width"),
+        gr.Radio(choices=["png", "jpeg"], label="Output Format", value="png")
+    ],
+    outputs=[
+        gr.Image(type="filepath", label="Generated Image"),
+        gr.Textbox(label="Message")
+    ],
+    title="NeuroSandboxWebUI - BlipDiffusion",
+    description="This interface allows you to generate images using BlipDiffusion. Upload a conditioning image, provide text prompts and subjects, and customize generation parameters.",
+    allow_flagging="never",
+    clear_btn=None,
+    stop_btn="Stop",
+    submit_btn="Generate"
+)
+
 animatediff_interface = gr.Interface(
     fn=generate_image_animatediff,
     inputs=[
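A stripped-down sketch (with a hypothetical stub function, not from the commit) showing how a function returning an `(image_path, message)` tuple maps positionally onto the two output components used above, runnable on its own:

```python
# Minimal sketch: two return values pair with the two output components.
import gradio as gr

def fake_generate(prompt):  # hypothetical stand-in for generate_image_blip_diffusion
    return None, f"Would generate an image for: {prompt}"

demo = gr.Interface(
    fn=fake_generate,
    inputs=gr.Textbox(label="Prompt"),
    outputs=[
        gr.Image(type="filepath", label="Generated Image"),
        gr.Textbox(label="Message"),
    ],
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
```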
@@ -10301,11 +10379,11 @@ def reload_interface():
 gr.TabbedInterface(
     [
         gr.TabbedInterface(
-            [txt2img_interface, img2img_interface, depth2img_interface, marigold_interface, pix2pix_interface, controlnet_interface, latent_upscale_interface, supir_upscale_interface, sdxl_refiner_interface, inpaint_interface, outpaint_interface, gligen_interface, diffedit_interface, animatediff_interface, hotshotxl_interface, video_interface, ldm3d_interface,
+            [txt2img_interface, img2img_interface, depth2img_interface, marigold_interface, pix2pix_interface, controlnet_interface, latent_upscale_interface, supir_upscale_interface, sdxl_refiner_interface, inpaint_interface, outpaint_interface, gligen_interface, diffedit_interface, blip_diffusion_interface, animatediff_interface, hotshotxl_interface, video_interface, ldm3d_interface,
             gr.TabbedInterface([sd3_txt2img_interface, sd3_img2img_interface, sd3_controlnet_interface, sd3_inpaint_interface],
                                tab_names=["txt2img", "img2img", "controlnet", "inpaint"]),
             cascade_interface, t2i_ip_adapter_interface, ip_adapter_faceid_interface, riffusion_interface],
-            tab_names=["txt2img", "img2img", "depth2img", "marigold", "pix2pix", "controlnet", "upscale(latent)", "upscale(SUPIR)", "refiner", "inpaint", "outpaint", "gligen", "diffedit", "animatediff", "hotshotxl", "video", "ldm3d", "sd3", "cascade", "t2i-ip-adapter", "ip-adapter-faceid", "riffusion"]
+            tab_names=["txt2img", "img2img", "depth2img", "marigold", "pix2pix", "controlnet", "upscale(latent)", "upscale(SUPIR)", "refiner", "inpaint", "outpaint", "gligen", "diffedit", "blip-diffusion", "animatediff", "hotshotxl", "video", "ldm3d", "sd3", "cascade", "t2i-ip-adapter", "ip-adapter-faceid", "riffusion"]
         ),
     kandinsky_interface, flux_interface, hunyuandit_interface, lumina_interface, kolors_interface, auraflow_interface, wurstchen_interface, deepfloyd_if_interface, pixart_interface, playgroundv2_interface
     ],
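The nesting relies on `gr.TabbedInterface` accepting other Blocks, including another `TabbedInterface`, as entries, with each entry pairing positionally with a name in `tab_names`. A minimal sketch with hypothetical placeholder interfaces:

```python
# Minimal sketch (not from the commit) of the nested TabbedInterface pattern.
import gradio as gr

def echo(text):  # hypothetical placeholder function
    return text

txt_a = gr.Interface(fn=echo, inputs="text", outputs="text")
txt_b = gr.Interface(fn=echo, inputs="text", outputs="text")
txt_c = gr.Interface(fn=echo, inputs="text", outputs="text")

inner = gr.TabbedInterface([txt_a, txt_b], tab_names=["first", "second"])
outer = gr.TabbedInterface([inner, txt_c], tab_names=["group", "single"])

if __name__ == "__main__":
    outer.launch()
```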

README.md

Lines changed: 4 additions & 3 deletions
@@ -12,7 +12,7 @@ The goal of the project - to create the easiest possible application to use neur
 
 ### Text: <img width="1118" alt="1" src="https://github.com/user-attachments/assets/d0947d54-eb8b-4f20-986b-579f9652ff95">
 
-### Image: <img width="1118" alt="2" src="https://github.com/user-attachments/assets/39506653-23e1-432b-b250-362146a693a5">
+### Image: <img width="1112" alt="2" src="https://github.com/user-attachments/assets/02085575-1ae3-4e71-93eb-499c3103623a">
 
 ### Video: <img width="1115" alt="3" src="https://github.com/user-attachments/assets/032b248e-1ea8-4661-8a96-267e4a9ef01c">

@@ -31,7 +31,7 @@ The goal of the project - to create the easiest possible application to use neur
 * Flexible and optimized interface (By Gradio)
 * Debug logging to logs from `Install` and `Update` files
 * Support for Transformers and llama.cpp models (LLM)
-* Support for diffusers and safetensors models (StableDiffusion) - txt2img, img2img, depth2img, marigold, pix2pix, controlnet, upscale (latent), upscale (SUPIR), refiner, inpaint, outpaint, gligen, diffedit, animatediff, hotshot-xl, video, ldm3d, sd3, cascade, t2i-ip-adapter, ip-adapter-faceid and riffusion tabs
+* Support for diffusers and safetensors models (StableDiffusion) - txt2img, img2img, depth2img, marigold, pix2pix, controlnet, upscale (latent), upscale (SUPIR), refiner, inpaint, outpaint, gligen, diffedit, blip-diffusion, animatediff, hotshot-xl, video, ldm3d, sd3, cascade, t2i-ip-adapter, ip-adapter-faceid and riffusion tabs
 * Support for stable-diffusion-cpp models for FLUX
 * Support of additional models for image generation: Kandinsky (txt2img, img2img, inpaint), Flux (with LoRA support), HunyuanDiT (txt2img, controlnet), Lumina-T2X, Kolors (txt2img with LoRA support, img2img, ip-adapter-plus), AuraFlow (with LoRA and AuraSR support), Würstchen, DeepFloydIF (txt2img, img2img, inpaint), PixArt and PlaygroundV2.5
 * Support Extras with Rembg, CodeFormer, PixelOE, DDColor, DownScale, Format changer, FaceSwap (Roop) and Upscale (Real-ESRGAN) models for image, video and audio
@@ -40,7 +40,7 @@ The goal of the project - to create the easiest possible application to use neur
 * Support AudioLDM 2 (Models: audio and music)
 * Supports TTS and Whisper models (For LLM and TTS-STT)
 * Support MMS for text-to-speech and speech-to-text
-* Supports Lora, Textual inversion (embedding), Vae, MagicPrompt, Img2img, Depth, Marigold, Pix2Pix, Controlnet, Upscalers (latent and SUPIR), Refiner, Inpaint, Outpaint, GLIGEN, DiffEdit, AnimateDiff, HotShot-XL, Videos, LDM3D, SD3, Cascade, T2I-IP-ADAPTER, IP-Adapter-FaceID and Riffusion models (For StableDiffusion)
+* Supports Lora, Textual inversion (embedding), Vae, MagicPrompt, Img2img, Depth, Marigold, Pix2Pix, Controlnet, Upscalers (latent and SUPIR), Refiner, Inpaint, Outpaint, GLIGEN, DiffEdit, BLIP-Diffusion, AnimateDiff, HotShot-XL, Videos, LDM3D, SD3, Cascade, T2I-IP-ADAPTER, IP-Adapter-FaceID and Riffusion models (For StableDiffusion)
 * Support Multiband Diffusion model (For AudioCraft)
 * Support LibreTranslate (Local API) and SeamlessM4Tv2 for language translations
 * Support ModelScope, ZeroScope 2, CogVideoX and Latte for video generation
@@ -213,6 +213,7 @@ First of all, I want to thank the developers of [PyCharm](https://www.jetbrains.
 * [SUPIR](https://github.com/Fanghua-Yu/SUPIR/blob/master/LICENSE)
 * [MagicPrompt](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/mit.md)
 * [Marigold](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
+* [BLIP-Diffusion](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md)
 
 #### These third-party repository codes are also used in my project:

Wikies/WikiEN.md

Lines changed: 15 additions & 5 deletions
@@ -50,7 +50,7 @@
 
 # Image:
 
-### StableDiffusion - has twenty three sub-tabs:
+### StableDiffusion - has twenty four sub-tabs:
 
 #### txt2img:

@@ -162,9 +162,19 @@
 
 #### diffedit:
 
-1) Enter your Source Prompt and Source Negative Prompt for image masking (+ and - for prompt weighting)
-2) Enter your Target Prompt and Target Negative Prompt for image diffediting (+ and - for prompt weighting)
-3) Click the `Submit` button to get the generated image
+1) Enter your Source Prompt and Source Negative Prompt for image masking
+2) Enter your Target Prompt and Target Negative Prompt for image diff-editing
+3) Upload the initial image
+4) Set up the model according to the parameters you need
+5) Click the `Submit` button to get the generated image
+
+#### blip-diffusion:
+
+1) Enter your Prompt
+2) Upload the initial image
+3) Enter your Conditioning and Target Subjects
+4) Set up the model according to the parameters you need
+5) Click the `Submit` button to get the generated image
 
 #### animatediff:

@@ -478,7 +488,7 @@
 * LLM models can be taken from [HuggingFace](https://huggingface.co/models) or from ModelDownloader inside interface
 * StableDiffusion, vae, inpaint, embedding and lora models can be taken from [CivitAI](https://civitai.com/models) or from ModelDownloader inside interface
 * RVC models can be taken from [VoiceModels](https://voice-models.com)
-* StableAudio, AudioCraft, AudioLDM 2, TTS, Whisper, MMS, SeamlessM4Tv2, Wav2Lip, LivePortrait, SunoBark, MoonDream2, Upscalers (Latent and Real-ESRGAN), Refiner, GLIGEN, DiffEdit, Depth, Marigold, Pix2Pix, Controlnet, AnimateDiff, HotShot-XL, Videos, LDM3D, SD3, Cascade, T2I-IP-ADAPTER, IP-Adapter-FaceID, Riffusion, Rembg, Roop, CodeFormer, DDColor, PixelOE, Real-ESRGAN, StableFast3D, Shap-E, SV34D, Zero123Plus, UVR, Demucs, Kandinsky, Flux, HunyuanDiT, Lumina-T2X, Kolors, AuraFlow, AuraSR, Würstchen, DeepFloydIF, PixArt, PlaygroundV2.5, ModelScope, ZeroScope 2, CogVideoX, MagicPrompt, Latte and Multiband diffusion models are downloads automatically in *inputs* folder when are they used
+* StableAudio, AudioCraft, AudioLDM 2, TTS, Whisper, MMS, SeamlessM4Tv2, Wav2Lip, LivePortrait, SunoBark, MoonDream2, Upscalers (Latent and Real-ESRGAN), Refiner, GLIGEN, DiffEdit, BLIP-Diffusion, Depth, Marigold, Pix2Pix, Controlnet, AnimateDiff, HotShot-XL, Videos, LDM3D, SD3, Cascade, T2I-IP-ADAPTER, IP-Adapter-FaceID, Riffusion, Rembg, Roop, CodeFormer, DDColor, PixelOE, Real-ESRGAN, StableFast3D, Shap-E, SV34D, Zero123Plus, UVR, Demucs, Kandinsky, Flux, HunyuanDiT, Lumina-T2X, Kolors, AuraFlow, AuraSR, Würstchen, DeepFloydIF, PixArt, PlaygroundV2.5, ModelScope, ZeroScope 2, CogVideoX, MagicPrompt, Latte and Multiband diffusion models are downloads automatically in *inputs* folder when are they used
 * You can take voices anywhere. Record yours or take a recording from the Internet. Or just use those that are already in the project. The main thing is that it is pre-processed!
 
 ## Known Bugs:
