# ═══════════════════════════════════════════════════════════════════════════════
# ⚡ CRITTIKS GLOBAL - ZEROGPU CREATIVE STUDIO v3.0 ULTIMATE
# ═══════════════════════════════════════════════════════════════════════════════
# BLEEDING-EDGE MODELS:
# - Stable Diffusion 3.5 Large: high-quality 28-step image generation
# - Wan 2.2 I2V 14B: FP8 quantized + AoTI + Lightning LoRA (4-step video!)
# - MusicGen: AI music generation
# ═══════════════════════════════════════════════════════════════════════════════

import os
import gc
import spaces
import torch
import numpy as np
from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageEnhance
import textwrap
import tempfile
import random
import gradio as gr

# ═══════════════════════════════════════════════════════════════════════════════
# 📦 GLOBAL MODEL HOLDERS
# ═══════════════════════════════════════════════════════════════════════════════
music_model = None
music_processor = None
video_pipe = None
video_loaded = False

# ═══════════════════════════════════════════════════════════════════════════════
# 🎨 CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════════
MAX_SEED = np.iinfo(np.int32).max

# Video settings for Wan 2.2
VIDEO_MAX_DIM = 832
VIDEO_MIN_DIM = 480
VIDEO_SQUARE_DIM = 640
VIDEO_MULTIPLE_OF = 16
VIDEO_FIXED_FPS = 16
VIDEO_MIN_FRAMES = 8
VIDEO_MAX_FRAMES = 80

# Image settings for Stable Diffusion 3.5 Large
IMAGE_DEFAULT_SIZE = 1024


# Sample preview image
def create_sample_image():
    """Create a sample gradient image for preview"""
    img = Image.new('RGB', (512, 512))
    for y in range(512):
        for x in range(512):
            r = int(50 + (x / 512) * 100)
            g = int(30 + (y / 512) * 80)
            b = int(80 + ((x + y) / 1024) * 120)
            img.putpixel((x, y), (r, g, b))
    return img


SAMPLE_IMAGE = create_sample_image()

# ═══════════════════════════════════════════════════════════════════════════════
# 🎨 STYLE & OPTION CONFIGURATIONS
# ═══════════════════════════════════════════════════════════════════════════════
OCCASIONS = [
    "Christmas", "New Year", "Birthday", "Wedding", "Valentine's Day",
    "Anniversary", "Graduation", "Thank You", "Diwali", "Eid",
    "Mother's Day", "Father's Day", "Halloween", "Easter"
]

STYLES = {
    "Festive Magic": "magical christmas scene, golden lights, snow, warm cozy atmosphere, cinematic",
    "Luxury Gold": "luxurious black and gold, elegant marble, premium design, sophisticated",
    "Soft Dreams": "soft pastel colors, dreamy clouds, ethereal glow, romantic",
    "Neon Future": "cyberpunk neon lights, futuristic city, purple cyan pink, sci-fi",
    "Nature Beauty": "beautiful nature, flowers, green forest, sunlight rays, peaceful",
    "Ocean Calm": "serene ocean sunset, beach, turquoise water, golden hour",
    "Cosmic Galaxy": "cosmic nebula, stars, aurora borealis, space, mystical",
    "Dark Elegance": "dark moody aesthetic, silver accents, dramatic lighting, premium"
}

OVERLAY_STYLES = {
    "None": None,
    "Frosted Glass": {"color": (20, 30, 50), "opacity": 0.6},
    "Dark Luxury": {"color": (0, 0, 0), "opacity": 0.7},
    "Light Dream": {"color": (255, 255, 255), "opacity": 0.4},
    "Neon Glow": {"color": (30, 0, 60), "opacity": 0.65},
    "Forest Green": {"color": (20, 50, 30), "opacity": 0.6},
    "Sunset Warm": {"color": (70, 35, 20), "opacity": 0.6},
    "Ocean Blue": {"color": (20, 40, 70), "opacity": 0.55}
}

MUSIC_STYLES = {
    "Peaceful Piano": "peaceful piano melody, emotional, cinematic, gentle",
    "Acoustic Guitar": "warm acoustic guitar, soft strumming, heartfelt melody",
    "Epic Orchestra": "epic orchestral music, cinematic strings, powerful",
    "Lo-Fi Chill": "lofi hip hop beats, relaxing, chill vibes, jazzy",
    "Ambient Space": "ambient space music, ethereal pads, dreamy atmosphere",
    "Holiday Magic": "christmas holiday music, bells, magical festive cheer"
}
# ═══════════════════════════════════════════════════════════════════════════════
# 🎨 GLASS OVERLAY FUNCTION
# ═══════════════════════════════════════════════════════════════════════════════
def add_glass_overlay(image, text, footer, overlay_style="Frosted Glass", font_size=42, enable_overlay=True):
    """Add glass overlay with customizable text size"""
    if image.mode != 'RGB':
        image = image.convert('RGB')

    width, height = image.size
    result = image.copy()
    draw = ImageDraw.Draw(result)

    style = OVERLAY_STYLES.get(overlay_style)

    # Calculate panel dimensions
    margin = int(width * 0.04)
    panel_height = int(height * 0.28)
    panel_top = height - panel_height - margin
    panel_bottom = panel_top + panel_height
    panel_left = margin
    panel_right = width - margin

    # Apply overlay panel if enabled and style exists
    if enable_overlay and style is not None:
        region = image.crop((panel_left, panel_top, panel_right, panel_bottom))
        blurred = region.filter(ImageFilter.GaussianBlur(radius=20))
        blurred = ImageEnhance.Brightness(blurred).enhance(0.6)
        color_overlay = Image.new('RGB', (panel_right - panel_left, panel_height), style["color"])
        blended = Image.blend(blurred, color_overlay, style["opacity"])
        result.paste(blended, (panel_left, panel_top))
        draw = ImageDraw.Draw(result)
        # 4-pixel white border around the glass panel
        for i in range(4):
            draw.rectangle(
                [(panel_left + i, panel_top + i), (panel_right - i, panel_bottom - i)],
                outline=(255, 255, 255)
            )

    # Load font with custom size
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", int(font_size))
        small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", int(font_size * 0.45))
    except OSError:
        font = ImageFont.load_default()
        small_font = font

    # Calculate text wrapping
    avg_char_width = font_size * 0.55
    max_chars = max(15, int((panel_right - panel_left - 40) / avg_char_width))
    lines = textwrap.wrap(text, width=max_chars)
    line_height = int(font_size * 1.25)
    total_height = len(lines) * line_height
    text_y = panel_top + (panel_height - total_height) // 2 - 10

    # Draw text with shadow
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        text_x = (width - (bbox[2] - bbox[0])) // 2
        draw.text((text_x + 2, text_y + 2), line, font=font, fill=(0, 0, 0))
        draw.text((text_x + 1, text_y + 1), line, font=font, fill=(0, 0, 0))
        draw.text((text_x, text_y), line, font=font, fill=(255, 255, 255))
        text_y += line_height

    # Draw footer
    if footer:
        bbox = draw.textbbox((0, 0), footer, font=small_font)
        footer_x = (width - (bbox[2] - bbox[0])) // 2
        draw.text((footer_x + 1, panel_bottom - 30 + 1), footer, font=small_font, fill=(0, 0, 0))
        draw.text((footer_x, panel_bottom - 30), footer, font=small_font, fill=(200, 200, 200))

    return result
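# Illustrative standalone usage of add_glass_overlay (hypothetical paths and text,
# not part of the app flow; the Gradio UI below calls it the same way):
#
#   bg = Image.open("background.png")
#   card = add_glass_overlay(
#       bg, "Happy Birthday!", "For Maya | Crittiks Global",
#       overlay_style="Dark Luxury", font_size=48,
#   )
#   card.save("card.png")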
# ═══════════════════════════════════════════════════════════════════════════════
# 👁️ LIVE PREVIEW (No GPU)
# ═══════════════════════════════════════════════════════════════════════════════
def update_preview(message, overlay_style, font_size, enable_overlay, recipient):
    """Generate live preview with current settings"""
    if not message:
        message = "Your message here..."
    if not recipient:
        recipient = "Friend"

    footer = f"For {recipient} | Crittiks Global"
    preview = add_glass_overlay(
        SAMPLE_IMAGE.copy(), message, footer, overlay_style, font_size, enable_overlay
    )
    return preview


# ═══════════════════════════════════════════════════════════════════════════════
# 🖼️ STABLE DIFFUSION 3.5 LARGE - High Quality Image Generation
# ═══════════════════════════════════════════════════════════════════════════════
image_pipe = None


@spaces.GPU
def generate_image_gpu(prompt, style_desc, height=1024, width=1024, steps=28, seed=None, randomize_seed=True):
    """Generate image using Stable Diffusion 3.5 Large"""
    global image_pipe
    from diffusers import StableDiffusion3Pipeline

    if image_pipe is None:
        print("Loading Stable Diffusion 3.5 Large...")
        image_pipe = StableDiffusion3Pipeline.from_pretrained(
            "stabilityai/stable-diffusion-3.5-large",
            torch_dtype=torch.bfloat16,
        )
        image_pipe.to("cuda")
        print("SD 3.5 Large loaded!")

    # Handle seed
    if randomize_seed or seed is None:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    generator = torch.Generator("cuda").manual_seed(int(seed))

    full_prompt = f"{prompt}, {style_desc}, masterpiece, ultra detailed, 8k quality, cinematic lighting, professional photography, NO TEXT NO WORDS NO LETTERS"

    image = image_pipe(
        prompt=full_prompt,
        height=int(height),
        width=int(width),
        num_inference_steps=int(steps),
        guidance_scale=3.5,
        generator=generator,
    ).images[0]

    return image, seed


# ═══════════════════════════════════════════════════════════════════════════════
# 🎵 MUSICGEN: AI Music Generation
# ═══════════════════════════════════════════════════════════════════════════════
@spaces.GPU
def generate_music_gpu(prompt):
    """Generate music using MusicGen"""
    global music_model, music_processor
    from transformers import AutoProcessor, MusicgenForConditionalGeneration
    import scipy.io.wavfile as wavfile

    if music_model is None:
        print("Loading MusicGen...")
        music_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        music_model = MusicgenForConditionalGeneration.from_pretrained(
            "facebook/musicgen-small", torch_dtype=torch.float16
        )
        music_model.to("cuda")
        print("MusicGen loaded!")

    inputs = music_processor(text=[prompt], padding=True, return_tensors="pt").to("cuda")
    audio_values = music_model.generate(**inputs, max_new_tokens=256, do_sample=True)

    sampling_rate = music_model.config.audio_encoder.sampling_rate
    audio_data = audio_values[0, 0].cpu().numpy()
    audio_data = audio_data / np.max(np.abs(audio_data))
    audio_data = (audio_data * 32767).astype(np.int16)

    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    wavfile.write(temp_file.name, sampling_rate, audio_data)
    return temp_file.name
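# Note (approximation): MusicGen emits roughly 50 audio tokens per second of audio,
# so max_new_tokens=256 above yields a clip of about 5 seconds. A hedged sketch for
# longer clips -- raise the token budget (GPU time grows roughly in proportion):
#
#   audio_values = music_model.generate(**inputs, max_new_tokens=512, do_sample=True)  # ~10 s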
# ═══════════════════════════════════════════════════════════════════════════════
# 🎬 WAN 2.2 I2V: FP8 + AoTI + Lightning LoRA (4-Step Video!)
# ═══════════════════════════════════════════════════════════════════════════════
def resize_image_for_video(image: Image.Image) -> Image.Image:
    """Resize image for Wan 2.2 video generation"""
    width, height = image.size

    if width == height:
        return image.resize((VIDEO_SQUARE_DIM, VIDEO_SQUARE_DIM), Image.LANCZOS)

    aspect_ratio = width / height
    MAX_ASPECT_RATIO = VIDEO_MAX_DIM / VIDEO_MIN_DIM
    MIN_ASPECT_RATIO = VIDEO_MIN_DIM / VIDEO_MAX_DIM

    image_to_resize = image
    if aspect_ratio > MAX_ASPECT_RATIO:
        crop_width = int(round(height * MAX_ASPECT_RATIO))
        left = (width - crop_width) // 2
        image_to_resize = image.crop((left, 0, left + crop_width, height))
    elif aspect_ratio < MIN_ASPECT_RATIO:
        crop_height = int(round(width / MIN_ASPECT_RATIO))
        top = (height - crop_height) // 2
        image_to_resize = image.crop((0, top, width, top + crop_height))

    if width > height:
        target_w = VIDEO_MAX_DIM
        target_h = int(round(target_w / aspect_ratio))
    else:
        target_h = VIDEO_MAX_DIM
        target_w = int(round(target_h * aspect_ratio))

    final_w = round(target_w / VIDEO_MULTIPLE_OF) * VIDEO_MULTIPLE_OF
    final_h = round(target_h / VIDEO_MULTIPLE_OF) * VIDEO_MULTIPLE_OF
    final_w = max(VIDEO_MIN_DIM, min(VIDEO_MAX_DIM, final_w))
    final_h = max(VIDEO_MIN_DIM, min(VIDEO_MAX_DIM, final_h))

    return image_to_resize.resize((final_w, final_h), Image.LANCZOS)


def get_num_frames(duration_seconds: float):
    """Calculate number of frames from duration"""
    return 1 + int(np.clip(
        int(round(duration_seconds * VIDEO_FIXED_FPS)),
        VIDEO_MIN_FRAMES,
        VIDEO_MAX_FRAMES,
    ))
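# Worked example of the sizing/frame math above (derived from the module constants,
# for illustration only):
# - a 1024x768 input (4:3) needs no cropping; the long side maps to 832 and the
#   short side to round(832 / 1.333) = 624, both already multiples of 16 -> 832x624
# - duration_seconds=3.0 gives 1 + round(3.0 * 16) = 49 frames at 16 fps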
def load_video_pipeline():
    """Load Wan 2.2 I2V pipeline with FP8 quantization and AoTI"""
    global video_pipe, video_loaded

    if video_loaded:
        return video_pipe

    print("Loading Wan 2.2 I2V 14B with FP8 + AoTI optimization...")
    from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
    from diffusers.models.transformers.transformer_wan import WanTransformer3DModel

    MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

    # Load pipeline with bf16 transformers
    video_pipe = WanImageToVideoPipeline.from_pretrained(
        MODEL_ID,
        transformer=WanTransformer3DModel.from_pretrained(
            'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
            subfolder='transformer',
            torch_dtype=torch.bfloat16,
            device_map='cuda',
        ),
        transformer_2=WanTransformer3DModel.from_pretrained(
            'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
            subfolder='transformer_2',
            torch_dtype=torch.bfloat16,
            device_map='cuda',
        ),
        torch_dtype=torch.bfloat16,
    ).to('cuda')

    # Load Lightning LoRA for fast inference
    print("Loading Lightning LoRA for 4-step inference...")
    video_pipe.load_lora_weights(
        "Kijai/WanVideo_comfy",
        weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
        adapter_name="lightx2v"
    )
    video_pipe.load_lora_weights(
        "Kijai/WanVideo_comfy",
        weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
        adapter_name="lightx2v_2",
        load_into_transformer_2=True
    )
    video_pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
    video_pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
    video_pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
    video_pipe.unload_lora_weights()

    # Apply FP8 quantization
    print("Applying FP8 quantization...")
    from torchao.quantization import quantize_
    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Int8WeightOnlyConfig

    quantize_(video_pipe.text_encoder, Int8WeightOnlyConfig())
    quantize_(video_pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
    quantize_(video_pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())

    # Load AoTI compiled blocks
    print("Loading AoTI compiled blocks...")
    import aoti
    aoti.aoti_blocks_load(video_pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
    aoti.aoti_blocks_load(video_pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')

    video_loaded = True
    print("Wan 2.2 I2V fully loaded with all optimizations!")
    return video_pipe


# Dynamic duration calculation
def get_video_duration(image, duration_seconds, steps):
    """Estimate GPU duration (seconds) based on parameters.

    NOTE: currently not wired into @spaces.GPU(duration=...); to use it as a dynamic
    ZeroGPU budget, its signature would need to mirror generate_video_gpu's.
    """
    BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
    BASE_STEP_DURATION = 15

    resized = resize_image_for_video(image)
    width, height = resized.size
    frames = get_num_frames(duration_seconds)

    factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
    step_duration = BASE_STEP_DURATION * factor ** 1.5
    return 30 + int(steps) * step_duration
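# Worked example of the estimate above: a 3-second clip at 832x624 gives 49 frames,
# so factor = 49 / 81 ~= 0.60, step_duration ~= 15 * 0.60**1.5 ~= 7 s, and with
# steps=6 the budget comes to roughly 30 + 6 * 7 ~= 72 seconds of GPU time.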
@spaces.GPU
def generate_video_gpu(image, text, footer, overlay_style, font_size, enable_overlay,
                       duration_seconds=3.0, steps=6, seed=None, randomize_seed=True):
    """Generate video using Wan 2.2 I2V with text overlay"""
    from diffusers.utils import export_to_video

    if image is None:
        raise gr.Error("Please generate an image first!")

    # Load pipeline
    pipe = load_video_pipeline()

    # Handle seed
    if randomize_seed or seed is None:
        current_seed = random.randint(0, MAX_SEED)
    else:
        current_seed = int(seed)

    # Resize image
    resized_image = resize_image_for_video(image)
    num_frames = get_num_frames(duration_seconds)

    # Video generation prompt
    video_prompt = "make this image come alive, cinematic motion, smooth animation, natural movement"
    negative_prompt = "low quality, worst quality, motion artifacts, jitter, unstable, blurry, static"

    # Generate video frames
    output_frames = pipe(
        image=resized_image,
        prompt=video_prompt,
        negative_prompt=negative_prompt,
        height=resized_image.height,
        width=resized_image.width,
        num_frames=num_frames,
        guidance_scale=1.0,
        guidance_scale_2=1.0,
        num_inference_steps=int(steps),
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    ).frames[0]

    # Apply text overlay to each frame
    processed_frames = []
    for frame in output_frames:
        if isinstance(frame, np.ndarray):
            frame = Image.fromarray(frame)
        # Scale font for video dimensions
        video_font_size = font_size * (resized_image.width / 1024)
        frame_with_text = add_glass_overlay(
            frame, text, footer, overlay_style, video_font_size, enable_overlay
        )
        processed_frames.append(frame_with_text)

    # Export video
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    export_to_video(processed_frames, temp_file.name, fps=VIDEO_FIXED_FPS)

    # Cleanup
    gc.collect()
    torch.cuda.empty_cache()

    return temp_file.name, current_seed


# ═══════════════════════════════════════════════════════════════════════════════
# 🎬 MAIN GENERATION FUNCTION
# ═══════════════════════════════════════════════════════════════════════════════
def generate_all(recipient, occasion, visual_style, message,
                 overlay_style, font_size, enable_overlay,
                 enable_music, music_style,
                 enable_video, video_duration, video_steps,
                 progress=gr.Progress()):

    if not recipient:
        recipient = "Friend"
    if not message:
        message = f"Wishing you a wonderful {occasion}!"

    footer = f"For {recipient} | Crittiks Global"
    status = []
    seed_used = None

    # === IMAGE (Stable Diffusion 3.5 Large, 28 steps) ===
    progress(0.1, desc="Generating image with Stable Diffusion 3.5 Large (28 steps)...")
    try:
        style_desc = STYLES.get(visual_style, "beautiful elegant cinematic")
        img_prompt = f"Beautiful {occasion} greeting card background, artistic composition"
        image, seed_used = generate_image_gpu(img_prompt, style_desc)
        status.append("Image OK")
    except Exception as e:
        return None, None, None, f"Image error: {str(e)}", None

    # === OVERLAY ===
    progress(0.3, desc="Adding text overlay...")
    final_image = add_glass_overlay(image, message, footer, overlay_style, font_size, enable_overlay)
    status.append("Overlay OK")

    # === MUSIC (MusicGen) ===
    audio_path = None
    if enable_music:
        progress(0.4, desc="Generating music with MusicGen...")
        try:
            music_prompt = MUSIC_STYLES.get(music_style, "peaceful ambient music")
            audio_path = generate_music_gpu(music_prompt)
            status.append("Music OK")
        except Exception as e:
            status.append(f"Music: {str(e)[:30]}")

    # === VIDEO (Wan 2.2 with FP8+AoTI - 4-6 steps!) ===
    video_path = None
    if enable_video:
        progress(0.6, desc="Generating video with Wan 2.2 I2V (FP8 + Lightning LoRA)...")
        try:
            video_path, _ = generate_video_gpu(
                image, message, footer, overlay_style, font_size, enable_overlay,
                video_duration, video_steps
            )
            status.append("Video OK")
        except Exception as e:
            status.append(f"Video: {str(e)[:50]}")

    progress(1.0, desc="Complete!")
    final_status = " | ".join(status) + f" | Seed: {seed_used}"
    return final_image, audio_path, video_path, final_status, seed_used


# ═══════════════════════════════════════════════════════════════════════════════
# 🖥️ GRADIO UI
# ═══════════════════════════════════════════════════════════════════════════════
custom_theme = gr.themes.Soft(
    primary_hue="cyan",
    secondary_hue="purple",
    neutral_hue="slate",
).set(
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
)

with gr.Blocks(
    title="Crittiks Global | ZeroGPU Studio v3.0",
    theme=custom_theme
) as demo:
    gr.Markdown("""
    # Crittiks Global - ZeroGPU Studio v3.0
    ### Premium AI Image, Video & Music Generation
    **Models:** Stable Diffusion 3.5 Large | Wan 2.2 I2V 14B | MusicGen
    """)

    with gr.Row():
        # === LEFT PANEL ===
        with gr.Column(scale=1):
            gr.Markdown("### Card Settings")
            recipient = gr.Textbox(label="Recipient", placeholder="Who is this for?", value="")
            occasion = gr.Dropdown(choices=OCCASIONS, label="Occasion", value="Christmas")
            visual_style = gr.Dropdown(choices=list(STYLES.keys()), label="Visual Style", value="Festive Magic")
            message = gr.Textbox(label="Message", placeholder="Your greeting message...", lines=2, value="")

            gr.Markdown("### Text & Overlay")
            enable_overlay = gr.Checkbox(label="Enable Glass Overlay", value=True)
            overlay_style = gr.Dropdown(
                choices=list(OVERLAY_STYLES.keys()), label="Overlay Style", value="Frosted Glass"
            )
            font_size = gr.Slider(minimum=24, maximum=72, step=2, value=42, label="Text Size")

            gr.Markdown("### Media Options")
            enable_music = gr.Checkbox(label="Generate Music (MusicGen)", value=False)
            music_style = gr.Dropdown(
                choices=list(MUSIC_STYLES.keys()), label="Music Style", value="Peaceful Piano"
            )

            gr.Markdown("### Video Settings (Wan 2.2 I2V)")
            enable_video = gr.Checkbox(label="Generate Video (4-6 step fast!)", value=False)
            video_duration = gr.Slider(
                minimum=1.0, maximum=5.0, step=0.5, value=3.0, label="Video Duration (seconds)"
            )
            video_steps = gr.Slider(
                minimum=4, maximum=12, step=1, value=6, label="Video Steps (4-6 recommended)"
            )

            generate_btn = gr.Button("GENERATE", variant="primary", size="lg")
            seed_output = gr.Number(label="Seed Used", interactive=False)
        # === RIGHT PANEL ===
        with gr.Column(scale=2):
            gr.Markdown("### Live Preview")
            preview_image = gr.Image(label="Preview (updates live)", type="pil", height=200)

            gr.Markdown("### Generated Content")
            with gr.Tabs():
                with gr.TabItem("Card"):
                    output_image = gr.Image(label="Your Card", type="pil", height=450)
                with gr.TabItem("Video"):
                    output_video = gr.Video(label="AI Video", height=450, autoplay=True)
                with gr.TabItem("Music"):
                    output_audio = gr.Audio(label="Generated Music", type="filepath")

            output_status = gr.Textbox(label="Status")

    # === EVENT HANDLERS ===
    preview_inputs = [message, overlay_style, font_size, enable_overlay, recipient]
    message.change(fn=update_preview, inputs=preview_inputs, outputs=preview_image)
    overlay_style.change(fn=update_preview, inputs=preview_inputs, outputs=preview_image)
    font_size.change(fn=update_preview, inputs=preview_inputs, outputs=preview_image)
    enable_overlay.change(fn=update_preview, inputs=preview_inputs, outputs=preview_image)
    recipient.change(fn=update_preview, inputs=preview_inputs, outputs=preview_image)

    generate_btn.click(
        fn=generate_all,
        inputs=[
            recipient, occasion, visual_style, message,
            overlay_style, font_size, enable_overlay,
            enable_music, music_style,
            enable_video, video_duration, video_steps
        ],
        outputs=[output_image, output_audio, output_video, output_status, seed_output]
    )

    demo.load(fn=update_preview, inputs=preview_inputs, outputs=preview_image)

    gr.Markdown("""
    ---
    **Performance:**
    - Image: ~10-15 seconds (SD 3.5 Large, 28 steps)
    - Video: ~30-60 seconds (Wan 2.2 I2V)
    - First generation loads models, then faster

    **Model Credits:**
    - [stabilityai/stable-diffusion-3.5-large](https://huggingface.co/stabilityai/stable-diffusion-3.5-large)
    - [Wan-AI/Wan2.2-I2V-A14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
    """)

demo.queue()

if __name__ == "__main__":
    # Requires a Hugging Face login for ZeroGPU quota
    demo.launch()