| """ | |
| BubbleScribe - AI Manga & Comic Translator | |
| Translate manga/comics using GLM-4.6V for OCR + Translation and LaMa for inpainting. | |
| Optimized for NVIDIA T4 GPU | |
| """ | |
import gradio as gr
import torch
import os
import json
import base64
import re
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
import threading

# ============================================================
# HARDWARE OPTIMIZATION: NVIDIA T4 (16GB VRAM)
# ============================================================
# Enable CUDA optimizations
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
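# Note: cudnn.benchmark speeds up fixed-size convolutions after a warmup
# pass. The TF32 flags only take effect on Ampere or newer GPUs; on a T4
# (Turing) they are harmless no-ops, kept for portability across hardware.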
# Thread pool for parallel batch processing
executor = ThreadPoolExecutor(max_workers=4)

# ============================================================
# MODEL PRELOADING (Load at startup for faster inference)
# ============================================================
print("🚀 BubbleScribe starting up...")
print("   Hardware: NVIDIA T4 (16GB VRAM)")
print("   OCR & Translation: GLM-4.6V (API)")
print("   Inpainting: LaMa (GPU)")

# Load the LaMa model at startup so the first request doesn't pay the cost
print("📦 Loading LaMa model...")
from simple_lama_inpainting import SimpleLama
lama_model = SimpleLama()
print("✅ LaMa model loaded and ready!")

def load_lama():
    """Return the LaMa model (already loaded at startup)."""
    return lama_model

# ============================================================
# FONT CACHING
# ============================================================
_font_cache = {}
_font_lock = threading.Lock()

def get_font(size: int):
    """Get a font with caching (double-checked locking)."""
    cache_key = size
    # Fast path: dict reads are atomic under CPython's GIL
    if cache_key in _font_cache:
        return _font_cache[cache_key]
    with _font_lock:
        if cache_key in _font_cache:
            return _font_cache[cache_key]
        font_paths = [
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
            "/usr/share/fonts/truetype/noto/NotoSansCJK-Bold.ttc",  # CJK support
            "/usr/share/fonts/opentype/noto/NotoSansCJK-Bold.ttc",
            "C:/Windows/Fonts/arial.ttf",
            "C:/Windows/Fonts/arialbd.ttf",
        ]
        for path in font_paths:
            if os.path.exists(path):
                try:
                    font = ImageFont.truetype(path, size)
                    _font_cache[cache_key] = font
                    return font
                except OSError:
                    continue
        font = ImageFont.load_default()
        _font_cache[cache_key] = font
        return font

# ============================================================
# GLM-4.6V CLIENT (Z.ai API)
# ============================================================
_glm_client = None

def get_glm_client():
    """Get or create the GLM client (lazy singleton)."""
    global _glm_client
    if _glm_client is None:
        api_key = os.environ.get("GLM_API_KEY")
        if not api_key:
            return None
        _glm_client = OpenAI(api_key=api_key, base_url="https://api.z.ai/api/paas/v4")
    return _glm_client
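# Note: creation isn't lock-guarded; a race at startup would just build two
# equivalent clients, which is benign here.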

# ============================================================
# IMAGE UTILITIES
# ============================================================
def encode_image_base64(image: Image.Image, max_size: int = 2048) -> str:
    """Convert a PIL Image to a base64 string, resizing if needed."""
    # Resize if too large to save bandwidth and API costs
    if max(image.size) > max_size:
        ratio = max_size / max(image.size)
        new_size = (int(image.width * ratio), int(image.height * ratio))
        image = image.resize(new_size, Image.Resampling.LANCZOS)
    buffered = BytesIO()
    image.save(buffered, format="PNG", optimize=True)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
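# Illustrative round trip (assumes a PIL image `img`; not executed at import):
#   data = encode_image_base64(img)
#   restored = Image.open(BytesIO(base64.b64decode(data)))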

def scale_bbox(bbox: list, original_size: tuple, processed_size: tuple) -> list:
    """Scale bounding box coordinates back up if the image was resized."""
    if original_size == processed_size:
        return bbox
    scale_x = original_size[0] / processed_size[0]
    scale_y = original_size[1] / processed_size[1]
    return [
        int(bbox[0] * scale_x),
        int(bbox[1] * scale_y),
        int(bbox[2] * scale_x),
        int(bbox[3] * scale_y)
    ]
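# Example: a 4096x3072 page sent to the API as 2048x1536 has scale factors
# of 2.0, so a detected bbox [100, 50, 200, 80] maps back to
# [200, 100, 400, 160] on the original page.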

# ============================================================
# JSON REPAIR (Handle malformed model responses)
# ============================================================
def repair_json(text: str) -> str:
    """Attempt to repair common JSON issues in LLM responses."""
    # Remove any markdown code fences
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'```\s*', '', text)
    # Escape raw newlines inside strings. The replacement must be r'\\n'
    # (a literal backslash + n); '\\n' would be interpreted by re.sub as a
    # newline and make this a no-op.
    text = re.sub(r'(?<!\\)\n(?=[^"]*"[^"]*(?:"[^"]*"[^"]*)*$)', r'\\n', text)
    # Remove trailing commas before ] or }
    text = re.sub(r',\s*([}\]])', r'\1', text)
    # Insert missing commas between adjacent objects
    text = re.sub(r'\}\s*\{', '},{', text)
    # Rough heuristic for unescaped quotes inside strings:
    # replace Japanese corner brackets with escaped ASCII quotes
    text = text.replace('「', '\\"').replace('」', '\\"')
    text = text.replace('『', '\\"').replace('』', '\\"')
    return text
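# Example repairs (illustrative):
#   '[{"a": 1,},]'      ->  '[{"a": 1}]'         (trailing commas)
#   '{"a": 1} {"b": 2}' ->  '{"a": 1},{"b": 2}'  (missing comma)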

def safe_parse_json(text: str) -> list:
    """Safely parse JSON with multiple fallback strategies."""
    # Strategy 1: Direct parse of the outermost array
    try:
        json_match = re.search(r'\[[\s\S]*\]', text)
        if json_match:
            return json.loads(json_match.group())
    except json.JSONDecodeError:
        pass
    # Strategy 2: Repair, then parse
    try:
        repaired = repair_json(text)
        json_match = re.search(r'\[[\s\S]*\]', repaired)
        if json_match:
            return json.loads(json_match.group())
    except json.JSONDecodeError:
        pass
    # Strategy 3: Extract individual objects
    try:
        objects = re.findall(r'\{[^{}]*\}', text)
        results = []
        for obj in objects:
            try:
                parsed = json.loads(repair_json(obj))
                if 'bbox' in parsed:
                    results.append(parsed)
            except json.JSONDecodeError:
                continue
        if results:
            return results
    except Exception:
        pass
    # Strategy 4: Manual extraction with regex
    try:
        results = []
        # Find bbox patterns and pair them with original/translated strings
        bbox_matches = re.findall(r'"bbox"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]', text)
        original_matches = re.findall(r'"original"\s*:\s*"([^"]*)"', text)
        translated_matches = re.findall(r'"translated"\s*:\s*"([^"]*)"', text)
        for i, bbox in enumerate(bbox_matches):
            result = {
                "bbox": [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])],
                "original": original_matches[i] if i < len(original_matches) else "",
                "translated": translated_matches[i] if i < len(translated_matches) else ""
            }
            results.append(result)
        if results:
            return results
    except Exception:
        pass
    return []
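# Every strategy returns a list of dicts shaped like:
#   {"bbox": [x1, y1, x2, y2], "original": "...", "translated": "..."}
# An empty list means all four strategies failed.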

# ============================================================
# DETECTION & TRANSLATION
# ============================================================
def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str, progress=gr.Progress()):
    """Use GLM-4.6V to detect text regions and translate them."""
    client = get_glm_client()
    if not client:
        return None, "Error: GLM_API_KEY not set in Space secrets"
    progress(0.1, desc="Analyzing image with GLM-4.6V...")
    original_size = image.size
    # Convert image to base64 (may resize for the API)
    img_base64 = encode_image_base64(image, max_size=2048)
    # Compute the processed size so bboxes can be scaled back later
    processed_size = original_size
    if max(original_size) > 2048:
        ratio = 2048 / max(original_size)
        processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))

    prompt = f"""You are a professional manga translator. Your task is to find and translate EVERY piece of {source_lang} text in this image.

IMPORTANT: Scan the ENTIRE image from top to bottom, left to right. Do NOT miss any text!

Find ALL of these text types:
- Main titles and headers
- Character names (above/below portraits)
- Speech bubbles and dialogue
- Narration boxes
- Sound effects (onomatopoeia)
- Labels, captions, descriptions
- Small text and annotations
- Relationship indicators (arrows, connections)
- ANY other visible {source_lang} text

For EACH text region found:
1. bbox: [x1, y1, x2, y2] pixel coordinates
2. original: the exact {source_lang} text
3. translated: natural {target_lang} translation

TRANSLATION GUIDELINES:
- Keep character names in ROMAJI (e.g., 田中太郎 → "Tanaka Tarou", not "Rice Field Middle Fat Man")
- Keep honorifics: -san, -kun, -chan, -sama, -sensei
- Sound effects: Keep original + add meaning (e.g., "ドキドキ" → "Dokidoki (heart pounding)")
- Make dialogue natural and conversational, not literal
- Preserve emotional tone and nuance
- For titles/roles, translate the meaning (e.g., 社長 → "President", 先生 → "Teacher")

Return a JSON array. Example:
[
{{"bbox": [100, 50, 200, 80], "original": "山田花子", "translated": "Yamada Hanako"}},
{{"bbox": [300, 100, 400, 130], "original": "よろしくお願いします", "translated": "Nice to meet you"}}
]

CRITICAL: Find at least 20-50 text regions. This image has many text elements. Scan every corner carefully. Include ALL small labels and character descriptions."""
    try:
        response = client.chat.completions.create(
            model="glm-4.6v-flash",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"}
                        },
                        {"type": "text", "text": prompt}
                    ]
                }
            ],
            max_tokens=8192
        )
        progress(0.4, desc="Processing response...")
        result_text = ""
        msg = response.choices[0].message
        # The model may answer in `content`, `reasoning_content`, or both
        if hasattr(msg, 'content') and msg.content:
            result_text = msg.content
        if hasattr(msg, 'reasoning_content') and msg.reasoning_content:
            result_text = result_text + "\n" + msg.reasoning_content if result_text else msg.reasoning_content
        # Strip GLM special tokens
        result_text = result_text.replace('<|begin_of_box|>', '').replace('<|end_of_box|>', '')
        print(f"📝 GLM-4.6V response length: {len(result_text)} chars")
        print(f"📝 GLM-4.6V response preview: {result_text[:500] if result_text else 'EMPTY'}...")
        # Parse JSON from the response with robust error handling
        detections = safe_parse_json(result_text)
        print(f"📝 Parsed detections: {len(detections)} items")
        if detections:
            # Scale bboxes back to the original size if the image was resized
            if original_size != processed_size:
                for det in detections:
                    if 'bbox' in det and len(det['bbox']) == 4:
                        det['bbox'] = scale_bbox(det['bbox'], original_size, processed_size)
            return detections, f"Found {len(detections)} text regions"
        else:
            # Return debug info when nothing was detected
            debug_info = (
                f"No text detected.\n\nDEBUG - API response ({len(result_text)} chars):\n"
                f"{result_text[:1000] if result_text else 'EMPTY RESPONSE'}"
            )
            return [], debug_info
    except Exception as e:
        return None, f"Error: {str(e)}"

# ============================================================
# INPAINTING (LaMa on GPU, OpenCV fallback)
# ============================================================
def create_text_mask(image: Image.Image, detections: list, padding: int = 12) -> Image.Image:
    """Create a mask for inpainting based on detected text regions."""
    mask = Image.new('L', image.size, 0)
    draw = ImageDraw.Draw(mask)
    for det in detections:
        bbox = det.get('bbox', [])
        if len(bbox) == 4:
            x1, y1, x2, y2 = [int(v) for v in bbox]
            # Normalize coordinates so that x2 > x1 and y2 > y1
            if x2 < x1:
                x1, x2 = x2, x1
            if y2 < y1:
                y1, y2 = y2, y1
            # Skip degenerate boxes
            if x2 <= x1 or y2 <= y1:
                continue
            # Pad generously for cleaner inpainting
            x1 = max(0, x1 - padding)
            y1 = max(0, y1 - padding)
            x2 = min(image.width, x2 + padding)
            y2 = min(image.height, y2 + padding)
            # Final validation after clamping
            if x2 > x1 and y2 > y1:
                draw.rectangle([x1, y1, x2, y2], fill=255)
    return mask
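# Mask convention: white (255) pixels mark regions to repaint; black (0)
# pixels are left untouched. Both LaMa and cv2.inpaint treat non-zero mask
# pixels as the area to fill.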

def inpaint_image(image: Image.Image, mask: Image.Image) -> Image.Image:
    """High-quality inpainting using LaMa (GPU-accelerated)."""
    try:
        lama = load_lama()
        # LaMa expects an RGB image and a single-channel binary mask
        result = lama(image.convert('RGB'), mask.convert('L'))
        return result
    except Exception as e:
        print(f"⚠️ LaMa failed, falling back to OpenCV: {e}")
        # Fallback to OpenCV's Navier-Stokes inpainting
        img_array = np.array(image.convert('RGB'))
        mask_array = np.array(mask)
        result = cv2.inpaint(img_array, mask_array, inpaintRadius=12, flags=cv2.INPAINT_NS)
        return Image.fromarray(result)

# ============================================================
# TEXT RENDERING (Optimized with word wrapping)
# ============================================================
def wrap_text(text: str, font: ImageFont.FreeTypeFont, max_width: int, draw: ImageDraw.ImageDraw) -> list:
    """Greedily wrap text into lines that fit within max_width pixels."""
    words = text.split()
    lines = []
    current_line = []
    for word in words:
        test_line = ' '.join(current_line + [word])
        bbox = draw.textbbox((0, 0), test_line, font=font)
        if bbox[2] - bbox[0] <= max_width:
            current_line.append(word)
        else:
            if current_line:
                lines.append(' '.join(current_line))
            current_line = [word]
    if current_line:
        lines.append(' '.join(current_line))
    return lines if lines else [text]
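# Example: wrap_text("Nice to meet you", font, 60, draw) might return
# ["Nice to", "meet you"], depending on font metrics. A single word wider
# than max_width is kept on its own line rather than split mid-word.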

def add_translated_text(image: Image.Image, detections: list) -> Image.Image:
    """Add translated text to the inpainted image with smart sizing and positioning."""
    result = image.copy()
    draw = ImageDraw.Draw(result)
    for det in detections:
        bbox = det.get('bbox', [])
        translated = det.get('translated', '')
        if len(bbox) == 4 and translated:
            x1, y1, x2, y2 = [int(v) for v in bbox]
            # Normalize coordinates
            if x2 < x1:
                x1, x2 = x2, x1
            if y2 < y1:
                y1, y2 = y2, y1
            box_width = x2 - x1
            box_height = y2 - y1
            # Skip very small or invalid boxes
            if box_width < 20 or box_height < 10:
                continue
            # Heuristic: a tall, narrow box with short text is vertical text
            is_vertical = box_height > box_width * 2 and len(translated) < 10
            # Estimate an initial font size from the box dimensions
            text_len = max(len(translated), 1)
            if is_vertical:
                # Vertical: size based on width
                estimated_size = min(box_width - 4, 24)
            else:
                # Horizontal: balance between height and text length
                estimated_size = min(
                    box_height - 4,
                    int((box_width / text_len) * 1.5),
                    28
                )
            estimated_size = max(10, estimated_size)
            font = get_font(estimated_size)
            # Word-wrap long text to the box width
            lines = wrap_text(translated, font, box_width - 8, draw)
            # Compute total text height
            line_height = estimated_size + 2
            total_height = len(lines) * line_height
            # If the text doesn't fit, shrink the font progressively
            while total_height > box_height - 6 and estimated_size > 8:
                estimated_size -= 1
                font = get_font(estimated_size)
                lines = wrap_text(translated, font, box_width - 8, draw)
                line_height = estimated_size + 2
                total_height = len(lines) * line_height
            # Center the text block vertically
            start_y = y1 + max(2, (box_height - total_height) // 2)
            # Draw each line centered horizontally
            for i, line in enumerate(lines):
                text_bbox = draw.textbbox((0, 0), line, font=font)
                text_width = text_bbox[2] - text_bbox[0]
                text_x = x1 + max(2, (box_width - text_width) // 2)
                text_y = start_y + i * line_height
                # Keep the text inside the box bounds
                text_x = max(x1 + 2, min(text_x, x2 - text_width - 2))
                text_y = max(y1 + 2, min(text_y, y2 - estimated_size - 2))
                # Draw a 1px black outline for readability
                outline_range = [-1, 0, 1]
                for dx in outline_range:
                    for dy in outline_range:
                        if dx != 0 or dy != 0:
                            draw.text((text_x + dx, text_y + dy), line, font=font, fill="black")
                # Draw the main text in white
                draw.text((text_x, text_y), line, font=font, fill="white")
    return result

def draw_detections(image: Image.Image, detections: list) -> Image.Image:
    """Draw bounding boxes and labels on the image for visualization."""
    result = image.copy()
    draw = ImageDraw.Draw(result)
    font = get_font(12)
    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7", "#DDA0DD", "#98D8C8"]
    for i, det in enumerate(detections):
        bbox = det.get('bbox', [])
        original = det.get('original', '')[:30]
        translated = det.get('translated', '')[:30]
        if len(bbox) == 4:
            x1, y1, x2, y2 = [int(v) for v in bbox]
            color = colors[i % len(colors)]
            draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
            label = f"{i+1}: {original} → {translated}"
            # Clamp the label position so boxes near the top edge stay on-canvas
            label_y = max(0, y1 - 18)
            # Draw label background, then the label itself
            label_bbox = draw.textbbox((x1, label_y), label, font=font)
            draw.rectangle(label_bbox, fill=color)
            draw.text((x1, label_y), label, font=font, fill="white")
    return result

# ============================================================
# MAIN PIPELINE
# ============================================================
def translate_manga(image, source_lang, target_lang, show_boxes, apply_inpaint, progress=gr.Progress()):
    """Main translation pipeline (GPU-accelerated on T4)."""
    if image is None:
        return None, None, "Please upload an image"
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    # Convert to RGB if needed
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Step 1: Detect and translate
    progress(0.1, desc="🔍 Detecting text with GLM-4.6V...")
    detections, status = detect_and_translate(image, source_lang, target_lang, progress)
    if detections is None:
        return None, None, status
    if len(detections) == 0:
        return image, image, status  # status contains debug info

    # Step 2: Create visualization
    progress(0.5, desc="🎨 Creating visualization...")
    viz_image = draw_detections(image, detections)

    # Step 3: Inpaint and add translated text
    if apply_inpaint:
        progress(0.6, desc="🖌️ Creating mask...")
        mask = create_text_mask(image, detections)
        progress(0.7, desc="✨ Inpainting (removing original text)...")
        inpainted = inpaint_image(image, mask)
        progress(0.9, desc="✍️ Adding translated text...")
        result = add_translated_text(inpainted, detections)
    else:
        result = add_translated_text(image, detections)

    det_text = json.dumps(detections, indent=2, ensure_ascii=False)
    progress(1.0, desc="✅ Done!")
    if show_boxes:
        return viz_image, result, det_text
    else:
        return image, result, det_text

# ============================================================
# BATCH PROCESSING (Parallel via the shared thread pool)
# ============================================================
def translate_batch(images: list, source_lang: str, target_lang: str, progress=gr.Progress()):
    """Process multiple pages in parallel."""
    if not images:
        return [], "No images uploaded"
    total = len(images)

    def process_single(idx_img):
        idx, img = idx_img
        try:
            # Gallery items may arrive as (image, caption) tuples or file paths
            if isinstance(img, (tuple, list)):
                img = img[0]
            if isinstance(img, str):
                img = Image.open(img)
            _, result, _ = translate_manga(img, source_lang, target_lang, False, True)
            return (idx, result)
        except Exception:
            return (idx, None)

    # Process in parallel using the thread pool, preserving page order
    progress(0.1, desc=f"Processing {total} pages...")
    futures = list(executor.map(process_single, enumerate(images)))
    futures.sort(key=lambda x: x[0])
    results = [f[1] for f in futures if f[1] is not None]
    progress(1.0, desc=f"✅ Processed {len(results)}/{total} pages")
    return results, f"Processed {len(results)} pages successfully"

# ============================================================
# UI
# ============================================================
LANGUAGES = [
    "Japanese",
    "Korean",
    "Chinese (Simplified)",
    "Chinese (Traditional)",
    "English",
    "Spanish",
    "Portuguese",
    "French",
    "German",
    "Italian",
    "Russian",
    "Thai",
    "Vietnamese",
    "Indonesian",
    "Arabic"
]
| css = """ | |
| .gradio-container { | |
| max-width: 1400px !important; | |
| } | |
| .header { | |
| text-align: center; | |
| padding: 20px; | |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
| border-radius: 10px; | |
| margin-bottom: 20px; | |
| } | |
| .header h1 { | |
| color: white; | |
| margin: 0; | |
| font-size: 2.2em; | |
| } | |
| .header p { | |
| color: rgba(255,255,255,0.9); | |
| margin: 5px 0 0 0; | |
| } | |
| .stats { | |
| background: rgba(102, 126, 234, 0.1); | |
| padding: 10px; | |
| border-radius: 8px; | |
| margin: 10px 0; | |
| font-size: 0.9em; | |
| } | |
| """ | |

with gr.Blocks(title="BubbleScribe", css=css, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div class="header">
        <h1>✍️ BubbleScribe</h1>
        <p>AI-powered manga & comic translator using GLM-4.6V + LaMa</p>
    </div>
    """)
    gr.HTML("""
    <div class="stats">
        ⚡ <strong>Models:</strong> GLM-4.6V (OCR & Translation) + LaMa (Inpainting)
    </div>
    """)
    with gr.Tabs():
        # Single Page Tab
        with gr.Tab("📄 Single Page"):
            with gr.Row():
                with gr.Column(scale=1):
                    input_image = gr.Image(label="📤 Upload Manga Page", type="pil")
                    with gr.Row():
                        source_lang = gr.Dropdown(
                            choices=LANGUAGES,
                            value="Japanese",
                            label="Source Language"
                        )
                        target_lang = gr.Dropdown(
                            choices=LANGUAGES,
                            value="English",
                            label="Target Language"
                        )
                    with gr.Row():
                        show_boxes = gr.Checkbox(label="Show detection boxes", value=True)
                        apply_inpaint = gr.Checkbox(label="Apply inpainting", value=True)
                    translate_btn = gr.Button("🔄 Translate", variant="primary", size="lg")
                with gr.Column(scale=2):
                    with gr.Row():
                        detection_output = gr.Image(label="🔍 Detected Text Regions")
                        result_output = gr.Image(label="✨ Translated Result")
                    detections_json = gr.Textbox(
                        label="📋 Detected Text (JSON)",
                        lines=8,
                        max_lines=15
                    )
            translate_btn.click(
                fn=translate_manga,
                inputs=[input_image, source_lang, target_lang, show_boxes, apply_inpaint],
                outputs=[detection_output, result_output, detections_json]
            )
        # Batch Processing Tab
        with gr.Tab("📚 Batch (Multi-Page)"):
            gr.Markdown("**Upload multiple pages** to translate them all at once using parallel processing.")
            with gr.Row():
                with gr.Column(scale=1):
                    batch_images = gr.Gallery(
                        label="📤 Upload Multiple Pages",
                        columns=3,
                        height=300,
                        object_fit="contain"
                    )
                    with gr.Row():
                        batch_source = gr.Dropdown(
                            choices=LANGUAGES,
                            value="Japanese",
                            label="Source Language"
                        )
                        batch_target = gr.Dropdown(
                            choices=LANGUAGES,
                            value="English",
                            label="Target Language"
                        )
                    batch_btn = gr.Button("🚀 Translate All Pages", variant="primary", size="lg")
                with gr.Column(scale=2):
                    batch_output = gr.Gallery(
                        label="✨ Translated Pages",
                        columns=3,
                        height=400,
                        object_fit="contain"
                    )
                    batch_status = gr.Textbox(label="Status", interactive=False)
            batch_btn.click(
                fn=translate_batch,
                inputs=[batch_images, batch_source, batch_target],
                outputs=[batch_output, batch_status]
            )
| gr.Markdown(""" | |
| ### 💡 Tips | |
| - **Single Page:** Best for previewing detections and fine-tuning | |
| - **Batch Mode:** Process entire chapters quickly (parallel processing) | |
| - Works best with clear, high-contrast text in speech bubbles | |
| - Sound effects may not always be detected | |
| ### 🔧 Powered By | |
| - **GLM-4.6V** - Text detection & translation (Z.ai API) | |
| - **LaMa** - Text removal inpainting (GPU-accelerated) | |
| """) | |
| gr.HTML(""" | |
| <div style="text-align: center; margin-top: 20px; padding: 10px; background: rgba(0,0,0,0.05); border-radius: 8px;"> | |
| <strong>Models:</strong> <a href="https://huggingface.co/zai-org/GLM-4.6V" target="_blank">GLM-4.6V</a> (OCR & Translation) • | |
| <a href="https://github.com/advimman/lama" target="_blank">LaMa</a> (Inpainting) • | |
| <strong>Created by:</strong> <a href="https://huggingface.co/lulavc" target="_blank">@lulavc</a> | |
| </div> | |
| """) | |
| print("✅ BubbleScribe ready!") | |
| if __name__ == "__main__": | |
| demo.launch() | |