""" BubbleScribe - AI Manga & Comic Translator Translate manga/comics using GLM-4.6V for OCR + Translation and LaMa for inpainting. Optimized for NVIDIA T4 GPU """ import gradio as gr import torch import os import json import base64 import re import numpy as np from PIL import Image, ImageDraw, ImageFont from io import BytesIO from openai import OpenAI from concurrent.futures import ThreadPoolExecutor import threading # ============================================================ # HARDWARE OPTIMIZATION: NVIDIA T4 (16GB VRAM) # ============================================================ import cv2 # Enable CUDA optimizations torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True # Thread pool for parallel operations executor = ThreadPoolExecutor(max_workers=4) # ============================================================ # MODEL PRELOADING (Load at startup for faster inference) # ============================================================ print("π BubbleScribe starting up...") print(f" Hardware: NVIDIA T4 (16GB VRAM)") print(f" OCR: Qwen2-VL (API)") print(f" Inpainting: LaMa (GPU)") # Load LaMa model at startup print("π¦ Loading LaMa model...") from simple_lama_inpainting import SimpleLama lama_model = SimpleLama() print("β LaMa model loaded and ready!") def load_lama(): """Get LaMa model (already loaded at startup).""" return lama_model # ============================================================ # FONT CACHING # ============================================================ _font_cache = {} _font_lock = threading.Lock() def get_font(size: int): """Get a font with caching.""" cache_key = size if cache_key in _font_cache: return _font_cache[cache_key] with _font_lock: if cache_key in _font_cache: return _font_cache[cache_key] font_paths = [ "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", "/usr/share/fonts/truetype/noto/NotoSansCJK-Bold.ttc", # CJK support "/usr/share/fonts/opentype/noto/NotoSansCJK-Bold.ttc", "C:/Windows/Fonts/arial.ttf", "C:/Windows/Fonts/arialbd.ttf", ] for path in font_paths: if os.path.exists(path): try: font = ImageFont.truetype(path, size) _font_cache[cache_key] = font return font except: continue font = ImageFont.load_default() _font_cache[cache_key] = font return font # ============================================================ # GLM-4.6V CLIENT (Z.ai API) # ============================================================ _glm_client = None def get_glm_client(): """Get or create GLM client.""" global _glm_client if _glm_client is None: api_key = os.environ.get("GLM_API_KEY") if not api_key: return None _glm_client = OpenAI(api_key=api_key, base_url="https://api.z.ai/api/paas/v4") return _glm_client # ============================================================ # IMAGE UTILITIES # ============================================================ def encode_image_base64(image: Image.Image, max_size: int = 2048) -> str: """Convert PIL Image to base64 string with optional resize.""" # Resize if too large to save bandwidth and API costs if max(image.size) > max_size: ratio = max_size / max(image.size) new_size = (int(image.width * ratio), int(image.height * ratio)) image = image.resize(new_size, Image.Resampling.LANCZOS) buffered = BytesIO() image.save(buffered, format="PNG", optimize=True) return base64.b64encode(buffered.getvalue()).decode("utf-8") def scale_bbox(bbox: list, original_size: tuple, processed_size: tuple) -> list: 
"""Scale bounding box coordinates if image was resized.""" if original_size == processed_size: return bbox scale_x = original_size[0] / processed_size[0] scale_y = original_size[1] / processed_size[1] return [ int(bbox[0] * scale_x), int(bbox[1] * scale_y), int(bbox[2] * scale_x), int(bbox[3] * scale_y) ] # ============================================================ # JSON REPAIR (Handle malformed model responses) # ============================================================ def repair_json(text: str) -> str: """Attempt to repair common JSON issues from LLM responses.""" # Remove any markdown code blocks text = re.sub(r'```json\s*', '', text) text = re.sub(r'```\s*', '', text) # Fix unescaped newlines in strings text = re.sub(r'(? list: """Safely parse JSON with multiple fallback strategies.""" # Strategy 1: Direct parse try: json_match = re.search(r'\[[\s\S]*\]', text) if json_match: return json.loads(json_match.group()) except json.JSONDecodeError: pass # Strategy 2: Repair and parse try: repaired = repair_json(text) json_match = re.search(r'\[[\s\S]*\]', repaired) if json_match: return json.loads(json_match.group()) except json.JSONDecodeError: pass # Strategy 3: Extract individual objects try: objects = re.findall(r'\{[^{}]*\}', text) results = [] for obj in objects: try: parsed = json.loads(repair_json(obj)) if 'bbox' in parsed: results.append(parsed) except: continue if results: return results except: pass # Strategy 4: Manual extraction with regex try: results = [] # Find bbox patterns bbox_matches = re.findall(r'"bbox"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]', text) original_matches = re.findall(r'"original"\s*:\s*"([^"]*)"', text) translated_matches = re.findall(r'"translated"\s*:\s*"([^"]*)"', text) for i, bbox in enumerate(bbox_matches): result = { "bbox": [int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])], "original": original_matches[i] if i < len(original_matches) else "", "translated": translated_matches[i] if i < len(translated_matches) else "" } results.append(result) if results: return results except: pass return [] # ============================================================ # DETECTION & TRANSLATION # ============================================================ def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str, progress=gr.Progress()): """Use GLM-4.6V to detect text regions and translate.""" client = get_glm_client() if not client: return None, "Error: GLM_API_KEY not set in Space secrets" progress(0.1, desc="Analyzing image with GLM-4.6V...") original_size = image.size # Convert image to base64 (may resize for API) img_base64 = encode_image_base64(image, max_size=2048) # Calculate processed size for bbox scaling processed_size = original_size if max(original_size) > 2048: ratio = 2048 / max(original_size) processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio)) prompt = f"""You are a professional manga translator. Your task is to find and translate EVERY piece of {source_lang} text in this image. IMPORTANT: Scan the ENTIRE image from top to bottom, left to right. Do NOT miss any text! Find ALL of these text types: - Main titles and headers - Character names (above/below portraits) - Speech bubbles and dialogue - Narration boxes - Sound effects (onomatopoeia) - Labels, captions, descriptions - Small text and annotations - Relationship indicators (arrows, connections) - ANY other visible {source_lang} text For EACH text region found: 1. bbox: [x1, y1, x2, y2] pixel coordinates 2. 
# ============================================================
# DETECTION & TRANSLATION
# ============================================================
def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str,
                         progress=gr.Progress()):
    """Use GLM-4.6V to detect text regions and translate."""
    client = get_glm_client()
    if not client:
        return None, "Error: GLM_API_KEY not set in Space secrets"

    progress(0.1, desc="Analyzing image with GLM-4.6V...")
    original_size = image.size

    # Convert image to base64 (may resize for API)
    img_base64 = encode_image_base64(image, max_size=2048)

    # Calculate processed size for bbox scaling
    processed_size = original_size
    if max(original_size) > 2048:
        ratio = 2048 / max(original_size)
        processed_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))

    prompt = f"""You are a professional manga translator. Your task is to find and translate EVERY piece of {source_lang} text in this image.

IMPORTANT: Scan the ENTIRE image from top to bottom, left to right. Do NOT miss any text!

Find ALL of these text types:
- Main titles and headers
- Character names (above/below portraits)
- Speech bubbles and dialogue
- Narration boxes
- Sound effects (onomatopoeia)
- Labels, captions, descriptions
- Small text and annotations
- Relationship indicators (arrows, connections)
- ANY other visible {source_lang} text

For EACH text region found:
1. bbox: [x1, y1, x2, y2] pixel coordinates
2. original: the exact {source_lang} text
3. translated: natural {target_lang} translation

TRANSLATION GUIDELINES:
- Keep character names in ROMAJI (e.g., 田中太郎 → "Tanaka Tarou", not "Rice Field Middle Fat Man")
- Keep honorifics: -san, -kun, -chan, -sama, -sensei
- Sound effects: Keep original + add meaning (e.g., "ドキドキ" → "Dokidoki (heart pounding)")
- Make dialogue natural and conversational, not literal
- Preserve emotional tone and nuance
- For titles/roles, translate the meaning (e.g., 社長 → "President", 先生 → "Teacher")

Return a JSON array. Example:
[
  {{"bbox": [100, 50, 200, 80], "original": "山田花子", "translated": "Yamada Hanako"}},
  {{"bbox": [300, 100, 400, 130], "original": "よろしくお願いします", "translated": "Nice to meet you"}}
]

CRITICAL: Find at least 20-50 text regions. This image has many text elements. Scan every corner carefully. Include ALL small labels and character descriptions."""

    try:
        response = client.chat.completions.create(
            model="glm-4.6v-flash",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{img_base64}"}
                        },
                        {"type": "text", "text": prompt}
                    ]
                }
            ],
            max_tokens=8192
        )
        progress(0.4, desc="Processing response...")
        result_text = ""
        msg = response.choices[0].message
        # Try multiple response fields
        if hasattr(msg, 'content') and msg.content:
            result_text = msg.content
        if hasattr(msg, 'reasoning_content') and msg.reasoning_content:
            result_text = result_text + "\n" + msg.reasoning_content if result_text else msg.reasoning_content
        # Strip GLM special tokens
        result_text = result_text.replace('<|begin_of_box|>', '').replace('<|end_of_box|>', '')

        print(f"🔍 GLM-4.6V Response length: {len(result_text)} chars")
        print(f"🔍 GLM-4.6V Response preview: {result_text[:500] if result_text else 'EMPTY'}...")

        # Parse JSON from response with robust error handling
        detections = safe_parse_json(result_text)
        print(f"🔍 Parsed detections: {len(detections)} items")

        if detections:
            # Scale bboxes back to original size if needed
            if original_size != processed_size:
                for det in detections:
                    if 'bbox' in det and len(det['bbox']) == 4:
                        det['bbox'] = scale_bbox(det['bbox'], original_size, processed_size)
            return detections, f"Found {len(detections)} text regions"
        else:
            # Return debug info when no detections
            debug_info = (f"No text detected.\n\nDEBUG - API Response ({len(result_text)} chars):\n"
                          f"{result_text[:1000] if result_text else 'EMPTY RESPONSE'}")
            return [], debug_info
    except Exception as e:
        return None, f"Error: {str(e)}"


# ============================================================
# INPAINTING (LaMa on GPU, OpenCV fallback)
# ============================================================
def create_text_mask(image: Image.Image, detections: list, padding: int = 12) -> Image.Image:
    """Create a mask for inpainting based on detected text regions."""
    mask = Image.new('L', image.size, 0)
    draw = ImageDraw.Draw(mask)
    for det in detections:
        bbox = det.get('bbox', [])
        if len(bbox) == 4:
            x1, y1, x2, y2 = [int(v) for v in bbox]
            # Ensure coordinates are valid (x2 > x1, y2 > y1)
            if x2 < x1:
                x1, x2 = x2, x1
            if y2 < y1:
                y1, y2 = y2, y1
            # Skip invalid boxes
            if x2 <= x1 or y2 <= y1:
                continue
            # Larger padding for cleaner inpainting
            x1 = max(0, x1 - padding)
            y1 = max(0, y1 - padding)
            x2 = min(image.width, x2 + padding)
            y2 = min(image.height, y2 + padding)
            # Final validation
            if x2 > x1 and y2 > y1:
                draw.rectangle([x1, y1, x2, y2], fill=255)
    return mask
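
# Illustrative sketch of the mask geometry (toy 100x100 canvas, made-up
# box): a detection at [10, 10, 40, 30] grows by the default 12px padding
# but is clamped to the image, so pixels inside the padded box read 255
# and everything else stays 0 (LaMa only repaints the white region).
#
#   >>> img = Image.new('RGB', (100, 100))
#   >>> mask = create_text_mask(img, [{"bbox": [10, 10, 40, 30]}])
#   >>> mask.getpixel((20, 20)), mask.getpixel((90, 90))
#   (255, 0)
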
def inpaint_image(image: Image.Image, mask: Image.Image) -> Image.Image:
    """High-quality inpainting using LaMa (GPU-accelerated)."""
    try:
        lama = load_lama()
        # LaMa expects RGB image and binary mask
        result = lama(image.convert('RGB'), mask.convert('L'))
        return result
    except Exception as e:
        print(f"⚠️ LaMa failed, falling back to OpenCV: {e}")
        # Fallback to OpenCV
        img_array = np.array(image.convert('RGB'))
        mask_array = np.array(mask)
        result = cv2.inpaint(img_array, mask_array, inpaintRadius=12, flags=cv2.INPAINT_NS)
        return Image.fromarray(result)


# ============================================================
# TEXT RENDERING (Optimized with word wrapping)
# ============================================================
def wrap_text(text: str, font: ImageFont.FreeTypeFont, max_width: int,
              draw: ImageDraw.ImageDraw) -> list:
    """Wrap text to fit within max_width."""
    words = text.split()
    lines = []
    current_line = []
    for word in words:
        test_line = ' '.join(current_line + [word])
        bbox = draw.textbbox((0, 0), test_line, font=font)
        if bbox[2] - bbox[0] <= max_width:
            current_line.append(word)
        else:
            if current_line:
                lines.append(' '.join(current_line))
            current_line = [word]
    if current_line:
        lines.append(' '.join(current_line))
    return lines if lines else [text]


def add_translated_text(image: Image.Image, detections: list) -> Image.Image:
    """Add translated text to the inpainted image with smart sizing and positioning."""
    result = image.copy()
    draw = ImageDraw.Draw(result)
    for det in detections:
        bbox = det.get('bbox', [])
        translated = det.get('translated', '')
        if len(bbox) == 4 and translated:
            x1, y1, x2, y2 = [int(v) for v in bbox]
            # Ensure coordinates are valid
            if x2 < x1:
                x1, x2 = x2, x1
            if y2 < y1:
                y1, y2 = y2, y1
            box_width = x2 - x1
            box_height = y2 - y1
            # Skip very small or invalid boxes
            if box_width < 20 or box_height < 10:
                continue
            # Detect if vertical text (tall narrow box with short text)
            is_vertical = box_height > box_width * 2 and len(translated) < 10
            # Calculate optimal font size based on box dimensions
            text_len = max(len(translated), 1)
            if is_vertical:
                # Vertical: size based on width
                estimated_size = min(box_width - 4, 24)
            else:
                # Horizontal: balance between height and text length
                estimated_size = min(
                    box_height - 4,
                    int((box_width / text_len) * 1.5),
                    28
                )
            estimated_size = max(10, estimated_size)
            font = get_font(estimated_size)
            # Word wrap for long text
            lines = wrap_text(translated, font, box_width - 8, draw)
            # Calculate total text height
            line_height = estimated_size + 2
            total_height = len(lines) * line_height
            # If text doesn't fit, reduce font size progressively
            while total_height > box_height - 6 and estimated_size > 8:
                estimated_size -= 1
                font = get_font(estimated_size)
                lines = wrap_text(translated, font, box_width - 8, draw)
                line_height = estimated_size + 2
                total_height = len(lines) * line_height
            # Center vertically and horizontally
            start_y = y1 + max(2, (box_height - total_height) // 2)
            # Draw each line centered
            for i, line in enumerate(lines):
                text_bbox = draw.textbbox((0, 0), line, font=font)
                text_width = text_bbox[2] - text_bbox[0]
                text_x = x1 + max(2, (box_width - text_width) // 2)
                text_y = start_y + i * line_height
                # Ensure text stays within bounds
                text_x = max(x1 + 2, min(text_x, x2 - text_width - 2))
                text_y = max(y1 + 2, min(text_y, y2 - estimated_size - 2))
                # Draw outline for readability (thicker outline)
                outline_range = [-1, 0, 1]
                for dx in outline_range:
                    for dy in outline_range:
                        if dx != 0 or dy != 0:
                            draw.text((text_x + dx, text_y + dy), line, font=font, fill="black")
                # Draw main text in white
                draw.text((text_x, text_y), line, font=font, fill="white")
    return result
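
# Illustrative sketch of the greedy wrapper above: wrap_text packs words
# until the rendered width would exceed max_width. Exact multi-line splits
# depend on local font metrics, but a single word under a generous width
# always comes back as one line:
#
#   >>> scratch = ImageDraw.Draw(Image.new('RGB', (1, 1)))
#   >>> wrap_text("hello", get_font(14), 10000, scratch)
#   ['hello']
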
def draw_detections(image: Image.Image, detections: list) -> Image.Image:
    """Draw bounding boxes and labels on image for visualization."""
    result = image.copy()
    draw = ImageDraw.Draw(result)
    font = get_font(12)
    colors = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7", "#DDA0DD", "#98D8C8"]
    for i, det in enumerate(detections):
        bbox = det.get('bbox', [])
        original = det.get('original', '')[:30]
        translated = det.get('translated', '')[:30]
        if len(bbox) == 4:
            x1, y1, x2, y2 = [int(v) for v in bbox]
            color = colors[i % len(colors)]
            draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
            label = f"{i+1}: {original} → {translated}"
            # Draw label background
            label_bbox = draw.textbbox((x1, y1 - 18), label, font=font)
            draw.rectangle(label_bbox, fill=color)
            draw.text((x1, y1 - 18), label, font=font, fill="white")
    return result


# ============================================================
# MAIN PIPELINE
# ============================================================
def translate_manga(image, source_lang, target_lang, show_boxes, apply_inpaint,
                    progress=gr.Progress()):
    """Main translation pipeline (GPU-accelerated on T4)."""
    if image is None:
        return None, None, "Please upload an image"
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    # Convert to RGB if needed
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Step 1: Detect and translate
    progress(0.1, desc="🔍 Detecting text with GLM-4.6V...")
    detections, status = detect_and_translate(image, source_lang, target_lang, progress)
    if detections is None:
        return None, None, status
    if len(detections) == 0:
        return image, image, status  # status contains debug info

    # Step 2: Create visualization
    progress(0.5, desc="🎨 Creating visualization...")
    viz_image = draw_detections(image, detections)

    # Step 3: Inpaint and add translated text
    if apply_inpaint:
        progress(0.6, desc="🖌️ Creating mask...")
        mask = create_text_mask(image, detections)
        progress(0.7, desc="✨ Inpainting (removing original text)...")
        inpainted = inpaint_image(image, mask)
        progress(0.9, desc="✏️ Adding translated text...")
        result = add_translated_text(inpainted, detections)
    else:
        result = add_translated_text(image, detections)

    det_text = json.dumps(detections, indent=2, ensure_ascii=False)
    progress(1.0, desc="✅ Done!")
    if show_boxes:
        return viz_image, result, det_text
    else:
        return image, result, det_text


# ============================================================
# BATCH PROCESSING (parallel pages via the shared thread pool)
# ============================================================
def translate_batch(images: list, source_lang: str, target_lang: str, progress=gr.Progress()):
    """Process multiple pages in parallel."""
    if not images:
        return [], "No images uploaded"
    total = len(images)

    def process_single(idx_img):
        idx, img = idx_img
        try:
            _, result, _ = translate_manga(img, source_lang, target_lang, False, True)
            return (idx, result)
        except Exception:
            return (idx, None)

    # Process in parallel using the shared thread pool
    # (executor.map returns results, not futures, in input order)
    progress(0.1, desc=f"Processing {total} pages...")
    outputs = list(executor.map(process_single, enumerate(images)))
    outputs.sort(key=lambda x: x[0])
    results = [out[1] for out in outputs if out[1] is not None]
    progress(1.0, desc=f"✅ Processed {len(results)}/{total} pages")
    return results, f"Processed {len(results)} pages successfully"
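
# Illustrative sketch of driving the pipeline without the Gradio UI.
# The filename is hypothetical, GLM_API_KEY must be set, and this is
# meant for an interactive session, not for import-time execution:
#
#   >>> page = Image.open("page_01.png")
#   >>> boxes, final, dets = translate_manga(
#   ...     page, "Japanese", "English",
#   ...     show_boxes=True, apply_inpaint=True)
#   >>> final.save("page_01_translated.png")
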
"Portuguese", "French", "German", "Italian", "Russian", "Thai", "Vietnamese", "Indonesian", "Arabic" ] css = """ .gradio-container { max-width: 1400px !important; } .header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px; } .header h1 { color: white; margin: 0; font-size: 2.2em; } .header p { color: rgba(255,255,255,0.9); margin: 5px 0 0 0; } .stats { background: rgba(102, 126, 234, 0.1); padding: 10px; border-radius: 8px; margin: 10px 0; font-size: 0.9em; } """ with gr.Blocks(title="BubbleScribe", css=css, theme=gr.themes.Soft()) as demo: gr.HTML("""
    <div class="header">
        <h1>BubbleScribe</h1>
        <p>AI-powered manga & comic translator using GLM-4.6V + LaMa</p>