Spaces:

VolariGlobal
/

volaris-pdf-tool

Running

App Files Files Community

saifisvibinn committed on Nov 16

Commit

a20a1e3

1 Parent(s): 5b16707

Deploy PDF extraction tool with API and progress tracking

Browse files

Files changed (10) hide show

.dockerignore +36 -20
.gitattributes +0 -35
.gitignore +32 -0
Dockerfile +23 -28
README.md +181 -39
app.py +171 -77
entrypoint.sh +0 -7
requirements.txt +27 -12
static/js/app.js +92 -14
templates/index.html +12 -2

.dockerignore CHANGED Viewed

@@ -1,25 +1,41 @@
-__pycache__
-*.pyc
-*.pyo
-*.pyd
-.Python
 *.so
-*.egg
-*.egg-info
-dist
-build
-.git
 .gitignore
-.vscode
-.idea
-*.md
-!README.md
-pdfs/
 output/
 uploads/
-uv.lock
-.env
-.venv
-venv/
-env/

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
 *.so
+.Python
+*.egg-info/
+dist/
+build/
+# Virtual environments
+venv/
+env/
+ENV/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+# Git
+.git/
 .gitignore
+# Output directories (will be created in container)
 output/
 uploads/
+pdfs/
+# Documentation
+*.md
+!README.md
+# Docker
+Dockerfile
+.dockerignore
+# Other
+.DS_Store
+*.log

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,32 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+dist/
+build/
+# Virtual environments
+venv/
+env/
+ENV/
+# Output directories
+output/
+uploads/
+pdfs/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+# OS
+.DS_Store
+Thumbs.db
+# Logs
+*.log

Dockerfile CHANGED Viewed

@@ -1,45 +1,40 @@
-# Use Python 3.12 slim image as base
 FROM python:3.12-slim
-# Install system dependencies (as root)
 RUN apt-get update && apt-get install -y \
-    build-essential \
-    libgl1 \
     libglib2.0-0 \
     && rm -rf /var/lib/apt/lists/*
-# Create a non-root user (Hugging Face Spaces best practice)
-RUN useradd -m -u 1000 user
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
-# Set working directory
-WORKDIR /app
 # Copy requirements first for better caching
-COPY --chown=user ./requirements.txt requirements.txt
 # Install Python dependencies
-RUN pip install --no-cache-dir --upgrade -r requirements.txt
 # Copy application files
-COPY --chown=user . /app
-# Make entrypoint script executable
-RUN chmod +x /app/entrypoint.sh
 # Create necessary directories
-RUN mkdir -p uploads output
-# Expose port (Hugging Face Spaces uses 7860)
 EXPOSE 7860
-# Set environment variables
-ENV FLASK_APP=app.py
-ENV PYTHONUNBUFFERED=1
-ENV PORT=7860
-# Run the Flask app
-# Hugging Face Spaces expects the app to listen on 0.0.0.0 and port 7860
-CMD ["/app/entrypoint.sh"]

+# Hugging Face Spaces Docker Runtime
 FROM python:3.12-slim
+# Set working directory
+WORKDIR /app
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    poppler-utils \
     libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
+    curl \
     && rm -rf /var/lib/apt/lists/*
 # Copy requirements first for better caching
+COPY requirements.txt .
 # Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
 # Copy application files
+COPY . .
 # Create necessary directories
+RUN mkdir -p output pdfs uploads
+# Expose Hugging Face Spaces default port
 EXPOSE 7860
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+    CMD curl -f http://localhost:7860/ || exit 1
+# Run Flask app
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,56 +1,198 @@
----
-title: PDF Layout Extractor
-emoji: 📄
-colorFrom: blue
-colorTo: purple
-sdk: docker
-pinned: false
-license: mit
-app_port: 7860
----
-# PDF Layout Extractor
-A web application for extracting figures, tables, annotated layouts, and markdown text from scientific PDFs using [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO).
 ## Features
 - **Layout-aware extraction** of figures and tables with YOLO-based detection
 - **Cross-page stitching** for multi-page tables, captions, titles, and body text
 - **Annotated PDF output** with bounding boxes for detected regions
-- **Markdown export** powered by `pymupdf4llm`
-- **Modern Flask Web UI** with dark/light theme support
-## Usage
-1. Upload one or more PDF files (max 500MB per file)
-2. Choose extraction mode:
-   - **Images Only**: Extract figures and tables with layout detection
-   - **Markdown Only**: Extract text content as markdown
-   - **Both**: Extract both images and markdown
-3. Wait for processing to complete
-4. View and download extracted figures, tables, annotated PDFs, and markdown files
-## Technical Details
-- Built with Flask and DocLayout-YOLO
-- Supports both CPU and GPU processing (GPU recommended for faster processing)
-- Maximum file size: 500MB per PDF
-- Model: DocLayout-YOLO from `juliozhao/DocLayout-YOLO-DocStructBench`
-## Output Structure
-Each processed PDF creates a directory with:
-- `*_content_list.json` - Metadata for extracted figures/tables
-- `*_layout.pdf` - Annotated PDF with layout bounding boxes
-- `*.md` - Markdown export of text content
-- `figures/` - Extracted figure images (PNG)
-- `tables/` - Extracted table images (PNG)
-## Model Information
-This application uses the DocLayout-YOLO model for document layout detection. The model is automatically downloaded from Hugging Face Hub on first use.
-## License
-MIT License

+# PDF Layout Extraction Companion
+A streamlined workflow for extracting figures, tables, annotated layouts, and markdown text from scientific PDFs using [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO), PyMuPDF, and Flask. The project exposes a command-line pipeline (`main.py`) and a modern Flask web UI (`app.py`).
+---
 ## Features
 - **Layout-aware extraction** of figures and tables with YOLO-based detection
 - **Cross-page stitching** for multi-page tables, captions, titles, and body text
 - **Annotated PDF output** with bounding boxes for detected regions
+- **Markdown export** powered by `pymupdf4llm` / `pymupdf-layout`
+- **Flask Web UI** with modern design, dark/light theme, GPU/CPU status, and individual PDF viewing
+- Unified `output/<PDF stem>/` directory structure for CLI + UI runs
+---
+## Requirements
+- Python 3.12+
+- [uv](https://docs.astral.sh/uv/latest/) (recommended) or `pip`
+- GPU optional (DocLayout-YOLO runs on CPU as well)
+Install dependencies:
+```bash
+uv pip install -r requirements.txt
+```
+> If you prefer a virtualenv, create and activate it first, then run `uv pip install -r requirements.txt` inside it.
+---
+## Quick Start
+### Command Line Pipeline
+Process all PDFs in `./pdfs` and write outputs to `./output/<PDF stem>/`:
+```bash
+uv run python main.py
+```
+Each subdirectory contains:
+- `*_content_list.json` – metadata for extracted figures/tables
+- `*_layout.pdf` – annotated PDF with layout boxes
+- `*.md` – markdown export (if `pymupdf4llm` is installed)
+- `figures/` & `tables/` – cropped PNGs with stitched captions/titles
+### Flask Web App (Recommended)
+Launch the modern Flask web interface locally:
+```bash
+python run_flask_gpu.py
+```
+Then open your browser to `http://localhost:5000`
+**Features:**
+- Clean, modern UI with dark/light theme support
+- Multiple PDF upload and processing
+- **Real-time progress bar** with status updates
+- Individual PDF output viewing with sidebar navigation
+- Real-time GPU/CPU status display
+- Image gallery for figures and tables
+- Markdown preview and download
+- Responsive design for mobile and desktop
+- **REST API** for programmatic access
+All Flask app runs also write into `./output/<PDF stem>/` using the same structure as the CLI.
+### Deploy to Hugging Face Spaces (Docker)
+Deploy your Flask app to Hugging Face Spaces with Docker:
+1. **Create a new Space on Hugging Face:**
+   - Go to [Hugging Face Spaces](https://huggingface.co/spaces)
+   - Click "Create new Space"
+   - Choose "Docker" as the SDK
+   - Set visibility (public/private)
+2. **Push your code:**
+   ```bash
+   # Clone your space (replace with your space name)
+   git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+   cd YOUR_SPACE_NAME
+   # Copy your project files
+   cp -r /path/to/pdf-minor-allegations/* .
+   # Commit and push
+   git add .
+   git commit -m "Initial deployment"
+   git push
+   ```
+3. **Your Space will automatically build and deploy!**
+**Features:**
+- **REST API endpoints** for programmatic access
+- **Real-time progress tracking** with progress bar
+- **Multiple processing modes:** Images only, Markdown only, or Both
+- **Background processing** - upload files and track progress via API
+- **Modern web UI** with dark/light theme
+- **GPU/CPU support** - automatically detects available hardware
+- **Free tier available** with CPU instances
+- **Automatic HTTPS** and custom domain support
+**API Endpoints:**
+- `POST /api/upload` - Upload PDFs for processing (returns `task_id`)
+- `GET /api/progress/<task_id>` - Get processing progress (0-100%)
+- `GET /api/pdf-list` - List all processed PDFs
+- `GET /api/pdf-details/<pdf_stem>` - Get details for a processed PDF
+- `GET /api/device-info` - Get GPU/CPU device information
+- `GET /output/<path>` - Download processed files (PDFs, images, markdown)
+**Example API Usage:**
+```python
+import requests
+import time
+# Upload a PDF
+files = {'files[]': open('document.pdf', 'rb')}
+data = {'extraction_mode': 'both'}  # or 'images' or 'markdown'
+response = requests.post('https://YOUR_SPACE.hf.space/api/upload', files=files, data=data)
+task_id = response.json()['task_id']
+# Poll for progress
+while True:
+    progress = requests.get(f'https://YOUR_SPACE.hf.space/api/progress/{task_id}').json()
+    print(f"Progress: {progress['progress']}% - {progress['message']}")
+    if progress['status'] == 'completed':
+        break
+    time.sleep(0.5)
+# Get results
+results = progress['results']
+```
+### Deploy to Modal.com (Cloud with GPU)
+Deploy your Flask app online with GPU support using Modal:
+```bash
+# Install Modal CLI
+pip install modal
+# Authenticate with Modal
+modal token new
+# Deploy to Modal
+modal deploy modal_app.py
+```
+See [MODAL_DEPLOYMENT.md](MODAL_DEPLOYMENT.md) for detailed instructions.
+**Benefits:**
+- GPU support (T4, A10G, or A100)
+- Pay-per-use pricing
+- Automatic HTTPS
+- Auto-scaling
+- Global deployment
+---
+## Configuration Highlights
+- **Detection model:** DocLayout-YOLO (`doclayout_yolo_docstructbench_imgsz1024.pt`)
+- **Detection thresholds:** configurable in `main.py`
+- **Layout stitching:** tables, captions, titles, body text
+- **Markdown extraction:** defaults to enabled (`pymupdf4llm.to_markdown`); falls back gracefully if the package is missing
+- **Output directory:** `./output` (configurable near the bottom of `main.py`)
+---
+## File Overview
+| Path | Description |
+|------|-------------|
+| `main.py` | CLI pipeline for batch PDF processing |
+| `app.py` | Flask web application (recommended UI) with API endpoints |
+| `run_flask_gpu.py` | Local Flask runner with GPU support |
+| `Dockerfile` | Docker configuration for Hugging Face Spaces deployment |
+| `modal_app.py` | Modal.com deployment configuration (cloud GPU) |
+| `MODAL_DEPLOYMENT.md` | Modal.com deployment guide |
+| `templates/` | Flask HTML templates |
+| `static/` | Flask static files (CSS, JS) |
+| `pdfs/` | Source PDFs (gitignored) |
+| `output/` | Generated outputs per PDF |
+| `pyproject.toml` | Project metadata & dependency list |
+| `uv.lock` | Locked dependency versions (auto-maintained by `uv`) |
+---
+## Troubleshooting
+- **`ModuleNotFoundError: pymupdf4llm`** – install it via `uv pip install pymupdf4llm` (already listed in `pyproject.toml`).
+- **Slow performance** – ensure GPU CUDA drivers are available or reduce concurrency by toggling `USE_MULTIPROCESSING` in `main.py`.
+- **Large outputs** – clean the `output/` directory before reruns to avoid confusing duplicates.
+For additional logging, set `LOG_LEVEL` or edit the `logger` configuration in `main.py`.
+---
+## Acknowledgements
+- [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO)
+- [PyMuPDF](https://pymupdf.readthedocs.io/)
+- [PyMuPDF4LLM](https://github.com/pymupdf/RAG/blob/main/pymupdf4llm.md)
+- [Flask](https://flask.palletsprojects.com/)
+Happy extracting! 🎉

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import os
 import shutil
 from pathlib import Path
 from typing import Dict, List, Optional
 from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
@@ -22,6 +24,10 @@ os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
 # Global model instance
 _model = None
 def get_device_info() -> Dict[str, any]:
     """Get information about GPU/CPU availability."""
@@ -65,97 +71,184 @@ def device_info():
     return jsonify(get_device_info())
 @app.route('/api/upload', methods=['POST'])
 def upload_files():
-    """Handle multiple PDF file uploads."""
     if 'files[]' not in request.files:
         return jsonify({'error': 'No files provided'}), 400
     files = request.files.getlist('files[]')
     extraction_mode = request.form.get('extraction_mode', 'images')
-    include_images = extraction_mode != 'markdown'
-    include_markdown = extraction_mode != 'images'
     if not files or all(f.filename == '' for f in files):
         return jsonify({'error': 'No files selected'}), 400
-    results = []
     for file in files:
         if file and file.filename.endswith('.pdf'):
-            try:
-                # Save uploaded file
-                filename = secure_filename(file.filename)
-                stem = Path(filename).stem
-                upload_path = Path(app.config['UPLOAD_FOLDER']) / filename
-                file.save(str(upload_path))
-                # Prepare output directory
-                output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
-                output_dir.mkdir(parents=True, exist_ok=True)
-                # Copy PDF to output directory
-                pdf_path = output_dir / filename
-                upload_path.rename(pdf_path)
-                # Process PDF
-                extractor.USE_MULTIPROCESSING = False
-                logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
-                if include_images:
-                    load_model_once()
-                extractor.process_pdf_with_pool(
-                    pdf_path,
-                    output_dir,
-                    pool=None,
-                    extract_images=include_images,
-                    extract_markdown=include_markdown,
-                )
-                # Collect results
-                json_path = output_dir / f"{stem}_content_list.json"
-                elements = []
-                if include_images and json_path.exists():
-                    elements = json.loads(json_path.read_text(encoding='utf-8'))
-                annotated_pdf = None
-                if include_images:
-                    candidate_pdf = output_dir / f"{stem}_layout.pdf"
-                    if candidate_pdf.exists():
-                        annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
-                markdown_path = None
-                if include_markdown:
-                    candidate_md = output_dir / f"{stem}.md"
-                    if candidate_md.exists():
-                        markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
-                # Get figure and table counts
-                figures = [e for e in elements if e.get('type') == 'figure']
-                tables = [e for e in elements if e.get('type') == 'table']
-                results.append({
-                    'filename': filename,
-                    'stem': stem,
-                    'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
-                    'figures_count': len(figures),
-                    'tables_count': len(tables),
-                    'elements_count': len(elements),
-                    'annotated_pdf': annotated_pdf,
-                    'markdown_path': markdown_path,
-                    'include_images': include_images,
-                    'include_markdown': include_markdown,
-                })
-            except Exception as e:
-                logger.error(f"Error processing {file.filename}: {e}")
-                results.append({
-                    'filename': file.filename,
-                    'error': str(e)
-                })
-    return jsonify({'results': results})
 @app.route('/api/pdf-list')
@@ -290,7 +383,8 @@ def delete_pdf_by_path(stem: str):
 if __name__ == '__main__':
-    port = int(os.environ.get('PORT', 5000))
     app.run(debug=False, host='0.0.0.0', port=port)

 import json
 import os
 import shutil
+import threading
+import uuid
 from pathlib import Path
 from typing import Dict, List, Optional
 from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
 # Global model instance
 _model = None
+# Progress tracking: {task_id: {'status': 'processing'|'completed'|'error', 'progress': 0-100, 'message': str, 'results': [], 'total_files': int, 'file_progress': {filename: progress}}}
+_progress_tracker: Dict[str, Dict] = {}
+_progress_lock = threading.Lock()
 def get_device_info() -> Dict[str, any]:
     """Get information about GPU/CPU availability."""
     return jsonify(get_device_info())
+def _update_task_progress(task_id: str, filename: str, file_progress: int, message: str):
+    """Update progress for a specific file and calculate overall progress."""
+    with _progress_lock:
+        if task_id not in _progress_tracker:
+            return
+        # Update file-specific progress
+        if 'file_progress' not in _progress_tracker[task_id]:
+            _progress_tracker[task_id]['file_progress'] = {}
+        _progress_tracker[task_id]['file_progress'][filename] = file_progress
+        # Calculate overall progress (average of all files)
+        file_progresses = _progress_tracker[task_id]['file_progress']
+        if file_progresses:
+            total_progress = sum(file_progresses.values()) / len(file_progresses)
+            _progress_tracker[task_id]['progress'] = int(total_progress)
+        _progress_tracker[task_id]['message'] = message
+def process_file_background(task_id: str, file, extraction_mode: str):
+    """Process a single file in the background and update progress."""
+    filename = secure_filename(file.filename)
+    try:
+        _update_task_progress(task_id, filename, 5, f'Processing {filename}...')
+        stem = Path(filename).stem
+        include_images = extraction_mode != 'markdown'
+        include_markdown = extraction_mode != 'images'
+        # Save uploaded file
+        upload_path = Path(app.config['UPLOAD_FOLDER']) / filename
+        file.save(str(upload_path))
+        _update_task_progress(task_id, filename, 15, f'Saved {filename}, preparing output...')
+        # Prepare output directory
+        output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
+        output_dir.mkdir(parents=True, exist_ok=True)
+        # Copy PDF to output directory
+        pdf_path = output_dir / filename
+        upload_path.rename(pdf_path)
+        _update_task_progress(task_id, filename, 25, f'Loading model and processing {filename}...')
+        # Process PDF
+        extractor.USE_MULTIPROCESSING = False
+        logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
+        if include_images:
+            load_model_once()
+        _update_task_progress(task_id, filename, 30, f'Extracting content from {filename}...')
+        extractor.process_pdf_with_pool(
+            pdf_path,
+            output_dir,
+            pool=None,
+            extract_images=include_images,
+            extract_markdown=include_markdown,
+        )
+        _update_task_progress(task_id, filename, 85, f'Collecting results for {filename}...')
+        # Collect results
+        json_path = output_dir / f"{stem}_content_list.json"
+        elements = []
+        if include_images and json_path.exists():
+            elements = json.loads(json_path.read_text(encoding='utf-8'))
+        annotated_pdf = None
+        if include_images:
+            candidate_pdf = output_dir / f"{stem}_layout.pdf"
+            if candidate_pdf.exists():
+                annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
+        markdown_path = None
+        if include_markdown:
+            candidate_md = output_dir / f"{stem}.md"
+            if candidate_md.exists():
+                markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
+        # Get figure and table counts
+        figures = [e for e in elements if e.get('type') == 'figure']
+        tables = [e for e in elements if e.get('type') == 'table']
+        result = {
+            'filename': filename,
+            'stem': stem,
+            'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
+            'figures_count': len(figures),
+            'tables_count': len(tables),
+            'elements_count': len(elements),
+            'annotated_pdf': annotated_pdf,
+            'markdown_path': markdown_path,
+            'include_images': include_images,
+            'include_markdown': include_markdown,
+        }
+        with _progress_lock:
+            _progress_tracker[task_id]['results'].append(result)
+            _update_task_progress(task_id, filename, 100, f'Completed processing {filename}')
+            # Check if all files are done
+            total_files = _progress_tracker[task_id].get('total_files', 1)
+            if len(_progress_tracker[task_id]['results']) >= total_files:
+                _progress_tracker[task_id]['status'] = 'completed'
+                _progress_tracker[task_id]['message'] = f'All {total_files} file(s) processed successfully'
+    except Exception as e:
+        logger.error(f"Error processing {file.filename}: {e}")
+        with _progress_lock:
+            _progress_tracker[task_id]['results'].append({
+                'filename': filename,
+                'error': str(e)
+            })
+            # Check if this was the last file
+            total_files = _progress_tracker[task_id].get('total_files', 1)
+            if len(_progress_tracker[task_id]['results']) >= total_files:
+                _progress_tracker[task_id]['status'] = 'error'
+                _progress_tracker[task_id]['message'] = f'Error processing {filename}: {str(e)}'
 @app.route('/api/upload', methods=['POST'])
 def upload_files():
+    """Handle multiple PDF file uploads with background processing."""
     if 'files[]' not in request.files:
         return jsonify({'error': 'No files provided'}), 400
     files = request.files.getlist('files[]')
     extraction_mode = request.form.get('extraction_mode', 'images')
     if not files or all(f.filename == '' for f in files):
         return jsonify({'error': 'No files selected'}), 400
+    # Create a task ID for this upload
+    task_id = str(uuid.uuid4())
+    # Initialize progress tracking
+    with _progress_lock:
+        _progress_tracker[task_id] = {
+            'status': 'processing',
+            'progress': 0,
+            'message': 'Starting upload...',
+            'results': [],
+            'total_files': len([f for f in files if f.filename.endswith('.pdf')])
+        }
+    # Process files in background threads
+    threads = []
     for file in files:
         if file and file.filename.endswith('.pdf'):
+            thread = threading.Thread(
+                target=process_file_background,
+                args=(task_id, file, extraction_mode)
+            )
+            thread.daemon = True
+            thread.start()
+            threads.append(thread)
+    # Return task ID immediately
+    return jsonify({
+        'task_id': task_id,
+        'message': 'Processing started',
+        'total_files': len(threads)
+    })
+@app.route('/api/progress/<task_id>')
+def get_progress(task_id):
+    """Get progress for a processing task."""
+    with _progress_lock:
+        progress = _progress_tracker.get(task_id)
+        if not progress:
+            return jsonify({'error': 'Task not found'}), 404
+        return jsonify(progress)
 @app.route('/api/pdf-list')
 if __name__ == '__main__':
+    # Listen on $PORT when it is set; otherwise default to 7860, the port Hugging Face Spaces expects
+    port = int(os.environ.get('PORT', 7860))
     app.run(debug=False, host='0.0.0.0', port=port)

entrypoint.sh DELETED Viewed

@@ -1,7 +0,0 @@
-#!/bin/bash
-set -e
-# Start Flask application
-# Get port from environment variable or use default 7860
-python -c "import os; port = int(os.environ.get('PORT', 7860)); from app import app; app.run(host='0.0.0.0', port=port, debug=False)"

requirements.txt CHANGED Viewed

@@ -1,13 +1,28 @@
-doclayout-yolo>=0.0.4
-huggingface-hub>=1.1.2
-loguru>=0.7.3
-pillow>=12.0.0
-pymupdf>=1.26.6
-pymupdf-layout>=0.0.15
-pypdfium2>=5.0.0
-pymupdf4llm>=0.1.9
-flask>=3.0.0
-werkzeug>=3.0.0
-torch>=2.0.0
-torchvision>=0.15.0

+# Core PDF & Document Processing
+doclayout-yolo==0.0.4
+pymupdf==1.26.6
+pymupdf-layout==0.0.15
+pymupdf4llm==0.1.9
+pypdfium2==5.0.0
+# Deep Learning (CPU-optimized)
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.5.1+cpu
+torchvision==0.20.1+cpu
+# Image Processing
+pillow==12.0.0
+opencv-python-headless==4.10.0.84
+# OCR
+pytesseract==0.3.13
+# Utilities
+huggingface-hub==0.26.2
+loguru==0.7.3
+numpy==1.26.4
+# Web Framework
+flask==3.0.0
+werkzeug==3.0.1
+streamlit==1.40.1

static/js/app.js CHANGED Viewed

@@ -84,10 +84,21 @@ async function handleUpload(e) {
     const extractionMode = document.querySelector('input[name="extractionMode"]:checked').value;
     // Show processing section
-    document.getElementById('processingSection').style.display = 'block';
     document.getElementById('resultsSection').style.display = 'none';
     document.getElementById('emptyState').style.display = 'none';
     const formData = new FormData();
     for (let i = 0; i < files.length; i++) {
         formData.append('files[]', files[i]);
@@ -106,18 +117,13 @@ async function handleUpload(e) {
             throw new Error(data.error);
         }
-        // Hide processing section
-        document.getElementById('processingSection').style.display = 'none';
-        // Reload PDF list and show results
-        await loadPdfList();
-        // Show first PDF details if available
-        if (data.results && data.results.length > 0) {
-            const firstPdf = data.results[0];
-            if (!firstPdf.error) {
-                showPdfDetails(firstPdf.stem);
-            }
         }
         // Reset form
@@ -126,10 +132,82 @@ async function handleUpload(e) {
     } catch (error) {
         console.error('Upload error:', error);
         alert('Error processing files: ' + error.message);
-        document.getElementById('processingSection').style.display = 'none';
     }
 }
 // Load PDF List
 async function loadPdfList() {
     try {

     const extractionMode = document.querySelector('input[name="extractionMode"]:checked').value;
     // Show processing section
+    const processingSection = document.getElementById('processingSection');
+    const processingStatus = document.getElementById('processingStatus');
+    const progressBar = document.getElementById('progressBar');
+    const progressBarFill = document.getElementById('progressBarFill');
+    processingSection.style.display = 'block';
     document.getElementById('resultsSection').style.display = 'none';
     document.getElementById('emptyState').style.display = 'none';
+    // Update processing UI
+    processingStatus.textContent = 'Uploading files...';
+    if (progressBarFill) {
+        progressBarFill.style.width = '0%';
+    }
     const formData = new FormData();
     for (let i = 0; i < files.length; i++) {
         formData.append('files[]', files[i]);
             throw new Error(data.error);
         }
+        // Start polling for progress
+        if (data.task_id) {
+            await pollProgress(data.task_id, processingStatus, progressBarFill);
+        } else {
+            // Fallback for old API
+            processingSection.style.display = 'none';
+            await loadPdfList();
         }
         // Reset form
     } catch (error) {
         console.error('Upload error:', error);
         alert('Error processing files: ' + error.message);
+        processingSection.style.display = 'none';
     }
 }
+// Poll for progress updates
+async function pollProgress(taskId, statusElement, progressBarFill) {
+    const maxAttempts = 600; // 5 minutes max (600 * 0.5s)
+    let attempts = 0;
+    const poll = async () => {
+        try {
+            const response = await fetch(`/api/progress/${taskId}`);
+            const data = await response.json();
+            if (data.error) {
+                throw new Error(data.error);
+            }
+            // Update progress bar
+            if (progressBarFill) {
+                const progress = data.progress || 0;
+                progressBarFill.style.width = `${progress}%`;
+                progressBarFill.setAttribute('aria-valuenow', progress);
+                const progressText = document.getElementById('progressBarText');
+                if (progressText) {
+                    progressText.textContent = `${Math.round(progress)}%`;
+                }
+            }
+            // Update status message
+            if (statusElement) {
+                statusElement.textContent = data.message || 'Processing...';
+            }
+            // Check if completed
+            if (data.status === 'completed') {
+                // Hide processing section
+                document.getElementById('processingSection').style.display = 'none';
+                // Reload PDF list and show results
+                await loadPdfList();
+                // Show first PDF details if available
+                if (data.results && data.results.length > 0) {
+                    const firstPdf = data.results[0];
+                    if (!firstPdf.error) {
+                        showPdfDetails(firstPdf.stem);
+                    }
+                }
+                return;
+            }
+            // Check if error
+            if (data.status === 'error') {
+                throw new Error(data.message || 'Processing failed');
+            }
+            // Continue polling
+            attempts++;
+            if (attempts < maxAttempts) {
+                setTimeout(poll, 500); // Poll every 500ms
+            } else {
+                throw new Error('Processing timeout - please try again');
+            }
+        } catch (error) {
+            console.error('Progress polling error:', error);
+            document.getElementById('processingSection').style.display = 'none';
+            alert('Error: ' + error.message);
+        }
+    };
+    // Start polling
+    poll();
+}
 // Load PDF List
 async function loadPdfList() {
     try {

templates/index.html CHANGED Viewed

@@ -120,15 +120,25 @@
             <div class="col-12">
                 <div class="card shadow-sm">
                     <div class="card-body">
-                        <div class="d-flex align-items-center">
                             <div class="spinner-border text-primary me-3" role="status">
                                 <span class="visually-hidden">Loading...</span>
                             </div>
-                            <div>
                                 <h6 class="mb-0">Processing PDFs...</h6>
                                 <small class="text-muted" id="processingStatus">Please wait</small>
                             </div>
                         </div>
                     </div>
                 </div>
             </div>

             <div class="col-12">
                 <div class="card shadow-sm">
                     <div class="card-body">
+                        <div class="d-flex align-items-center mb-3">
                             <div class="spinner-border text-primary me-3" role="status">
                                 <span class="visually-hidden">Loading...</span>
                             </div>
+                            <div class="flex-grow-1">
                                 <h6 class="mb-0">Processing PDFs...</h6>
                                 <small class="text-muted" id="processingStatus">Please wait</small>
                             </div>
                         </div>
+                        <div class="progress" style="height: 25px;">
+                            <div id="progressBarFill" class="progress-bar progress-bar-striped progress-bar-animated"
+                                 role="progressbar"
+                                 style="width: 0%"
+                                 aria-valuenow="0"
+                                 aria-valuemin="0"
+                                 aria-valuemax="100">
+                                <span id="progressBarText">0%</span>
+                            </div>
+                        </div>
                     </div>
                 </div>
             </div>