Spaces:
Running
Running
saifisvibinn
committed on
Commit
·
a20a1e3
1
Parent(s):
5b16707
Deploy PDF extraction tool with API and progress tracking
Browse files- .dockerignore +36 -20
- .gitattributes +0 -35
- .gitignore +32 -0
- Dockerfile +23 -28
- README.md +181 -39
- app.py +171 -77
- entrypoint.sh +0 -7
- requirements.txt +27 -12
- static/js/app.js +92 -14
- templates/index.html +12 -2
.dockerignore
CHANGED
|
@@ -1,25 +1,41 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
*.
|
| 4 |
-
|
| 5 |
-
.Python
|
| 6 |
*.so
|
| 7 |
-
|
| 8 |
-
*.egg-info
|
| 9 |
-
dist
|
| 10 |
-
build
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
.gitignore
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
*.md
|
| 16 |
-
!README.md
|
| 17 |
-
pdfs/
|
| 18 |
output/
|
| 19 |
uploads/
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
|
|
|
| 5 |
*.so
|
| 6 |
+
.Python
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
|
| 11 |
+
# Virtual environments
|
| 12 |
+
venv/
|
| 13 |
+
env/
|
| 14 |
+
ENV/
|
| 15 |
+
|
| 16 |
+
# IDE
|
| 17 |
+
.vscode/
|
| 18 |
+
.idea/
|
| 19 |
+
*.swp
|
| 20 |
+
*.swo
|
| 21 |
+
|
| 22 |
+
# Git
|
| 23 |
+
.git/
|
| 24 |
.gitignore
|
| 25 |
+
|
| 26 |
+
# Output directories (will be created in container)
|
|
|
|
|
|
|
|
|
|
| 27 |
output/
|
| 28 |
uploads/
|
| 29 |
+
pdfs/
|
| 30 |
+
|
| 31 |
+
# Documentation
|
| 32 |
+
*.md
|
| 33 |
+
!README.md
|
| 34 |
+
|
| 35 |
+
# Docker
|
| 36 |
+
Dockerfile
|
| 37 |
+
.dockerignore
|
| 38 |
|
| 39 |
+
# Other
|
| 40 |
+
.DS_Store
|
| 41 |
+
*.log
|
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
|
| 11 |
+
# Virtual environments
|
| 12 |
+
venv/
|
| 13 |
+
env/
|
| 14 |
+
ENV/
|
| 15 |
+
|
| 16 |
+
# Output directories
|
| 17 |
+
output/
|
| 18 |
+
uploads/
|
| 19 |
+
pdfs/
|
| 20 |
+
|
| 21 |
+
# IDE
|
| 22 |
+
.vscode/
|
| 23 |
+
.idea/
|
| 24 |
+
*.swp
|
| 25 |
+
*.swo
|
| 26 |
+
|
| 27 |
+
# OS
|
| 28 |
+
.DS_Store
|
| 29 |
+
Thumbs.db
|
| 30 |
+
|
| 31 |
+
# Logs
|
| 32 |
+
*.log
|
Dockerfile
CHANGED
|
@@ -1,45 +1,40 @@
|
|
| 1 |
-
#
|
| 2 |
FROM python:3.12-slim
|
| 3 |
|
| 4 |
-
#
|
|
|
|
|
|
|
|
|
|
| 5 |
RUN apt-get update && apt-get install -y \
|
| 6 |
-
|
| 7 |
-
|
|
|
|
| 8 |
libglib2.0-0 \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
# Create a non-root user (Hugging Face Spaces best practice)
|
| 12 |
-
RUN useradd -m -u 1000 user
|
| 13 |
-
USER user
|
| 14 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
| 15 |
-
|
| 16 |
-
# Set working directory
|
| 17 |
-
WORKDIR /app
|
| 18 |
-
|
| 19 |
# Copy requirements first for better caching
|
| 20 |
-
COPY
|
| 21 |
|
| 22 |
# Install Python dependencies
|
| 23 |
-
RUN pip install --no-cache-dir
|
| 24 |
|
| 25 |
# Copy application files
|
| 26 |
-
COPY
|
| 27 |
-
|
| 28 |
-
# Make entrypoint script executable
|
| 29 |
-
RUN chmod +x /app/entrypoint.sh
|
| 30 |
|
| 31 |
# Create necessary directories
|
| 32 |
-
RUN mkdir -p uploads
|
| 33 |
|
| 34 |
-
# Expose
|
| 35 |
EXPOSE 7860
|
| 36 |
|
| 37 |
-
#
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
ENV PORT=7860
|
| 41 |
-
|
| 42 |
-
# Run the Flask app
|
| 43 |
-
# Hugging Face Spaces expects the app to listen on 0.0.0.0 and port 7860
|
| 44 |
-
CMD ["/app/entrypoint.sh"]
|
| 45 |
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Spaces Docker Runtime
|
| 2 |
FROM python:3.12-slim
|
| 3 |
|
| 4 |
+
# Set working directory
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
RUN apt-get update && apt-get install -y \
|
| 9 |
+
tesseract-ocr \
|
| 10 |
+
tesseract-ocr-eng \
|
| 11 |
+
poppler-utils \
|
| 12 |
libglib2.0-0 \
|
| 13 |
+
libsm6 \
|
| 14 |
+
libxext6 \
|
| 15 |
+
libxrender-dev \
|
| 16 |
+
libgomp1 \
|
| 17 |
+
curl \
|
| 18 |
&& rm -rf /var/lib/apt/lists/*
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# Copy requirements first for better caching
|
| 21 |
+
COPY requirements.txt .
|
| 22 |
|
| 23 |
# Install Python dependencies
|
| 24 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 25 |
|
| 26 |
# Copy application files
|
| 27 |
+
COPY . .
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Create necessary directories
|
| 30 |
+
RUN mkdir -p output pdfs uploads
|
| 31 |
|
| 32 |
+
# Expose Hugging Face Spaces default port
|
| 33 |
EXPOSE 7860
|
| 34 |
|
| 35 |
+
# Health check
|
| 36 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
| 37 |
+
CMD curl -f http://localhost:7860/ || exit 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
# Run Flask app
|
| 40 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,56 +1,198 @@
|
|
| 1 |
-
|
| 2 |
-
title: PDF Layout Extractor
|
| 3 |
-
emoji: 📄
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
-
app_port: 7860
|
| 10 |
-
---
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
## Features
|
| 17 |
-
|
| 18 |
- **Layout-aware extraction** of figures and tables with YOLO-based detection
|
| 19 |
- **Cross-page stitching** for multi-page tables, captions, titles, and body text
|
| 20 |
- **Annotated PDF output** with bounding boxes for detected regions
|
| 21 |
-
- **Markdown export** powered by `pymupdf4llm`
|
| 22 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
|
| 36 |
-
|
| 37 |
-
-
|
| 38 |
-
-
|
| 39 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
- `figures/` - Extracted figure images (PNG)
|
| 48 |
-
- `tables/` - Extracted table images (PNG)
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
|
|
|
|
| 1 |
+
# PDF Layout Extraction Companion
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
A streamlined workflow for extracting figures, tables, annotated layouts, and markdown text from scientific PDFs using [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO), PyMuPDF, and Flask. The project exposes a command-line pipeline (`main.py`) and a modern Flask web UI (`app.py`).
|
| 4 |
|
| 5 |
+
---
|
| 6 |
|
| 7 |
## Features
|
|
|
|
| 8 |
- **Layout-aware extraction** of figures and tables with YOLO-based detection
|
| 9 |
- **Cross-page stitching** for multi-page tables, captions, titles, and body text
|
| 10 |
- **Annotated PDF output** with bounding boxes for detected regions
|
| 11 |
+
- **Markdown export** powered by `pymupdf4llm` / `pymupdf-layout`
|
| 12 |
+
- **Flask Web UI** with modern design, dark/light theme, GPU/CPU status, and individual PDF viewing
|
| 13 |
+
- Unified `output/<PDF stem>/` directory structure for CLI + UI runs
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## Requirements
|
| 18 |
+
- Python 3.12+
|
| 19 |
+
- [uv](https://docs.astral.sh/uv/latest/) (recommended) or `pip`
|
| 20 |
+
- GPU optional (DocLayout-YOLO runs on CPU as well)
|
| 21 |
+
|
| 22 |
+
Install dependencies:
|
| 23 |
+
```bash
|
| 24 |
+
uv pip install -r requirements.txt
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
> If you prefer a virtualenv, create and activate it first, then run the install command above inside it.
|
| 28 |
+
|
| 29 |
+
---
|
| 30 |
+
|
| 31 |
+
## Quick Start
|
| 32 |
+
|
| 33 |
+
### Command Line Pipeline
|
| 34 |
+
Process all PDFs in `./pdfs` and write outputs to `./output/<PDF stem>/`:
|
| 35 |
+
```bash
|
| 36 |
+
uv run python main.py
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
Each subdirectory contains:
|
| 40 |
+
- `*_content_list.json` – metadata for extracted figures/tables
|
| 41 |
+
- `*_layout.pdf` – annotated PDF with layout boxes
|
| 42 |
+
- `*.md` – markdown export (if `pymupdf4llm` is installed)
|
| 43 |
+
- `figures/` & `tables/` – cropped PNGs with stitched captions/titles
|
| 44 |
+
|
| 45 |
+
### Flask Web App (Recommended)
|
| 46 |
+
Launch the modern Flask web interface locally:
|
| 47 |
+
```bash
|
| 48 |
+
python run_flask_gpu.py
|
| 49 |
+
```
|
| 50 |
+
Then open your browser to `http://localhost:5000`
|
| 51 |
+
|
| 52 |
+
**Features:**
|
| 53 |
+
- Clean, modern UI with dark/light theme support
|
| 54 |
+
- Multiple PDF upload and processing
|
| 55 |
+
- **Real-time progress bar** with status updates
|
| 56 |
+
- Individual PDF output viewing with sidebar navigation
|
| 57 |
+
- Real-time GPU/CPU status display
|
| 58 |
+
- Image gallery for figures and tables
|
| 59 |
+
- Markdown preview and download
|
| 60 |
+
- Responsive design for mobile and desktop
|
| 61 |
+
- **REST API** for programmatic access
|
| 62 |
+
|
| 63 |
+
All Flask app runs also write into `./output/<PDF stem>/` using the same structure as the CLI.
|
| 64 |
+
|
| 65 |
+
### Deploy to Hugging Face Spaces (Docker)
|
| 66 |
+
Deploy your Flask app to Hugging Face Spaces with Docker:
|
| 67 |
|
| 68 |
+
1. **Create a new Space on Hugging Face:**
|
| 69 |
+
- Go to [Hugging Face Spaces](https://huggingface.co/spaces)
|
| 70 |
+
- Click "Create new Space"
|
| 71 |
+
- Choose "Docker" as the SDK
|
| 72 |
+
- Set visibility (public/private)
|
| 73 |
|
| 74 |
+
2. **Push your code:**
|
| 75 |
+
```bash
|
| 76 |
+
# Clone your space (replace with your space name)
|
| 77 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
|
| 78 |
+
cd YOUR_SPACE_NAME
|
| 79 |
+
|
| 80 |
+
# Copy your project files
|
| 81 |
+
cp -r /path/to/pdf-minor-allegations/* .
|
| 82 |
+
|
| 83 |
+
# Commit and push
|
| 84 |
+
git add .
|
| 85 |
+
git commit -m "Initial deployment"
|
| 86 |
+
git push
|
| 87 |
+
```
|
| 88 |
|
| 89 |
+
3. **Your Space will automatically build and deploy!**
|
| 90 |
|
| 91 |
+
**Features:**
|
| 92 |
+
- **REST API endpoints** for programmatic access
|
| 93 |
+
- **Real-time progress tracking** with progress bar
|
| 94 |
+
- **Multiple processing modes:** Images only, Markdown only, or Both
|
| 95 |
+
- **Background processing** - upload files and track progress via API
|
| 96 |
+
- **Modern web UI** with dark/light theme
|
| 97 |
+
- **GPU/CPU support** - automatically detects available hardware
|
| 98 |
+
- **Free tier available** with CPU instances
|
| 99 |
+
- **Automatic HTTPS** and custom domain support
|
| 100 |
|
| 101 |
+
**API Endpoints:**
|
| 102 |
+
- `POST /api/upload` - Upload PDFs for processing (returns `task_id`)
|
| 103 |
+
- `GET /api/progress/<task_id>` - Get processing progress (0-100%)
|
| 104 |
+
- `GET /api/pdf-list` - List all processed PDFs
|
| 105 |
+
- `GET /api/pdf-details/<pdf_stem>` - Get details for a processed PDF
|
| 106 |
+
- `GET /api/device-info` - Get GPU/CPU device information
|
| 107 |
+
- `GET /output/<path>` - Download processed files (PDFs, images, markdown)
|
| 108 |
|
| 109 |
+
**Example API Usage:**
|
| 110 |
+
```python
|
| 111 |
+
import requests
|
| 112 |
+
import time
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
# Upload a PDF
|
| 115 |
+
files = {'files[]': open('document.pdf', 'rb')}
|
| 116 |
+
data = {'extraction_mode': 'both'} # or 'images' or 'markdown'
|
| 117 |
+
response = requests.post('https://YOUR_SPACE.hf.space/api/upload', files=files, data=data)
|
| 118 |
+
task_id = response.json()['task_id']
|
| 119 |
|
| 120 |
+
# Poll for progress
|
| 121 |
+
while True:
|
| 122 |
+
progress = requests.get(f'https://YOUR_SPACE.hf.space/api/progress/{task_id}').json()
|
| 123 |
+
print(f"Progress: {progress['progress']}% - {progress['message']}")
|
| 124 |
+
if progress['status'] == 'completed':
|
| 125 |
+
break
|
| 126 |
+
time.sleep(0.5)
|
| 127 |
+
|
| 128 |
+
# Get results
|
| 129 |
+
results = progress['results']
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
### Deploy to Modal.com (Cloud with GPU)
|
| 133 |
+
Deploy your Flask app online with GPU support using Modal:
|
| 134 |
+
```bash
|
| 135 |
+
# Install Modal CLI
|
| 136 |
+
pip install modal
|
| 137 |
+
|
| 138 |
+
# Authenticate with Modal
|
| 139 |
+
modal token new
|
| 140 |
+
|
| 141 |
+
# Deploy to Modal
|
| 142 |
+
modal deploy modal_app.py
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
See [MODAL_DEPLOYMENT.md](MODAL_DEPLOYMENT.md) for detailed instructions.
|
| 146 |
+
|
| 147 |
+
**Benefits:**
|
| 148 |
+
- GPU support (T4, A10G, or A100)
|
| 149 |
+
- Pay-per-use pricing
|
| 150 |
+
- Automatic HTTPS
|
| 151 |
+
- Auto-scaling
|
| 152 |
+
- Global deployment
|
| 153 |
+
|
| 154 |
+
---
|
| 155 |
+
|
| 156 |
+
## Configuration Highlights
|
| 157 |
+
- **Detection model:** DocLayout-YOLO (`doclayout_yolo_docstructbench_imgsz1024.pt`)
|
| 158 |
+
- **Detection thresholds:** configurable in `main.py`
|
| 159 |
+
- **Layout stitching:** tables, captions, titles, body text
|
| 160 |
+
- **Markdown extraction:** defaults to enabled (`pymupdf4llm.to_markdown`); falls back gracefully if the package is missing
|
| 161 |
+
- **Output directory:** `./output` (configurable near the bottom of `main.py`)
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## File Overview
|
| 166 |
+
| Path | Description |
|
| 167 |
+
|------|-------------|
|
| 168 |
+
| `main.py` | CLI pipeline for batch PDF processing |
|
| 169 |
+
| `app.py` | Flask web application (recommended UI) with API endpoints |
|
| 170 |
+
| `run_flask_gpu.py` | Local Flask runner with GPU support |
|
| 171 |
+
| `Dockerfile` | Docker configuration for Hugging Face Spaces deployment |
|
| 172 |
+
| `modal_app.py` | Modal.com deployment configuration (cloud GPU) |
|
| 173 |
+
| `MODAL_DEPLOYMENT.md` | Modal.com deployment guide |
|
| 174 |
+
| `templates/` | Flask HTML templates |
|
| 175 |
+
| `static/` | Flask static files (CSS, JS) |
|
| 176 |
+
| `pdfs/` | Source PDFs (gitignored) |
|
| 177 |
+
| `output/` | Generated outputs per PDF |
|
| 178 |
+
| `pyproject.toml` | Project metadata & dependency list |
|
| 179 |
+
| `uv.lock` | Locked dependency versions (auto-maintained by `uv`) |
|
| 180 |
+
|
| 181 |
+
---
|
| 182 |
+
|
| 183 |
+
## Troubleshooting
|
| 184 |
+
- **`ModuleNotFoundError: pymupdf4llm`** – install it via `uv pip install pymupdf4llm` (already listed in `pyproject.toml`).
|
| 185 |
+
- **Slow performance** – ensure GPU CUDA drivers are available or reduce concurrency by toggling `USE_MULTIPROCESSING` in `main.py`.
|
| 186 |
+
- **Large outputs** – clean the `output/` directory before reruns to avoid confusing duplicates.
|
| 187 |
+
|
| 188 |
+
For additional logging, set `LOG_LEVEL` or edit the `logger` configuration in `main.py`.
|
| 189 |
+
|
| 190 |
+
---
|
| 191 |
|
| 192 |
+
## Acknowledgements
|
| 193 |
+
- [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO)
|
| 194 |
+
- [PyMuPDF](https://pymupdf.readthedocs.io/)
|
| 195 |
+
- [PyMuPDF4LLM](https://github.com/pymupdf/RAG/blob/main/pymupdf4llm.md)
|
| 196 |
+
- [Flask](https://flask.palletsprojects.com/)
|
| 197 |
|
| 198 |
+
Happy extracting! 🎉
|
app.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import shutil
|
|
|
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Dict, List, Optional
|
| 6 |
from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
|
|
@@ -22,6 +24,10 @@ os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
|
|
| 22 |
# Global model instance
|
| 23 |
_model = None
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def get_device_info() -> Dict[str, any]:
|
| 27 |
"""Get information about GPU/CPU availability."""
|
|
@@ -65,97 +71,184 @@ def device_info():
|
|
| 65 |
return jsonify(get_device_info())
|
| 66 |
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
@app.route('/api/upload', methods=['POST'])
|
| 69 |
def upload_files():
|
| 70 |
-
"""Handle multiple PDF file uploads."""
|
| 71 |
if 'files[]' not in request.files:
|
| 72 |
return jsonify({'error': 'No files provided'}), 400
|
| 73 |
|
| 74 |
files = request.files.getlist('files[]')
|
| 75 |
extraction_mode = request.form.get('extraction_mode', 'images')
|
| 76 |
-
include_images = extraction_mode != 'markdown'
|
| 77 |
-
include_markdown = extraction_mode != 'images'
|
| 78 |
|
| 79 |
if not files or all(f.filename == '' for f in files):
|
| 80 |
return jsonify({'error': 'No files selected'}), 400
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
|
|
|
|
|
|
| 84 |
for file in files:
|
| 85 |
if file and file.filename.endswith('.pdf'):
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
# Prepare output directory
|
| 94 |
-
output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
|
| 95 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 96 |
-
|
| 97 |
-
# Copy PDF to output directory
|
| 98 |
-
pdf_path = output_dir / filename
|
| 99 |
-
upload_path.rename(pdf_path)
|
| 100 |
-
|
| 101 |
-
# Process PDF
|
| 102 |
-
extractor.USE_MULTIPROCESSING = False
|
| 103 |
-
logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
|
| 104 |
-
|
| 105 |
-
if include_images:
|
| 106 |
-
load_model_once()
|
| 107 |
-
|
| 108 |
-
extractor.process_pdf_with_pool(
|
| 109 |
-
pdf_path,
|
| 110 |
-
output_dir,
|
| 111 |
-
pool=None,
|
| 112 |
-
extract_images=include_images,
|
| 113 |
-
extract_markdown=include_markdown,
|
| 114 |
-
)
|
| 115 |
-
|
| 116 |
-
# Collect results
|
| 117 |
-
json_path = output_dir / f"{stem}_content_list.json"
|
| 118 |
-
elements = []
|
| 119 |
-
if include_images and json_path.exists():
|
| 120 |
-
elements = json.loads(json_path.read_text(encoding='utf-8'))
|
| 121 |
-
|
| 122 |
-
annotated_pdf = None
|
| 123 |
-
if include_images:
|
| 124 |
-
candidate_pdf = output_dir / f"{stem}_layout.pdf"
|
| 125 |
-
if candidate_pdf.exists():
|
| 126 |
-
annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
|
| 127 |
-
|
| 128 |
-
markdown_path = None
|
| 129 |
-
if include_markdown:
|
| 130 |
-
candidate_md = output_dir / f"{stem}.md"
|
| 131 |
-
if candidate_md.exists():
|
| 132 |
-
markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
|
| 133 |
-
|
| 134 |
-
# Get figure and table counts
|
| 135 |
-
figures = [e for e in elements if e.get('type') == 'figure']
|
| 136 |
-
tables = [e for e in elements if e.get('type') == 'table']
|
| 137 |
-
|
| 138 |
-
results.append({
|
| 139 |
-
'filename': filename,
|
| 140 |
-
'stem': stem,
|
| 141 |
-
'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
|
| 142 |
-
'figures_count': len(figures),
|
| 143 |
-
'tables_count': len(tables),
|
| 144 |
-
'elements_count': len(elements),
|
| 145 |
-
'annotated_pdf': annotated_pdf,
|
| 146 |
-
'markdown_path': markdown_path,
|
| 147 |
-
'include_images': include_images,
|
| 148 |
-
'include_markdown': include_markdown,
|
| 149 |
-
})
|
| 150 |
-
|
| 151 |
-
except Exception as e:
|
| 152 |
-
logger.error(f"Error processing {file.filename}: {e}")
|
| 153 |
-
results.append({
|
| 154 |
-
'filename': file.filename,
|
| 155 |
-
'error': str(e)
|
| 156 |
-
})
|
| 157 |
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
|
| 161 |
@app.route('/api/pdf-list')
|
|
@@ -290,7 +383,8 @@ def delete_pdf_by_path(stem: str):
|
|
| 290 |
|
| 291 |
|
| 292 |
if __name__ == '__main__':
|
| 293 |
-
port
|
|
|
|
| 294 |
app.run(debug=False, host='0.0.0.0', port=port)
|
| 295 |
|
| 296 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
+
import threading
|
| 5 |
+
import uuid
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Dict, List, Optional
|
| 8 |
from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
|
|
|
|
| 24 |
# Global model instance
|
| 25 |
_model = None
|
| 26 |
|
| 27 |
+
# Progress tracking: {task_id: {'status': 'processing'|'completed'|'error', 'progress': 0-100, 'message': str, 'results': [], 'file_progress': {filename: progress}}}
|
| 28 |
+
_progress_tracker: Dict[str, Dict] = {}
|
| 29 |
+
_progress_lock = threading.Lock()
|
| 30 |
+
|
| 31 |
|
| 32 |
def get_device_info() -> Dict[str, any]:
|
| 33 |
"""Get information about GPU/CPU availability."""
|
|
|
|
| 71 |
return jsonify(get_device_info())
|
| 72 |
|
| 73 |
|
| 74 |
+
def _update_task_progress(task_id: str, filename: str, file_progress: int, message: str):
|
| 75 |
+
"""Update progress for a specific file and calculate overall progress."""
|
| 76 |
+
with _progress_lock:
|
| 77 |
+
if task_id not in _progress_tracker:
|
| 78 |
+
return
|
| 79 |
+
|
| 80 |
+
# Update file-specific progress
|
| 81 |
+
if 'file_progress' not in _progress_tracker[task_id]:
|
| 82 |
+
_progress_tracker[task_id]['file_progress'] = {}
|
| 83 |
+
_progress_tracker[task_id]['file_progress'][filename] = file_progress
|
| 84 |
+
|
| 85 |
+
# Calculate overall progress (average of all files)
|
| 86 |
+
file_progresses = _progress_tracker[task_id]['file_progress']
|
| 87 |
+
if file_progresses:
|
| 88 |
+
total_progress = sum(file_progresses.values()) / len(file_progresses)
|
| 89 |
+
_progress_tracker[task_id]['progress'] = int(total_progress)
|
| 90 |
+
|
| 91 |
+
_progress_tracker[task_id]['message'] = message
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def process_file_background(task_id: str, file, extraction_mode: str):
|
| 95 |
+
"""Process a single file in the background and update progress."""
|
| 96 |
+
filename = secure_filename(file.filename)
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
_update_task_progress(task_id, filename, 5, f'Processing {filename}...')
|
| 100 |
+
|
| 101 |
+
stem = Path(filename).stem
|
| 102 |
+
include_images = extraction_mode != 'markdown'
|
| 103 |
+
include_markdown = extraction_mode != 'images'
|
| 104 |
+
|
| 105 |
+
# Save uploaded file
|
| 106 |
+
upload_path = Path(app.config['UPLOAD_FOLDER']) / filename
|
| 107 |
+
file.save(str(upload_path))
|
| 108 |
+
|
| 109 |
+
_update_task_progress(task_id, filename, 15, f'Saved {filename}, preparing output...')
|
| 110 |
+
|
| 111 |
+
# Prepare output directory
|
| 112 |
+
output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
|
| 113 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 114 |
+
|
| 115 |
+
# Copy PDF to output directory
|
| 116 |
+
pdf_path = output_dir / filename
|
| 117 |
+
upload_path.rename(pdf_path)
|
| 118 |
+
|
| 119 |
+
_update_task_progress(task_id, filename, 25, f'Loading model and processing {filename}...')
|
| 120 |
+
|
| 121 |
+
# Process PDF
|
| 122 |
+
extractor.USE_MULTIPROCESSING = False
|
| 123 |
+
logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
|
| 124 |
+
|
| 125 |
+
if include_images:
|
| 126 |
+
load_model_once()
|
| 127 |
+
|
| 128 |
+
_update_task_progress(task_id, filename, 30, f'Extracting content from {filename}...')
|
| 129 |
+
|
| 130 |
+
extractor.process_pdf_with_pool(
|
| 131 |
+
pdf_path,
|
| 132 |
+
output_dir,
|
| 133 |
+
pool=None,
|
| 134 |
+
extract_images=include_images,
|
| 135 |
+
extract_markdown=include_markdown,
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
_update_task_progress(task_id, filename, 85, f'Collecting results for {filename}...')
|
| 139 |
+
|
| 140 |
+
# Collect results
|
| 141 |
+
json_path = output_dir / f"{stem}_content_list.json"
|
| 142 |
+
elements = []
|
| 143 |
+
if include_images and json_path.exists():
|
| 144 |
+
elements = json.loads(json_path.read_text(encoding='utf-8'))
|
| 145 |
+
|
| 146 |
+
annotated_pdf = None
|
| 147 |
+
if include_images:
|
| 148 |
+
candidate_pdf = output_dir / f"{stem}_layout.pdf"
|
| 149 |
+
if candidate_pdf.exists():
|
| 150 |
+
annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
|
| 151 |
+
|
| 152 |
+
markdown_path = None
|
| 153 |
+
if include_markdown:
|
| 154 |
+
candidate_md = output_dir / f"{stem}.md"
|
| 155 |
+
if candidate_md.exists():
|
| 156 |
+
markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
|
| 157 |
+
|
| 158 |
+
# Get figure and table counts
|
| 159 |
+
figures = [e for e in elements if e.get('type') == 'figure']
|
| 160 |
+
tables = [e for e in elements if e.get('type') == 'table']
|
| 161 |
+
|
| 162 |
+
result = {
|
| 163 |
+
'filename': filename,
|
| 164 |
+
'stem': stem,
|
| 165 |
+
'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
|
| 166 |
+
'figures_count': len(figures),
|
| 167 |
+
'tables_count': len(tables),
|
| 168 |
+
'elements_count': len(elements),
|
| 169 |
+
'annotated_pdf': annotated_pdf,
|
| 170 |
+
'markdown_path': markdown_path,
|
| 171 |
+
'include_images': include_images,
|
| 172 |
+
'include_markdown': include_markdown,
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
with _progress_lock:
|
| 176 |
+
_progress_tracker[task_id]['results'].append(result)
|
| 177 |
+
_update_task_progress(task_id, filename, 100, f'Completed processing {filename}')
|
| 178 |
+
|
| 179 |
+
# Check if all files are done
|
| 180 |
+
total_files = _progress_tracker[task_id].get('total_files', 1)
|
| 181 |
+
if len(_progress_tracker[task_id]['results']) >= total_files:
|
| 182 |
+
_progress_tracker[task_id]['status'] = 'completed'
|
| 183 |
+
_progress_tracker[task_id]['message'] = f'All {total_files} file(s) processed successfully'
|
| 184 |
+
|
| 185 |
+
except Exception as e:
|
| 186 |
+
logger.error(f"Error processing {file.filename}: {e}")
|
| 187 |
+
with _progress_lock:
|
| 188 |
+
_progress_tracker[task_id]['results'].append({
|
| 189 |
+
'filename': filename,
|
| 190 |
+
'error': str(e)
|
| 191 |
+
})
|
| 192 |
+
# Check if this was the last file
|
| 193 |
+
total_files = _progress_tracker[task_id].get('total_files', 1)
|
| 194 |
+
if len(_progress_tracker[task_id]['results']) >= total_files:
|
| 195 |
+
_progress_tracker[task_id]['status'] = 'error'
|
| 196 |
+
_progress_tracker[task_id]['message'] = f'Error processing {filename}: {str(e)}'
|
| 197 |
+
|
| 198 |
+
|
| 199 |
@app.route('/api/upload', methods=['POST'])
|
| 200 |
def upload_files():
|
| 201 |
+
"""Handle multiple PDF file uploads with background processing."""
|
| 202 |
if 'files[]' not in request.files:
|
| 203 |
return jsonify({'error': 'No files provided'}), 400
|
| 204 |
|
| 205 |
files = request.files.getlist('files[]')
|
| 206 |
extraction_mode = request.form.get('extraction_mode', 'images')
|
|
|
|
|
|
|
| 207 |
|
| 208 |
if not files or all(f.filename == '' for f in files):
|
| 209 |
return jsonify({'error': 'No files selected'}), 400
|
| 210 |
|
| 211 |
+
# Create a task ID for this upload
|
| 212 |
+
task_id = str(uuid.uuid4())
|
| 213 |
+
|
| 214 |
+
# Initialize progress tracking
|
| 215 |
+
with _progress_lock:
|
| 216 |
+
_progress_tracker[task_id] = {
|
| 217 |
+
'status': 'processing',
|
| 218 |
+
'progress': 0,
|
| 219 |
+
'message': 'Starting upload...',
|
| 220 |
+
'results': [],
|
| 221 |
+
'total_files': len([f for f in files if f.filename.endswith('.pdf')])
|
| 222 |
+
}
|
| 223 |
|
| 224 |
+
# Process files in background threads
|
| 225 |
+
threads = []
|
| 226 |
for file in files:
|
| 227 |
if file and file.filename.endswith('.pdf'):
|
| 228 |
+
thread = threading.Thread(
|
| 229 |
+
target=process_file_background,
|
| 230 |
+
args=(task_id, file, extraction_mode)
|
| 231 |
+
)
|
| 232 |
+
thread.daemon = True
|
| 233 |
+
thread.start()
|
| 234 |
+
threads.append(thread)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
+
# Return task ID immediately
|
| 237 |
+
return jsonify({
|
| 238 |
+
'task_id': task_id,
|
| 239 |
+
'message': 'Processing started',
|
| 240 |
+
'total_files': len(threads)
|
| 241 |
+
})
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
@app.route('/api/progress/<task_id>')
|
| 245 |
+
def get_progress(task_id):
|
| 246 |
+
"""Get progress for a processing task."""
|
| 247 |
+
with _progress_lock:
|
| 248 |
+
progress = _progress_tracker.get(task_id)
|
| 249 |
+
if not progress:
|
| 250 |
+
return jsonify({'error': 'Task not found'}), 404
|
| 251 |
+
return jsonify(progress)
|
| 252 |
|
| 253 |
|
| 254 |
@app.route('/api/pdf-list')
|
|
|
|
| 383 |
|
| 384 |
|
| 385 |
if __name__ == '__main__':
|
| 386 |
+
# Listen on the port given by the PORT environment variable; default to 7860 (the Hugging Face Spaces port) when it is unset
|
| 387 |
+
port = int(os.environ.get('PORT', 7860))
|
| 388 |
app.run(debug=False, host='0.0.0.0', port=port)
|
| 389 |
|
| 390 |
|
entrypoint.sh
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
set -e
|
| 3 |
-
|
| 4 |
-
# Start Flask application
|
| 5 |
-
# Get port from environment variable or use default 7860
|
| 6 |
-
python -c "import os; port = int(os.environ.get('PORT', 7860)); from app import app; app.run(host='0.0.0.0', port=port, debug=False)"
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,13 +1,28 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
pypdfium2>=5.0.0
|
| 8 |
-
pymupdf4llm>=0.1.9
|
| 9 |
-
flask>=3.0.0
|
| 10 |
-
werkzeug>=3.0.0
|
| 11 |
-
torch>=2.0.0
|
| 12 |
-
torchvision>=0.15.0
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core PDF & Document Processing
|
| 2 |
+
doclayout-yolo==0.0.4
|
| 3 |
+
pymupdf==1.26.6
|
| 4 |
+
pymupdf-layout==0.0.15
|
| 5 |
+
pymupdf4llm==0.1.9
|
| 6 |
+
pypdfium2==5.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
# Deep Learning (CPU-optimized)
|
| 9 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 10 |
+
torch==2.5.1+cpu
|
| 11 |
+
torchvision==0.20.1+cpu
|
| 12 |
+
|
| 13 |
+
# Image Processing
|
| 14 |
+
pillow==12.0.0
|
| 15 |
+
opencv-python-headless==4.10.0.84
|
| 16 |
+
|
| 17 |
+
# OCR
|
| 18 |
+
pytesseract==0.3.13
|
| 19 |
+
|
| 20 |
+
# Utilities
|
| 21 |
+
huggingface-hub==0.26.2
|
| 22 |
+
loguru==0.7.3
|
| 23 |
+
numpy==1.26.4
|
| 24 |
+
|
| 25 |
+
# Web Framework
|
| 26 |
+
flask==3.0.0
|
| 27 |
+
werkzeug==3.0.1
|
| 28 |
+
streamlit==1.40.1
|
static/js/app.js
CHANGED
|
@@ -84,10 +84,21 @@ async function handleUpload(e) {
|
|
| 84 |
const extractionMode = document.querySelector('input[name="extractionMode"]:checked').value;
|
| 85 |
|
| 86 |
// Show processing section
|
| 87 |
-
document.getElementById('processingSection')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
document.getElementById('resultsSection').style.display = 'none';
|
| 89 |
document.getElementById('emptyState').style.display = 'none';
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
const formData = new FormData();
|
| 92 |
for (let i = 0; i < files.length; i++) {
|
| 93 |
formData.append('files[]', files[i]);
|
|
@@ -106,18 +117,13 @@ async function handleUpload(e) {
|
|
| 106 |
throw new Error(data.error);
|
| 107 |
}
|
| 108 |
|
| 109 |
-
//
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
if (data.results && data.results.length > 0) {
|
| 117 |
-
const firstPdf = data.results[0];
|
| 118 |
-
if (!firstPdf.error) {
|
| 119 |
-
showPdfDetails(firstPdf.stem);
|
| 120 |
-
}
|
| 121 |
}
|
| 122 |
|
| 123 |
// Reset form
|
|
@@ -126,10 +132,82 @@ async function handleUpload(e) {
|
|
| 126 |
} catch (error) {
|
| 127 |
console.error('Upload error:', error);
|
| 128 |
alert('Error processing files: ' + error.message);
|
| 129 |
-
|
| 130 |
}
|
| 131 |
}
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
// Load PDF List
|
| 134 |
async function loadPdfList() {
|
| 135 |
try {
|
|
|
|
| 84 |
const extractionMode = document.querySelector('input[name="extractionMode"]:checked').value;
|
| 85 |
|
| 86 |
// Show processing section
|
| 87 |
+
const processingSection = document.getElementById('processingSection');
|
| 88 |
+
const processingStatus = document.getElementById('processingStatus');
|
| 89 |
+
const progressBar = document.getElementById('progressBar');
|
| 90 |
+
const progressBarFill = document.getElementById('progressBarFill');
|
| 91 |
+
|
| 92 |
+
processingSection.style.display = 'block';
|
| 93 |
document.getElementById('resultsSection').style.display = 'none';
|
| 94 |
document.getElementById('emptyState').style.display = 'none';
|
| 95 |
|
| 96 |
+
// Update processing UI
|
| 97 |
+
processingStatus.textContent = 'Uploading files...';
|
| 98 |
+
if (progressBarFill) {
|
| 99 |
+
progressBarFill.style.width = '0%';
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
const formData = new FormData();
|
| 103 |
for (let i = 0; i < files.length; i++) {
|
| 104 |
formData.append('files[]', files[i]);
|
|
|
|
| 117 |
throw new Error(data.error);
|
| 118 |
}
|
| 119 |
|
| 120 |
+
// Start polling for progress
|
| 121 |
+
if (data.task_id) {
|
| 122 |
+
await pollProgress(data.task_id, processingStatus, progressBarFill);
|
| 123 |
+
} else {
|
| 124 |
+
// Fallback for old API
|
| 125 |
+
processingSection.style.display = 'none';
|
| 126 |
+
await loadPdfList();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
}
|
| 128 |
|
| 129 |
// Reset form
|
|
|
|
| 132 |
} catch (error) {
|
| 133 |
console.error('Upload error:', error);
|
| 134 |
alert('Error processing files: ' + error.message);
|
| 135 |
+
processingSection.style.display = 'none';
|
| 136 |
}
|
| 137 |
}
|
| 138 |
|
| 139 |
+
// Poll for progress updates
|
| 140 |
+
async function pollProgress(taskId, statusElement, progressBarFill) {
|
| 141 |
+
const maxAttempts = 600; // 5 minutes max (600 * 0.5s)
|
| 142 |
+
let attempts = 0;
|
| 143 |
+
|
| 144 |
+
const poll = async () => {
|
| 145 |
+
try {
|
| 146 |
+
const response = await fetch(`/api/progress/${taskId}`);
|
| 147 |
+
const data = await response.json();
|
| 148 |
+
|
| 149 |
+
if (data.error) {
|
| 150 |
+
throw new Error(data.error);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
// Update progress bar
|
| 154 |
+
if (progressBarFill) {
|
| 155 |
+
const progress = data.progress || 0;
|
| 156 |
+
progressBarFill.style.width = `${progress}%`;
|
| 157 |
+
progressBarFill.setAttribute('aria-valuenow', progress);
|
| 158 |
+
const progressText = document.getElementById('progressBarText');
|
| 159 |
+
if (progressText) {
|
| 160 |
+
progressText.textContent = `${Math.round(progress)}%`;
|
| 161 |
+
}
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
// Update status message
|
| 165 |
+
if (statusElement) {
|
| 166 |
+
statusElement.textContent = data.message || 'Processing...';
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
// Check if completed
|
| 170 |
+
if (data.status === 'completed') {
|
| 171 |
+
// Hide processing section
|
| 172 |
+
document.getElementById('processingSection').style.display = 'none';
|
| 173 |
+
|
| 174 |
+
// Reload PDF list and show results
|
| 175 |
+
await loadPdfList();
|
| 176 |
+
|
| 177 |
+
// Show first PDF details if available
|
| 178 |
+
if (data.results && data.results.length > 0) {
|
| 179 |
+
const firstPdf = data.results[0];
|
| 180 |
+
if (!firstPdf.error) {
|
| 181 |
+
showPdfDetails(firstPdf.stem);
|
| 182 |
+
}
|
| 183 |
+
}
|
| 184 |
+
return;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
// Check if error
|
| 188 |
+
if (data.status === 'error') {
|
| 189 |
+
throw new Error(data.message || 'Processing failed');
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
// Continue polling
|
| 193 |
+
attempts++;
|
| 194 |
+
if (attempts < maxAttempts) {
|
| 195 |
+
setTimeout(poll, 500); // Poll every 500ms
|
| 196 |
+
} else {
|
| 197 |
+
throw new Error('Processing timeout - please try again');
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
} catch (error) {
|
| 201 |
+
console.error('Progress polling error:', error);
|
| 202 |
+
document.getElementById('processingSection').style.display = 'none';
|
| 203 |
+
alert('Error: ' + error.message);
|
| 204 |
+
}
|
| 205 |
+
};
|
| 206 |
+
|
| 207 |
+
// Start polling
|
| 208 |
+
poll();
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
// Load PDF List
|
| 212 |
async function loadPdfList() {
|
| 213 |
try {
|
templates/index.html
CHANGED
|
@@ -120,15 +120,25 @@
|
|
| 120 |
<div class="col-12">
|
| 121 |
<div class="card shadow-sm">
|
| 122 |
<div class="card-body">
|
| 123 |
-
<div class="d-flex align-items-center">
|
| 124 |
<div class="spinner-border text-primary me-3" role="status">
|
| 125 |
<span class="visually-hidden">Loading...</span>
|
| 126 |
</div>
|
| 127 |
-
<div>
|
| 128 |
<h6 class="mb-0">Processing PDFs...</h6>
|
| 129 |
<small class="text-muted" id="processingStatus">Please wait</small>
|
| 130 |
</div>
|
| 131 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
</div>
|
| 133 |
</div>
|
| 134 |
</div>
|
|
|
|
| 120 |
<div class="col-12">
|
| 121 |
<div class="card shadow-sm">
|
| 122 |
<div class="card-body">
|
| 123 |
+
<div class="d-flex align-items-center mb-3">
|
| 124 |
<div class="spinner-border text-primary me-3" role="status">
|
| 125 |
<span class="visually-hidden">Loading...</span>
|
| 126 |
</div>
|
| 127 |
+
<div class="flex-grow-1">
|
| 128 |
<h6 class="mb-0">Processing PDFs...</h6>
|
| 129 |
<small class="text-muted" id="processingStatus">Please wait</small>
|
| 130 |
</div>
|
| 131 |
</div>
|
| 132 |
+
<div class="progress" style="height: 25px;">
|
| 133 |
+
<div id="progressBarFill" class="progress-bar progress-bar-striped progress-bar-animated"
|
| 134 |
+
role="progressbar"
|
| 135 |
+
style="width: 0%"
|
| 136 |
+
aria-valuenow="0"
|
| 137 |
+
aria-valuemin="0"
|
| 138 |
+
aria-valuemax="100">
|
| 139 |
+
<span id="progressBarText">0%</span>
|
| 140 |
+
</div>
|
| 141 |
+
</div>
|
| 142 |
</div>
|
| 143 |
</div>
|
| 144 |
</div>
|