saifisvibinn committed on
Commit
a20a1e3
·
1 Parent(s): 5b16707

Deploy PDF extraction tool with API and progress tracking

Browse files
Files changed (10) hide show
  1. .dockerignore +36 -20
  2. .gitattributes +0 -35
  3. .gitignore +32 -0
  4. Dockerfile +23 -28
  5. README.md +181 -39
  6. app.py +171 -77
  7. entrypoint.sh +0 -7
  8. requirements.txt +27 -12
  9. static/js/app.js +92 -14
  10. templates/index.html +12 -2
.dockerignore CHANGED
@@ -1,25 +1,41 @@
1
- __pycache__
2
- *.pyc
3
- *.pyo
4
- *.pyd
5
- .Python
6
  *.so
7
- *.egg
8
- *.egg-info
9
- dist
10
- build
11
- .git
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  .gitignore
13
- .vscode
14
- .idea
15
- *.md
16
- !README.md
17
- pdfs/
18
  output/
19
  uploads/
20
- uv.lock
21
- .env
22
- .venv
23
- venv/
24
- env/
 
 
 
 
25
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
 
5
  *.so
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+
11
+ # Virtual environments
12
+ venv/
13
+ env/
14
+ ENV/
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+
22
+ # Git
23
+ .git/
24
  .gitignore
25
+
26
+ # Output directories (will be created in container)
 
 
 
27
  output/
28
  uploads/
29
+ pdfs/
30
+
31
+ # Documentation
32
+ *.md
33
+ !README.md
34
+
35
+ # Docker
36
+ Dockerfile
37
+ .dockerignore
38
 
39
+ # Other
40
+ .DS_Store
41
+ *.log
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+
11
+ # Virtual environments
12
+ venv/
13
+ env/
14
+ ENV/
15
+
16
+ # Output directories
17
+ output/
18
+ uploads/
19
+ pdfs/
20
+
21
+ # IDE
22
+ .vscode/
23
+ .idea/
24
+ *.swp
25
+ *.swo
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # Logs
32
+ *.log
Dockerfile CHANGED
@@ -1,45 +1,40 @@
1
- # Use Python 3.12 slim image as base
2
  FROM python:3.12-slim
3
 
4
- # Install system dependencies (as root)
 
 
 
5
  RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- libgl1 \
 
8
  libglib2.0-0 \
 
 
 
 
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # Create a non-root user (Hugging Face Spaces best practice)
12
- RUN useradd -m -u 1000 user
13
- USER user
14
- ENV PATH="/home/user/.local/bin:$PATH"
15
-
16
- # Set working directory
17
- WORKDIR /app
18
-
19
  # Copy requirements first for better caching
20
- COPY --chown=user ./requirements.txt requirements.txt
21
 
22
  # Install Python dependencies
23
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
24
 
25
  # Copy application files
26
- COPY --chown=user . /app
27
-
28
- # Make entrypoint script executable
29
- RUN chmod +x /app/entrypoint.sh
30
 
31
  # Create necessary directories
32
- RUN mkdir -p uploads output
33
 
34
- # Expose port (Hugging Face Spaces uses 7860)
35
  EXPOSE 7860
36
 
37
- # Set environment variables
38
- ENV FLASK_APP=app.py
39
- ENV PYTHONUNBUFFERED=1
40
- ENV PORT=7860
41
-
42
- # Run the Flask app
43
- # Hugging Face Spaces expects the app to listen on 0.0.0.0 and port 7860
44
- CMD ["/app/entrypoint.sh"]
45
 
 
 
 
1
+ # Hugging Face Spaces Docker Runtime
2
  FROM python:3.12-slim
3
 
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
  RUN apt-get update && apt-get install -y \
9
+ tesseract-ocr \
10
+ tesseract-ocr-eng \
11
+ poppler-utils \
12
  libglib2.0-0 \
13
+ libsm6 \
14
+ libxext6 \
15
+ libxrender-dev \
16
+ libgomp1 \
17
+ curl \
18
  && rm -rf /var/lib/apt/lists/*
19
 
 
 
 
 
 
 
 
 
20
  # Copy requirements first for better caching
21
+ COPY requirements.txt .
22
 
23
  # Install Python dependencies
24
+ RUN pip install --no-cache-dir -r requirements.txt
25
 
26
  # Copy application files
27
+ COPY . .
 
 
 
28
 
29
  # Create necessary directories
30
+ RUN mkdir -p output pdfs uploads
31
 
32
+ # Expose Hugging Face Spaces default port
33
  EXPOSE 7860
34
 
35
+ # Health check
36
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
37
+ CMD curl -f http://localhost:7860/ || exit 1
 
 
 
 
 
38
 
39
+ # Run Flask app
40
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,56 +1,198 @@
1
- ---
2
- title: PDF Layout Extractor
3
- emoji: 📄
4
- colorFrom: blue
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- app_port: 7860
10
- ---
11
 
12
- # PDF Layout Extractor
13
 
14
- A web application for extracting figures, tables, annotated layouts, and markdown text from scientific PDFs using [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO).
15
 
16
  ## Features
17
-
18
  - **Layout-aware extraction** of figures and tables with YOLO-based detection
19
  - **Cross-page stitching** for multi-page tables, captions, titles, and body text
20
  - **Annotated PDF output** with bounding boxes for detected regions
21
- - **Markdown export** powered by `pymupdf4llm`
22
- - **Modern Flask Web UI** with dark/light theme support
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- ## Usage
 
 
 
 
25
 
26
- 1. Upload one or more PDF files (max 500MB per file)
27
- 2. Choose extraction mode:
28
- - **Images Only**: Extract figures and tables with layout detection
29
- - **Markdown Only**: Extract text content as markdown
30
- - **Both**: Extract both images and markdown
31
- 3. Wait for processing to complete
32
- 4. View and download extracted figures, tables, annotated PDFs, and markdown files
 
 
 
 
 
 
 
33
 
34
- ## Technical Details
35
 
36
- - Built with Flask and DocLayout-YOLO
37
- - Supports both CPU and GPU processing (GPU recommended for faster processing)
38
- - Maximum file size: 500MB per PDF
39
- - Model: DocLayout-YOLO from `juliozhao/DocLayout-YOLO-DocStructBench`
 
 
 
 
 
40
 
41
- ## Output Structure
 
 
 
 
 
 
42
 
43
- Each processed PDF creates a directory with:
44
- - `*_content_list.json` - Metadata for extracted figures/tables
45
- - `*_layout.pdf` - Annotated PDF with layout bounding boxes
46
- - `*.md` - Markdown export of text content
47
- - `figures/` - Extracted figure images (PNG)
48
- - `tables/` - Extracted table images (PNG)
49
 
50
- ## Model Information
 
 
 
 
51
 
52
- This application uses the DocLayout-YOLO model for document layout detection. The model is automatically downloaded from Hugging Face Hub on first use.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- ## License
 
 
 
 
55
 
56
- MIT License
 
1
+ # PDF Layout Extraction Companion
 
 
 
 
 
 
 
 
 
2
 
3
+ A streamlined workflow for extracting figures, tables, annotated layouts, and markdown text from scientific PDFs using [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO), PyMuPDF, and Flask. The project exposes a command-line pipeline (`main.py`) and a modern Flask web UI (`app.py`).
4
 
5
+ ---
6
 
7
  ## Features
 
8
  - **Layout-aware extraction** of figures and tables with YOLO-based detection
9
  - **Cross-page stitching** for multi-page tables, captions, titles, and body text
10
  - **Annotated PDF output** with bounding boxes for detected regions
11
+ - **Markdown export** powered by `pymupdf4llm` / `pymupdf-layout`
12
+ - **Flask Web UI** with modern design, dark/light theme, GPU/CPU status, and individual PDF viewing
13
+ - Unified `output/<PDF stem>/` directory structure for CLI + UI runs
14
+
15
+ ---
16
+
17
+ ## Requirements
18
+ - Python 3.12+
19
+ - [uv](https://docs.astral.sh/uv/latest/) (recommended) or `pip`
20
+ - GPU optional (DocLayout-YOLO runs on CPU as well)
21
+
22
+ Install dependencies:
23
+ ```bash
24
+ uv pip install -r requirements.txt
25
+ ```
26
+
27
+ > If you prefer a virtualenv, create/activate it first, then run `uv pip install -r requirements.txt` inside.
28
+
29
+ ---
30
+
31
+ ## Quick Start
32
+
33
+ ### Command Line Pipeline
34
+ Process all PDFs in `./pdfs` and write outputs to `./output/<PDF stem>/`:
35
+ ```bash
36
+ uv run python main.py
37
+ ```
38
+
39
+ Each subdirectory contains:
40
+ - `*_content_list.json` – metadata for extracted figures/tables
41
+ - `*_layout.pdf` – annotated PDF with layout boxes
42
+ - `*.md` – markdown export (if `pymupdf4llm` is installed)
43
+ - `figures/` & `tables/` – cropped PNGs with stitched captions/titles
44
+
45
+ ### Flask Web App (Recommended)
46
+ Launch the modern Flask web interface locally:
47
+ ```bash
48
+ python run_flask_gpu.py
49
+ ```
50
+ Then open your browser to `http://localhost:5000`
51
+
52
+ **Features:**
53
+ - Clean, modern UI with dark/light theme support
54
+ - Multiple PDF upload and processing
55
+ - **Real-time progress bar** with status updates
56
+ - Individual PDF output viewing with sidebar navigation
57
+ - Real-time GPU/CPU status display
58
+ - Image gallery for figures and tables
59
+ - Markdown preview and download
60
+ - Responsive design for mobile and desktop
61
+ - **REST API** for programmatic access
62
+
63
+ All Flask app runs also write into `./output/<PDF stem>/` using the same structure as the CLI.
64
+
65
+ ### Deploy to Hugging Face Spaces (Docker)
66
+ Deploy your Flask app to Hugging Face Spaces with Docker:
67
 
68
+ 1. **Create a new Space on Hugging Face:**
69
+ - Go to [Hugging Face Spaces](https://huggingface.co/spaces)
70
+ - Click "Create new Space"
71
+ - Choose "Docker" as the SDK
72
+ - Set visibility (public/private)
73
 
74
+ 2. **Push your code:**
75
+ ```bash
76
+ # Clone your space (replace with your space name)
77
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
78
+ cd YOUR_SPACE_NAME
79
+
80
+ # Copy your project files
81
+ cp -r /path/to/pdf-minor-allegations/* .
82
+
83
+ # Commit and push
84
+ git add .
85
+ git commit -m "Initial deployment"
86
+ git push
87
+ ```
88
 
89
+ 3. **Your Space will automatically build and deploy!**
90
 
91
+ **Features:**
92
+ - **REST API endpoints** for programmatic access
93
+ - **Real-time progress tracking** with progress bar
94
+ - **Multiple processing modes:** Images only, Markdown only, or Both
95
+ - **Background processing** - upload files and track progress via API
96
+ - **Modern web UI** with dark/light theme
97
+ - **GPU/CPU support** - automatically detects available hardware
98
+ - **Free tier available** with CPU instances
99
+ - **Automatic HTTPS** and custom domain support
100
 
101
+ **API Endpoints:**
102
+ - `POST /api/upload` - Upload PDFs for processing (returns `task_id`)
103
+ - `GET /api/progress/<task_id>` - Get processing progress (0-100%)
104
+ - `GET /api/pdf-list` - List all processed PDFs
105
+ - `GET /api/pdf-details/<pdf_stem>` - Get details for a processed PDF
106
+ - `GET /api/device-info` - Get GPU/CPU device information
107
+ - `GET /output/<path>` - Download processed files (PDFs, images, markdown)
108
 
109
+ **Example API Usage:**
110
+ ```python
111
+ import requests
112
+ import time
 
 
113
 
114
+ # Upload a PDF
115
+ files = {'files[]': open('document.pdf', 'rb')}
116
+ data = {'extraction_mode': 'both'} # or 'images' or 'markdown'
117
+ response = requests.post('https://YOUR_SPACE.hf.space/api/upload', files=files, data=data)
118
+ task_id = response.json()['task_id']
119
 
120
+ # Poll for progress
121
+ while True:
122
+ progress = requests.get(f'https://YOUR_SPACE.hf.space/api/progress/{task_id}').json()
123
+ print(f"Progress: {progress['progress']}% - {progress['message']}")
124
+ if progress['status'] in ('completed', 'error'):
125
+ break
126
+ time.sleep(0.5)
127
+
128
+ # Get results
129
+ results = progress['results']
130
+ ```
131
+
132
+ ### Deploy to Modal.com (Cloud with GPU)
133
+ Deploy your Flask app online with GPU support using Modal:
134
+ ```bash
135
+ # Install Modal CLI
136
+ pip install modal
137
+
138
+ # Authenticate with Modal
139
+ modal token new
140
+
141
+ # Deploy to Modal
142
+ modal deploy modal_app.py
143
+ ```
144
+
145
+ See [MODAL_DEPLOYMENT.md](MODAL_DEPLOYMENT.md) for detailed instructions.
146
+
147
+ **Benefits:**
148
+ - GPU support (T4, A10G, or A100)
149
+ - Pay-per-use pricing
150
+ - Automatic HTTPS
151
+ - Auto-scaling
152
+ - Global deployment
153
+
154
+ ---
155
+
156
+ ## Configuration Highlights
157
+ - **Detection model:** DocLayout-YOLO (`doclayout_yolo_docstructbench_imgsz1024.pt`)
158
+ - **Detection thresholds:** configurable in `main.py`
159
+ - **Layout stitching:** tables, captions, titles, body text
160
+ - **Markdown extraction:** defaults to enabled (`pymupdf4llm.to_markdown`); falls back gracefully if the package is missing
161
+ - **Output directory:** `./output` (configurable near the bottom of `main.py`)
162
+
163
+ ---
164
+
165
+ ## File Overview
166
+ | Path | Description |
167
+ |------|-------------|
168
+ | `main.py` | CLI pipeline for batch PDF processing |
169
+ | `app.py` | Flask web application (recommended UI) with API endpoints |
170
+ | `run_flask_gpu.py` | Local Flask runner with GPU support |
171
+ | `Dockerfile` | Docker configuration for Hugging Face Spaces deployment |
172
+ | `modal_app.py` | Modal.com deployment configuration (cloud GPU) |
173
+ | `MODAL_DEPLOYMENT.md` | Modal.com deployment guide |
174
+ | `templates/` | Flask HTML templates |
175
+ | `static/` | Flask static files (CSS, JS) |
176
+ | `pdfs/` | Source PDFs (gitignored) |
177
+ | `output/` | Generated outputs per PDF |
178
+ | `pyproject.toml` | Project metadata & dependency list |
179
+ | `uv.lock` | Locked dependency versions (auto-maintained by `uv`) |
180
+
181
+ ---
182
+
183
+ ## Troubleshooting
184
+ - **`ModuleNotFoundError: pymupdf4llm`** – install it via `uv pip install pymupdf4llm` (already listed in `pyproject.toml`).
185
+ - **Slow performance** – ensure GPU CUDA drivers are available or reduce concurrency by toggling `USE_MULTIPROCESSING` in `main.py`.
186
+ - **Large outputs** – clean the `output/` directory before reruns to avoid confusing duplicates.
187
+
188
+ For additional logging, set `LOG_LEVEL` or edit the `logger` configuration in `main.py`.
189
+
190
+ ---
191
 
192
+ ## Acknowledgements
193
+ - [DocLayout-YOLO](https://github.com/juliozhao/DocLayout-YOLO)
194
+ - [PyMuPDF](https://pymupdf.readthedocs.io/)
195
+ - [PyMuPDF4LLM](https://github.com/pymupdf/RAG/blob/main/pymupdf4llm.md)
196
+ - [Flask](https://flask.palletsprojects.com/)
197
 
198
+ Happy extracting! 🎉
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import json
2
  import os
3
  import shutil
 
 
4
  from pathlib import Path
5
  from typing import Dict, List, Optional
6
  from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
@@ -22,6 +24,10 @@ os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
22
  # Global model instance
23
  _model = None
24
 
 
 
 
 
25
 
26
  def get_device_info() -> Dict[str, any]:
27
  """Get information about GPU/CPU availability."""
@@ -65,97 +71,184 @@ def device_info():
65
  return jsonify(get_device_info())
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  @app.route('/api/upload', methods=['POST'])
69
  def upload_files():
70
- """Handle multiple PDF file uploads."""
71
  if 'files[]' not in request.files:
72
  return jsonify({'error': 'No files provided'}), 400
73
 
74
  files = request.files.getlist('files[]')
75
  extraction_mode = request.form.get('extraction_mode', 'images')
76
- include_images = extraction_mode != 'markdown'
77
- include_markdown = extraction_mode != 'images'
78
 
79
  if not files or all(f.filename == '' for f in files):
80
  return jsonify({'error': 'No files selected'}), 400
81
 
82
- results = []
 
 
 
 
 
 
 
 
 
 
 
83
 
 
 
84
  for file in files:
85
  if file and file.filename.endswith('.pdf'):
86
- try:
87
- # Save uploaded file
88
- filename = secure_filename(file.filename)
89
- stem = Path(filename).stem
90
- upload_path = Path(app.config['UPLOAD_FOLDER']) / filename
91
- file.save(str(upload_path))
92
-
93
- # Prepare output directory
94
- output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
95
- output_dir.mkdir(parents=True, exist_ok=True)
96
-
97
- # Copy PDF to output directory
98
- pdf_path = output_dir / filename
99
- upload_path.rename(pdf_path)
100
-
101
- # Process PDF
102
- extractor.USE_MULTIPROCESSING = False
103
- logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
104
-
105
- if include_images:
106
- load_model_once()
107
-
108
- extractor.process_pdf_with_pool(
109
- pdf_path,
110
- output_dir,
111
- pool=None,
112
- extract_images=include_images,
113
- extract_markdown=include_markdown,
114
- )
115
-
116
- # Collect results
117
- json_path = output_dir / f"{stem}_content_list.json"
118
- elements = []
119
- if include_images and json_path.exists():
120
- elements = json.loads(json_path.read_text(encoding='utf-8'))
121
-
122
- annotated_pdf = None
123
- if include_images:
124
- candidate_pdf = output_dir / f"{stem}_layout.pdf"
125
- if candidate_pdf.exists():
126
- annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
127
-
128
- markdown_path = None
129
- if include_markdown:
130
- candidate_md = output_dir / f"{stem}.md"
131
- if candidate_md.exists():
132
- markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
133
-
134
- # Get figure and table counts
135
- figures = [e for e in elements if e.get('type') == 'figure']
136
- tables = [e for e in elements if e.get('type') == 'table']
137
-
138
- results.append({
139
- 'filename': filename,
140
- 'stem': stem,
141
- 'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
142
- 'figures_count': len(figures),
143
- 'tables_count': len(tables),
144
- 'elements_count': len(elements),
145
- 'annotated_pdf': annotated_pdf,
146
- 'markdown_path': markdown_path,
147
- 'include_images': include_images,
148
- 'include_markdown': include_markdown,
149
- })
150
-
151
- except Exception as e:
152
- logger.error(f"Error processing {file.filename}: {e}")
153
- results.append({
154
- 'filename': file.filename,
155
- 'error': str(e)
156
- })
157
 
158
- return jsonify({'results': results})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
 
161
  @app.route('/api/pdf-list')
@@ -290,7 +383,8 @@ def delete_pdf_by_path(stem: str):
290
 
291
 
292
  if __name__ == '__main__':
293
- port = int(os.environ.get('PORT', 5000))
 
294
  app.run(debug=False, host='0.0.0.0', port=port)
295
 
296
 
 
1
  import json
2
  import os
3
  import shutil
4
+ import threading
5
+ import uuid
6
  from pathlib import Path
7
  from typing import Dict, List, Optional
8
  from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
 
24
  # Global model instance
25
  _model = None
26
 
27
+ # Progress tracking: {task_id: {'status': 'processing'|'completed'|'error', 'progress': 0-100, 'message': str, 'results': [], 'file_progress': {filename: progress}}}
28
+ _progress_tracker: Dict[str, Dict] = {}
29
+ _progress_lock = threading.Lock()
30
+
31
 
32
  def get_device_info() -> Dict[str, any]:
33
  """Get information about GPU/CPU availability."""
 
71
  return jsonify(get_device_info())
72
 
73
 
74
+ def _update_task_progress(task_id: str, filename: str, file_progress: int, message: str):
75
+ """Update progress for a specific file and calculate overall progress."""
76
+ with _progress_lock:
77
+ if task_id not in _progress_tracker:
78
+ return
79
+
80
+ # Update file-specific progress
81
+ if 'file_progress' not in _progress_tracker[task_id]:
82
+ _progress_tracker[task_id]['file_progress'] = {}
83
+ _progress_tracker[task_id]['file_progress'][filename] = file_progress
84
+
85
+ # Calculate overall progress (average of all files)
86
+ file_progresses = _progress_tracker[task_id]['file_progress']
87
+ if file_progresses:
88
+ total_progress = sum(file_progresses.values()) / len(file_progresses)
89
+ _progress_tracker[task_id]['progress'] = int(total_progress)
90
+
91
+ _progress_tracker[task_id]['message'] = message
92
+
93
+
94
+ def process_file_background(task_id: str, file, extraction_mode: str):
95
+ """Process a single file in the background and update progress."""
96
+ filename = secure_filename(file.filename)
97
+
98
+ try:
99
+ _update_task_progress(task_id, filename, 5, f'Processing {filename}...')
100
+
101
+ stem = Path(filename).stem
102
+ include_images = extraction_mode != 'markdown'
103
+ include_markdown = extraction_mode != 'images'
104
+
105
+ # Save uploaded file
106
+ upload_path = Path(app.config['UPLOAD_FOLDER']) / filename
107
+ file.save(str(upload_path))
108
+
109
+ _update_task_progress(task_id, filename, 15, f'Saved {filename}, preparing output...')
110
+
111
+ # Prepare output directory
112
+ output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
113
+ output_dir.mkdir(parents=True, exist_ok=True)
114
+
115
+ # Move the uploaded PDF into the output directory
116
+ pdf_path = output_dir / filename
117
+ upload_path.rename(pdf_path)
118
+
119
+ _update_task_progress(task_id, filename, 25, f'Loading model and processing {filename}...')
120
+
121
+ # Process PDF
122
+ extractor.USE_MULTIPROCESSING = False
123
+ logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
124
+
125
+ if include_images:
126
+ load_model_once()
127
+
128
+ _update_task_progress(task_id, filename, 30, f'Extracting content from {filename}...')
129
+
130
+ extractor.process_pdf_with_pool(
131
+ pdf_path,
132
+ output_dir,
133
+ pool=None,
134
+ extract_images=include_images,
135
+ extract_markdown=include_markdown,
136
+ )
137
+
138
+ _update_task_progress(task_id, filename, 85, f'Collecting results for {filename}...')
139
+
140
+ # Collect results
141
+ json_path = output_dir / f"{stem}_content_list.json"
142
+ elements = []
143
+ if include_images and json_path.exists():
144
+ elements = json.loads(json_path.read_text(encoding='utf-8'))
145
+
146
+ annotated_pdf = None
147
+ if include_images:
148
+ candidate_pdf = output_dir / f"{stem}_layout.pdf"
149
+ if candidate_pdf.exists():
150
+ annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
151
+
152
+ markdown_path = None
153
+ if include_markdown:
154
+ candidate_md = output_dir / f"{stem}.md"
155
+ if candidate_md.exists():
156
+ markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
157
+
158
+ # Get figure and table counts
159
+ figures = [e for e in elements if e.get('type') == 'figure']
160
+ tables = [e for e in elements if e.get('type') == 'table']
161
+
162
+ result = {
163
+ 'filename': filename,
164
+ 'stem': stem,
165
+ 'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
166
+ 'figures_count': len(figures),
167
+ 'tables_count': len(tables),
168
+ 'elements_count': len(elements),
169
+ 'annotated_pdf': annotated_pdf,
170
+ 'markdown_path': markdown_path,
171
+ 'include_images': include_images,
172
+ 'include_markdown': include_markdown,
173
+ }
174
+
175
+ with _progress_lock:
176
+ _progress_tracker[task_id]['results'].append(result)
177
+ _update_task_progress(task_id, filename, 100, f'Completed processing {filename}')
178
+
179
+ # Check if all files are done
180
+ total_files = _progress_tracker[task_id].get('total_files', 1)
181
+ if len(_progress_tracker[task_id]['results']) >= total_files:
182
+ _progress_tracker[task_id]['status'] = 'completed'
183
+ _progress_tracker[task_id]['message'] = f'All {total_files} file(s) processed successfully'
184
+
185
+ except Exception as e:
186
+ logger.error(f"Error processing {file.filename}: {e}")
187
+ with _progress_lock:
188
+ _progress_tracker[task_id]['results'].append({
189
+ 'filename': filename,
190
+ 'error': str(e)
191
+ })
192
+ # Check if this was the last file
193
+ total_files = _progress_tracker[task_id].get('total_files', 1)
194
+ if len(_progress_tracker[task_id]['results']) >= total_files:
195
+ _progress_tracker[task_id]['status'] = 'error'
196
+ _progress_tracker[task_id]['message'] = f'Error processing {filename}: {str(e)}'
197
+
198
+
199
  @app.route('/api/upload', methods=['POST'])
200
  def upload_files():
201
+ """Handle multiple PDF file uploads with background processing."""
202
  if 'files[]' not in request.files:
203
  return jsonify({'error': 'No files provided'}), 400
204
 
205
  files = request.files.getlist('files[]')
206
  extraction_mode = request.form.get('extraction_mode', 'images')
 
 
207
 
208
  if not files or all(f.filename == '' for f in files):
209
  return jsonify({'error': 'No files selected'}), 400
210
 
211
+ # Create a task ID for this upload
212
+ task_id = str(uuid.uuid4())
213
+
214
+ # Initialize progress tracking
215
+ with _progress_lock:
216
+ _progress_tracker[task_id] = {
217
+ 'status': 'processing',
218
+ 'progress': 0,
219
+ 'message': 'Starting upload...',
220
+ 'results': [],
221
+ 'total_files': len([f for f in files if f.filename.endswith('.pdf')])
222
+ }
223
 
224
+ # Process files in background threads
225
+ threads = []
226
  for file in files:
227
  if file and file.filename.endswith('.pdf'):
228
+ thread = threading.Thread(
229
+ target=process_file_background,
230
+ args=(task_id, file, extraction_mode)
231
+ )
232
+ thread.daemon = True
233
+ thread.start()
234
+ threads.append(thread)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
+ # Return task ID immediately
237
+ return jsonify({
238
+ 'task_id': task_id,
239
+ 'message': 'Processing started',
240
+ 'total_files': len(threads)
241
+ })
242
+
243
+
244
+ @app.route('/api/progress/<task_id>')
245
+ def get_progress(task_id):
246
+ """Get progress for a processing task."""
247
+ with _progress_lock:
248
+ progress = _progress_tracker.get(task_id)
249
+ if not progress:
250
+ return jsonify({'error': 'Task not found'}), 404
251
+ return jsonify(progress)
252
 
253
 
254
  @app.route('/api/pdf-list')
 
383
 
384
 
385
  if __name__ == '__main__':
386
+ # Listen on $PORT if set; otherwise default to 7860 (the Hugging Face Spaces port)
387
+ port = int(os.environ.get('PORT', 7860))
388
  app.run(debug=False, host='0.0.0.0', port=port)
389
 
390
 
entrypoint.sh DELETED
@@ -1,7 +0,0 @@
1
- #!/bin/bash
2
- set -e
3
-
4
- # Start Flask application
5
- # Get port from environment variable or use default 7860
6
- python -c "import os; port = int(os.environ.get('PORT', 7860)); from app import app; app.run(host='0.0.0.0', port=port, debug=False)"
7
-
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,13 +1,28 @@
1
- doclayout-yolo>=0.0.4
2
- huggingface-hub>=1.1.2
3
- loguru>=0.7.3
4
- pillow>=12.0.0
5
- pymupdf>=1.26.6
6
- pymupdf-layout>=0.0.15
7
- pypdfium2>=5.0.0
8
- pymupdf4llm>=0.1.9
9
- flask>=3.0.0
10
- werkzeug>=3.0.0
11
- torch>=2.0.0
12
- torchvision>=0.15.0
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core PDF & Document Processing
2
+ doclayout-yolo==0.0.4
3
+ pymupdf==1.26.6
4
+ pymupdf-layout==0.0.15
5
+ pymupdf4llm==0.1.9
6
+ pypdfium2==5.0.0
 
 
 
 
 
 
7
 
8
+ # Deep Learning (CPU-optimized)
9
+ --extra-index-url https://download.pytorch.org/whl/cpu
10
+ torch==2.5.1+cpu
11
+ torchvision==0.20.1+cpu
12
+
13
+ # Image Processing
14
+ pillow==12.0.0
15
+ opencv-python-headless==4.10.0.84
16
+
17
+ # OCR
18
+ pytesseract==0.3.13
19
+
20
+ # Utilities
21
+ huggingface-hub==0.26.2
22
+ loguru==0.7.3
23
+ numpy==1.26.4
24
+
25
+ # Web Framework
26
+ flask==3.0.0
27
+ werkzeug==3.0.1
28
+ streamlit==1.40.1
static/js/app.js CHANGED
@@ -84,10 +84,21 @@ async function handleUpload(e) {
84
  const extractionMode = document.querySelector('input[name="extractionMode"]:checked').value;
85
 
86
  // Show processing section
87
- document.getElementById('processingSection').style.display = 'block';
 
 
 
 
 
88
  document.getElementById('resultsSection').style.display = 'none';
89
  document.getElementById('emptyState').style.display = 'none';
90
 
 
 
 
 
 
 
91
  const formData = new FormData();
92
  for (let i = 0; i < files.length; i++) {
93
  formData.append('files[]', files[i]);
@@ -106,18 +117,13 @@ async function handleUpload(e) {
106
  throw new Error(data.error);
107
  }
108
 
109
- // Hide processing section
110
- document.getElementById('processingSection').style.display = 'none';
111
-
112
- // Reload PDF list and show results
113
- await loadPdfList();
114
-
115
- // Show first PDF details if available
116
- if (data.results && data.results.length > 0) {
117
- const firstPdf = data.results[0];
118
- if (!firstPdf.error) {
119
- showPdfDetails(firstPdf.stem);
120
- }
121
  }
122
 
123
  // Reset form
@@ -126,10 +132,82 @@ async function handleUpload(e) {
126
  } catch (error) {
127
  console.error('Upload error:', error);
128
  alert('Error processing files: ' + error.message);
129
- document.getElementById('processingSection').style.display = 'none';
130
  }
131
  }
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  // Load PDF List
134
  async function loadPdfList() {
135
  try {
 
84
  const extractionMode = document.querySelector('input[name="extractionMode"]:checked').value;
85
 
86
  // Show processing section
87
+ const processingSection = document.getElementById('processingSection');
88
+ const processingStatus = document.getElementById('processingStatus');
89
+ const progressBar = document.getElementById('progressBar');
90
+ const progressBarFill = document.getElementById('progressBarFill');
91
+
92
+ processingSection.style.display = 'block';
93
  document.getElementById('resultsSection').style.display = 'none';
94
  document.getElementById('emptyState').style.display = 'none';
95
 
96
+ // Update processing UI
97
+ processingStatus.textContent = 'Uploading files...';
98
+ if (progressBarFill) {
99
+ progressBarFill.style.width = '0%';
100
+ }
101
+
102
  const formData = new FormData();
103
  for (let i = 0; i < files.length; i++) {
104
  formData.append('files[]', files[i]);
 
117
  throw new Error(data.error);
118
  }
119
 
120
+ // Start polling for progress
121
+ if (data.task_id) {
122
+ await pollProgress(data.task_id, processingStatus, progressBarFill);
123
+ } else {
124
+ // Fallback for old API
125
+ processingSection.style.display = 'none';
126
+ await loadPdfList();
 
 
 
 
 
127
  }
128
 
129
  // Reset form
 
132
  } catch (error) {
133
  console.error('Upload error:', error);
134
  alert('Error processing files: ' + error.message);
135
+ processingSection.style.display = 'none';
136
  }
137
  }
138
 
139
+ // Poll for progress updates
140
+ async function pollProgress(taskId, statusElement, progressBarFill) {
141
+ const maxAttempts = 600; // 5 minutes max (600 * 0.5s)
142
+ let attempts = 0;
143
+
144
+ const poll = async () => {
145
+ try {
146
+ const response = await fetch(`/api/progress/${taskId}`);
147
+ const data = await response.json();
148
+
149
+ if (data.error) {
150
+ throw new Error(data.error);
151
+ }
152
+
153
+ // Update progress bar
154
+ if (progressBarFill) {
155
+ const progress = data.progress || 0;
156
+ progressBarFill.style.width = `${progress}%`;
157
+ progressBarFill.setAttribute('aria-valuenow', progress);
158
+ const progressText = document.getElementById('progressBarText');
159
+ if (progressText) {
160
+ progressText.textContent = `${Math.round(progress)}%`;
161
+ }
162
+ }
163
+
164
+ // Update status message
165
+ if (statusElement) {
166
+ statusElement.textContent = data.message || 'Processing...';
167
+ }
168
+
169
+ // Check if completed
170
+ if (data.status === 'completed') {
171
+ // Hide processing section
172
+ document.getElementById('processingSection').style.display = 'none';
173
+
174
+ // Reload PDF list and show results
175
+ await loadPdfList();
176
+
177
+ // Show first PDF details if available
178
+ if (data.results && data.results.length > 0) {
179
+ const firstPdf = data.results[0];
180
+ if (!firstPdf.error) {
181
+ showPdfDetails(firstPdf.stem);
182
+ }
183
+ }
184
+ return;
185
+ }
186
+
187
+ // Check if error
188
+ if (data.status === 'error') {
189
+ throw new Error(data.message || 'Processing failed');
190
+ }
191
+
192
+ // Continue polling
193
+ attempts++;
194
+ if (attempts < maxAttempts) {
195
+ setTimeout(poll, 500); // Poll every 500ms
196
+ } else {
197
+ throw new Error('Processing timeout - please try again');
198
+ }
199
+
200
+ } catch (error) {
201
+ console.error('Progress polling error:', error);
202
+ document.getElementById('processingSection').style.display = 'none';
203
+ alert('Error: ' + error.message);
204
+ }
205
+ };
206
+
207
+ // Start polling
208
+ poll();
209
+ }
210
+
211
  // Load PDF List
212
  async function loadPdfList() {
213
  try {
templates/index.html CHANGED
@@ -120,15 +120,25 @@
120
  <div class="col-12">
121
  <div class="card shadow-sm">
122
  <div class="card-body">
123
- <div class="d-flex align-items-center">
124
  <div class="spinner-border text-primary me-3" role="status">
125
  <span class="visually-hidden">Loading...</span>
126
  </div>
127
- <div>
128
  <h6 class="mb-0">Processing PDFs...</h6>
129
  <small class="text-muted" id="processingStatus">Please wait</small>
130
  </div>
131
  </div>
 
 
 
 
 
 
 
 
 
 
132
  </div>
133
  </div>
134
  </div>
 
120
  <div class="col-12">
121
  <div class="card shadow-sm">
122
  <div class="card-body">
123
+ <div class="d-flex align-items-center mb-3">
124
  <div class="spinner-border text-primary me-3" role="status">
125
  <span class="visually-hidden">Loading...</span>
126
  </div>
127
+ <div class="flex-grow-1">
128
  <h6 class="mb-0">Processing PDFs...</h6>
129
  <small class="text-muted" id="processingStatus">Please wait</small>
130
  </div>
131
  </div>
132
+ <div class="progress" style="height: 25px;">
133
+ <div id="progressBarFill" class="progress-bar progress-bar-striped progress-bar-animated"
134
+ role="progressbar"
135
+ style="width: 0%"
136
+ aria-valuenow="0"
137
+ aria-valuemin="0"
138
+ aria-valuemax="100">
139
+ <span id="progressBarText">0%</span>
140
+ </div>
141
+ </div>
142
  </div>
143
  </div>
144
  </div>