Spaces:
Running
Running
saifisvibinn
committed on
Commit
·
f8f4523
1
Parent(s):
64226b5
Add full URLs to progress and pdf-details endpoints
Browse files
app.py
CHANGED
|
@@ -402,6 +402,9 @@ def process_file_background(task_id: str, file_data: bytes, filename: str, extra
|
|
| 402 |
figures = [e for e in elements if e.get('type') == 'figure']
|
| 403 |
tables = [e for e in elements if e.get('type') == 'table']
|
| 404 |
|
|
|
|
|
|
|
|
|
|
| 405 |
result = {
|
| 406 |
'filename': filename,
|
| 407 |
'stem': stem,
|
|
@@ -553,6 +556,68 @@ def get_progress(task_id):
|
|
| 553 |
progress = _progress_tracker.get(task_id)
|
| 554 |
if not progress:
|
| 555 |
return jsonify({'error': 'Task not found'}), 404
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
return jsonify(progress)
|
| 557 |
|
| 558 |
|
|
@@ -587,6 +652,12 @@ def pdf_details(pdf_stem):
|
|
| 587 |
if not output_dir.exists():
|
| 588 |
return jsonify({'error': 'PDF not found'}), 404
|
| 589 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
# Load content list
|
| 591 |
json_files = list(output_dir.glob('*_content_list.json'))
|
| 592 |
elements = []
|
|
@@ -633,7 +704,14 @@ def pdf_details(pdf_stem):
|
|
| 633 |
'markdown_path': markdown_path,
|
| 634 |
'figure_images': figure_images,
|
| 635 |
'table_images': table_images,
|
| 636 |
-
# Add
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
'download_urls': {
|
| 638 |
'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
|
| 639 |
'markdown': f"/output/{markdown_path}" if markdown_path else None,
|
|
|
|
| 402 |
figures = [e for e in elements if e.get('type') == 'figure']
|
| 403 |
tables = [e for e in elements if e.get('type') == 'table']
|
| 404 |
|
| 405 |
+
# Get base URL for constructing full URLs
|
| 406 |
+
# Note: We can't use request.host_url here since we're in a background thread
|
| 407 |
+
# So we'll construct URLs that will be completed in the API endpoint
|
| 408 |
result = {
|
| 409 |
'filename': filename,
|
| 410 |
'stem': stem,
|
|
|
|
| 556 |
progress = _progress_tracker.get(task_id)
|
| 557 |
if not progress:
|
| 558 |
return jsonify({'error': 'Task not found'}), 404
|
| 559 |
+
|
| 560 |
+
# Get base URL for constructing full URLs
|
| 561 |
+
base_url = request.host_url.rstrip('/')
|
| 562 |
+
if 'hf.space' in base_url:
|
| 563 |
+
# Force HTTPS for Hugging Face Spaces
|
| 564 |
+
base_url = base_url.replace('http://', 'https://')
|
| 565 |
+
|
| 566 |
+
# Add full URLs to results if they exist
|
| 567 |
+
if 'results' in progress:
|
| 568 |
+
for result in progress['results']:
|
| 569 |
+
# Add full URL for annotated PDF
|
| 570 |
+
if result.get('annotated_pdf'):
|
| 571 |
+
result['annotated_pdf_url'] = f"{base_url}/output/{result['annotated_pdf']}"
|
| 572 |
+
|
| 573 |
+
# Add full URL for markdown
|
| 574 |
+
if result.get('markdown_path'):
|
| 575 |
+
result['markdown_url'] = f"{base_url}/output/{result['markdown_path']}"
|
| 576 |
+
|
| 577 |
+
# Add image URLs for figures and tables if available
|
| 578 |
+
output_dir = Path(app.config['OUTPUT_FOLDER']) / result.get('stem', '')
|
| 579 |
+
if output_dir.exists():
|
| 580 |
+
# Load content list to get figure and table image paths
|
| 581 |
+
json_files = list(output_dir.glob('*_content_list.json'))
|
| 582 |
+
if json_files:
|
| 583 |
+
try:
|
| 584 |
+
elements = json.loads(json_files[0].read_text(encoding='utf-8'))
|
| 585 |
+
figures = [e for e in elements if e.get('type') == 'figure']
|
| 586 |
+
tables = [e for e in elements if e.get('type') == 'table']
|
| 587 |
+
|
| 588 |
+
# Add figure URLs
|
| 589 |
+
figure_urls = []
|
| 590 |
+
for fig in figures:
|
| 591 |
+
if fig.get('image_path'):
|
| 592 |
+
img_path = output_dir / fig['image_path']
|
| 593 |
+
if img_path.exists():
|
| 594 |
+
relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
|
| 595 |
+
figure_urls.append({
|
| 596 |
+
'page': fig.get('page', 0),
|
| 597 |
+
'url': f"{base_url}/output/{relative_path}",
|
| 598 |
+
'path': relative_path
|
| 599 |
+
})
|
| 600 |
+
|
| 601 |
+
# Add table URLs
|
| 602 |
+
table_urls = []
|
| 603 |
+
for tab in tables:
|
| 604 |
+
if tab.get('image_path'):
|
| 605 |
+
img_path = output_dir / tab['image_path']
|
| 606 |
+
if img_path.exists():
|
| 607 |
+
relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
|
| 608 |
+
table_urls.append({
|
| 609 |
+
'page': tab.get('page', 0),
|
| 610 |
+
'url': f"{base_url}/output/{relative_path}",
|
| 611 |
+
'path': relative_path
|
| 612 |
+
})
|
| 613 |
+
|
| 614 |
+
if figure_urls:
|
| 615 |
+
result['figure_urls'] = figure_urls
|
| 616 |
+
if table_urls:
|
| 617 |
+
result['table_urls'] = table_urls
|
| 618 |
+
except Exception as e:
|
| 619 |
+
logger.warning(f"Error loading image URLs for {result.get('stem')}: {e}")
|
| 620 |
+
|
| 621 |
return jsonify(progress)
|
| 622 |
|
| 623 |
|
|
|
|
| 652 |
if not output_dir.exists():
|
| 653 |
return jsonify({'error': 'PDF not found'}), 404
|
| 654 |
|
| 655 |
+
# Get base URL for constructing full URLs
|
| 656 |
+
base_url = request.host_url.rstrip('/')
|
| 657 |
+
if 'hf.space' in base_url:
|
| 658 |
+
# Force HTTPS for Hugging Face Spaces
|
| 659 |
+
base_url = base_url.replace('http://', 'https://')
|
| 660 |
+
|
| 661 |
# Load content list
|
| 662 |
json_files = list(output_dir.glob('*_content_list.json'))
|
| 663 |
elements = []
|
|
|
|
| 704 |
'markdown_path': markdown_path,
|
| 705 |
'figure_images': figure_images,
|
| 706 |
'table_images': table_images,
|
| 707 |
+
# Add full URLs for direct access
|
| 708 |
+
'urls': {
|
| 709 |
+
'annotated_pdf': f"{base_url}/output/{annotated_pdf}" if annotated_pdf else None,
|
| 710 |
+
'markdown': f"{base_url}/output/{markdown_path}" if markdown_path else None,
|
| 711 |
+
'figures': [f"{base_url}/output/{img}" for img in figure_images] if figure_images else [],
|
| 712 |
+
'tables': [f"{base_url}/output/{img}" for img in table_images] if table_images else [],
|
| 713 |
+
},
|
| 714 |
+
# Keep relative paths for backward compatibility
|
| 715 |
'download_urls': {
|
| 716 |
'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
|
| 717 |
'markdown': f"/output/{markdown_path}" if markdown_path else None,
|