saifisvibinn committed on
Commit
f8f4523
·
1 Parent(s): 64226b5

Add full URLs to progress and pdf-details endpoints

Browse files
Files changed (1) hide show
  1. app.py +79 -1
app.py CHANGED
@@ -402,6 +402,9 @@ def process_file_background(task_id: str, file_data: bytes, filename: str, extra
402
  figures = [e for e in elements if e.get('type') == 'figure']
403
  tables = [e for e in elements if e.get('type') == 'table']
404
 
 
 
 
405
  result = {
406
  'filename': filename,
407
  'stem': stem,
@@ -553,6 +556,68 @@ def get_progress(task_id):
553
  progress = _progress_tracker.get(task_id)
554
  if not progress:
555
  return jsonify({'error': 'Task not found'}), 404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  return jsonify(progress)
557
 
558
 
@@ -587,6 +652,12 @@ def pdf_details(pdf_stem):
587
  if not output_dir.exists():
588
  return jsonify({'error': 'PDF not found'}), 404
589
 
 
 
 
 
 
 
590
  # Load content list
591
  json_files = list(output_dir.glob('*_content_list.json'))
592
  elements = []
@@ -633,7 +704,14 @@ def pdf_details(pdf_stem):
633
  'markdown_path': markdown_path,
634
  'figure_images': figure_images,
635
  'table_images': table_images,
636
- # Add direct download URLs for convenience
 
 
 
 
 
 
 
637
  'download_urls': {
638
  'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
639
  'markdown': f"/output/{markdown_path}" if markdown_path else None,
 
402
  figures = [e for e in elements if e.get('type') == 'figure']
403
  tables = [e for e in elements if e.get('type') == 'table']
404
 
405
+ # Get base URL for constructing full URLs
406
+ # Note: We can't use request.host_url here since we're in a background thread
407
+ # So we'll construct URLs that will be completed in the API endpoint
408
  result = {
409
  'filename': filename,
410
  'stem': stem,
 
556
  progress = _progress_tracker.get(task_id)
557
  if not progress:
558
  return jsonify({'error': 'Task not found'}), 404
559
+
560
+ # Get base URL for constructing full URLs
561
+ base_url = request.host_url.rstrip('/')
562
+ if 'hf.space' in base_url:
563
+ # Force HTTPS for Hugging Face Spaces
564
+ base_url = base_url.replace('http://', 'https://')
565
+
566
+ # Add full URLs to results if they exist
567
+ if 'results' in progress:
568
+ for result in progress['results']:
569
+ # Add full URL for annotated PDF
570
+ if result.get('annotated_pdf'):
571
+ result['annotated_pdf_url'] = f"{base_url}/output/{result['annotated_pdf']}"
572
+
573
+ # Add full URL for markdown
574
+ if result.get('markdown_path'):
575
+ result['markdown_url'] = f"{base_url}/output/{result['markdown_path']}"
576
+
577
+ # Add image URLs for figures and tables if available
578
+ output_dir = Path(app.config['OUTPUT_FOLDER']) / result.get('stem', '')
579
+ if output_dir.exists():
580
+ # Load content list to get figure and table image paths
581
+ json_files = list(output_dir.glob('*_content_list.json'))
582
+ if json_files:
583
+ try:
584
+ elements = json.loads(json_files[0].read_text(encoding='utf-8'))
585
+ figures = [e for e in elements if e.get('type') == 'figure']
586
+ tables = [e for e in elements if e.get('type') == 'table']
587
+
588
+ # Add figure URLs
589
+ figure_urls = []
590
+ for fig in figures:
591
+ if fig.get('image_path'):
592
+ img_path = output_dir / fig['image_path']
593
+ if img_path.exists():
594
+ relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
595
+ figure_urls.append({
596
+ 'page': fig.get('page', 0),
597
+ 'url': f"{base_url}/output/{relative_path}",
598
+ 'path': relative_path
599
+ })
600
+
601
+ # Add table URLs
602
+ table_urls = []
603
+ for tab in tables:
604
+ if tab.get('image_path'):
605
+ img_path = output_dir / tab['image_path']
606
+ if img_path.exists():
607
+ relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
608
+ table_urls.append({
609
+ 'page': tab.get('page', 0),
610
+ 'url': f"{base_url}/output/{relative_path}",
611
+ 'path': relative_path
612
+ })
613
+
614
+ if figure_urls:
615
+ result['figure_urls'] = figure_urls
616
+ if table_urls:
617
+ result['table_urls'] = table_urls
618
+ except Exception as e:
619
+ logger.warning(f"Error loading image URLs for {result.get('stem')}: {e}")
620
+
621
  return jsonify(progress)
622
 
623
 
 
652
  if not output_dir.exists():
653
  return jsonify({'error': 'PDF not found'}), 404
654
 
655
+ # Get base URL for constructing full URLs
656
+ base_url = request.host_url.rstrip('/')
657
+ if 'hf.space' in base_url:
658
+ # Force HTTPS for Hugging Face Spaces
659
+ base_url = base_url.replace('http://', 'https://')
660
+
661
  # Load content list
662
  json_files = list(output_dir.glob('*_content_list.json'))
663
  elements = []
 
704
  'markdown_path': markdown_path,
705
  'figure_images': figure_images,
706
  'table_images': table_images,
707
+ # Add full URLs for direct access
708
+ 'urls': {
709
+ 'annotated_pdf': f"{base_url}/output/{annotated_pdf}" if annotated_pdf else None,
710
+ 'markdown': f"{base_url}/output/{markdown_path}" if markdown_path else None,
711
+ 'figures': [f"{base_url}/output/{img}" for img in figure_images] if figure_images else [],
712
+ 'tables': [f"{base_url}/output/{img}" for img in table_images] if table_images else [],
713
+ },
714
+ # Keep relative paths for backward compatibility
715
  'download_urls': {
716
  'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
717
  'markdown': f"/output/{markdown_path}" if markdown_path else None,