Spaces:

VolariGlobal
/

volaris-pdf-tool

Running

App Files Community

saifisvibinn committed on Nov 18, 2025

Commit

f8f4523

1 Parent(s): 64226b5

Add full URLs to progress and pdf-details endpoints

Browse files

Files changed (1) hide show

app.py +79 -1

app.py CHANGED Viewed

@@ -402,6 +402,9 @@ def process_file_background(task_id: str, file_data: bytes, filename: str, extra
         figures = [e for e in elements if e.get('type') == 'figure']
         tables = [e for e in elements if e.get('type') == 'table']
         result = {
             'filename': filename,
             'stem': stem,
@@ -553,6 +556,68 @@ def get_progress(task_id):
         progress = _progress_tracker.get(task_id)
         if not progress:
             return jsonify({'error': 'Task not found'}), 404
         return jsonify(progress)
@@ -587,6 +652,12 @@ def pdf_details(pdf_stem):
     if not output_dir.exists():
         return jsonify({'error': 'PDF not found'}), 404
     # Load content list
     json_files = list(output_dir.glob('*_content_list.json'))
     elements = []
@@ -633,7 +704,14 @@ def pdf_details(pdf_stem):
         'markdown_path': markdown_path,
         'figure_images': figure_images,
         'table_images': table_images,
-        # Add direct download URLs for convenience
         'download_urls': {
             'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
             'markdown': f"/output/{markdown_path}" if markdown_path else None,

         figures = [e for e in elements if e.get('type') == 'figure']
         tables = [e for e in elements if e.get('type') == 'table']
+        # Get base URL for constructing full URLs
+        # Note: We can't use request.host_url here since we're in a background thread
+        # So we'll construct URLs that will be completed in the API endpoint
         result = {
             'filename': filename,
             'stem': stem,
         progress = _progress_tracker.get(task_id)
         if not progress:
             return jsonify({'error': 'Task not found'}), 404
+        # Get base URL for constructing full URLs
+        base_url = request.host_url.rstrip('/')
+        if 'hf.space' in base_url:
+            # Force HTTPS for Hugging Face Spaces
+            base_url = base_url.replace('http://', 'https://')
+        # Add full URLs to results if they exist
+        if 'results' in progress:
+            for result in progress['results']:
+                # Add full URL for annotated PDF
+                if result.get('annotated_pdf'):
+                    result['annotated_pdf_url'] = f"{base_url}/output/{result['annotated_pdf']}"
+                # Add full URL for markdown
+                if result.get('markdown_path'):
+                    result['markdown_url'] = f"{base_url}/output/{result['markdown_path']}"
+                # Add image URLs for figures and tables if available
+                output_dir = Path(app.config['OUTPUT_FOLDER']) / result.get('stem', '')
+                if output_dir.exists():
+                    # Load content list to get figure and table image paths
+                    json_files = list(output_dir.glob('*_content_list.json'))
+                    if json_files:
+                        try:
+                            elements = json.loads(json_files[0].read_text(encoding='utf-8'))
+                            figures = [e for e in elements if e.get('type') == 'figure']
+                            tables = [e for e in elements if e.get('type') == 'table']
+                            # Add figure URLs
+                            figure_urls = []
+                            for fig in figures:
+                                if fig.get('image_path'):
+                                    img_path = output_dir / fig['image_path']
+                                    if img_path.exists():
+                                        relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
+                                        figure_urls.append({
+                                            'page': fig.get('page', 0),
+                                            'url': f"{base_url}/output/{relative_path}",
+                                            'path': relative_path
+                                        })
+                            # Add table URLs
+                            table_urls = []
+                            for tab in tables:
+                                if tab.get('image_path'):
+                                    img_path = output_dir / tab['image_path']
+                                    if img_path.exists():
+                                        relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
+                                        table_urls.append({
+                                            'page': tab.get('page', 0),
+                                            'url': f"{base_url}/output/{relative_path}",
+                                            'path': relative_path
+                                        })
+                            if figure_urls:
+                                result['figure_urls'] = figure_urls
+                            if table_urls:
+                                result['table_urls'] = table_urls
+                        except Exception as e:
+                            logger.warning(f"Error loading image URLs for {result.get('stem')}: {e}")
         return jsonify(progress)
     if not output_dir.exists():
         return jsonify({'error': 'PDF not found'}), 404
+    # Get base URL for constructing full URLs
+    base_url = request.host_url.rstrip('/')
+    if 'hf.space' in base_url:
+        # Force HTTPS for Hugging Face Spaces
+        base_url = base_url.replace('http://', 'https://')
     # Load content list
     json_files = list(output_dir.glob('*_content_list.json'))
     elements = []
         'markdown_path': markdown_path,
         'figure_images': figure_images,
         'table_images': table_images,
+        # Add full URLs for direct access
+        'urls': {
+            'annotated_pdf': f"{base_url}/output/{annotated_pdf}" if annotated_pdf else None,
+            'markdown': f"{base_url}/output/{markdown_path}" if markdown_path else None,
+            'figures': [f"{base_url}/output/{img}" for img in figure_images] if figure_images else [],
+            'tables': [f"{base_url}/output/{img}" for img in table_images] if table_images else [],
+        },
+        # Keep relative paths for backward compatibility
         'download_urls': {
             'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
             'markdown': f"/output/{markdown_path}" if markdown_path else None,