23f3003322 committed on
Commit
dfe7fff
·
1 Parent(s): 308bf1a

task processing complete

app/__pycache__/main.cpython-313.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ
 
app/api/routes/__pycache__/task.cpython-313.pyc CHANGED
Binary files a/app/api/routes/__pycache__/task.cpython-313.pyc and b/app/api/routes/__pycache__/task.cpython-313.pyc differ
 
app/api/routes/task.py CHANGED
@@ -1,66 +1,176 @@
1
  """
2
- Task Processing Routes
3
- Main API endpoint for handling task requests
4
  """
5
 
 
6
  from datetime import datetime
7
- from fastapi import APIRouter, Request, status
8
 
9
  from app.models.request import TaskRequest
10
- from app.models.response import TaskResponse
11
- from app.api.dependencies import verify_authentication
12
- from app.services.task_processor import TaskProcessor
13
  from app.core.logging import get_logger
14
-
15
- import requests
 
16
 
17
  logger = get_logger(__name__)
18
 
19
  router = APIRouter()
 
 
20
  task_processor = TaskProcessor()
21
 
22
 
23
- @router.post("/task", response_model=TaskResponse, status_code=status.HTTP_200_OK)
24
- async def handle_task(request: Request):
25
  """
26
  Main API endpoint for handling task requests
27
 
28
- - Validates request format (HTTP 400 if invalid)
29
- - Verifies secret (HTTP 403 if invalid)
30
- - Processes task and returns results (HTTP 200 if successful)
31
  """
32
  start_time = datetime.now()
33
 
34
  logger.info("πŸ“₯ Task request received")
35
 
36
- # Parse and validate request body with Pydantic
37
- body = await request.json()
38
- task_data = TaskRequest(**body)
39
 
40
- logger.info(f"βœ… Request validated for: {task_data.email}")
 
 
41
 
42
- # Verify authentication
43
- logger.info("πŸ” Verifying authentication")
44
- verify_authentication(task_data.secret)
45
- logger.info("βœ… Authentication successful")
46
 
47
- # Process the task
48
- logger.info("πŸš€ Starting task execution")
49
- result_data = await task_processor.process(task_data)
50
 
51
- # Calculate execution time
52
- execution_time = (datetime.now() - start_time).total_seconds()
53
- logger.info(f"⏱️ Task completed in {execution_time:.3f}s")
54
 
55
- # Prepare response
56
- response = TaskResponse(
57
- success=True,
58
- message="Task completed successfully",
59
- data=result_data,
60
- email=task_data.email,
61
- task_url=str(task_data.url),
62
- execution_time=execution_time
63
- )
64
 
65
- logger.info("βœ… Response prepared successfully")
66
- return response
1
  """
2
+ Task API Routes
3
+ Handles task submission and processing
4
  """
5
 
6
+ from fastapi import APIRouter, Request, status, BackgroundTasks, HTTPException
7
  from datetime import datetime
8
+ from typing import Dict, Any
9
 
10
  from app.models.request import TaskRequest
11
+ from app.models.response import TaskResponse, ImmediateResponse
 
 
12
  from app.core.logging import get_logger
13
+ from app.core.security import verify_authentication, AuthenticationError
14
+ from app.core.exceptions import TaskProcessingError
15
+ from app.services.task_processor import TaskProcessor
16
 
17
  logger = get_logger(__name__)
18
 
19
  router = APIRouter()
20
+
21
+ # Initialize task processor (singleton)
22
  task_processor = TaskProcessor()
23
 
24
 
25
+ @router.post(
26
+ "/task",
27
+ response_model=ImmediateResponse,
28
+ status_code=status.HTTP_200_OK,
29
+ responses={
30
+ 200: {"description": "Request accepted and processing started"},
31
+ 400: {"description": "Invalid JSON format or request data"},
32
+ 403: {"description": "Invalid secret - authentication failed"}
33
+ }
34
+ )
35
+ async def handle_task(
36
+ request: Request,
37
+ background_tasks: BackgroundTasks
38
+ ):
39
  """
40
  Main API endpoint for handling task requests
41
 
42
+ Flow:
43
+ 1. Validate JSON format (HTTP 400 if invalid)
44
+ 2. Verify secret (HTTP 403 if invalid)
45
+ 3. Respond immediately with HTTP 200
46
+ 4. Process task in background
47
+
48
+ Returns:
49
+ Immediate HTTP 200 response with task accepted message
50
  """
51
  start_time = datetime.now()
52
 
53
  logger.info("πŸ“₯ Task request received")
54
 
55
+ try:
56
+ # ================================================================
57
+ # STEP 1: PARSE AND VALIDATE JSON (HTTP 400 if invalid)
58
+ # ================================================================
59
+ try:
60
+ body = await request.json()
61
+ task_data = TaskRequest(**body)
62
+ except ValueError as e:
63
+ logger.error(f"❌ Invalid JSON format: {str(e)}")
64
+ raise HTTPException(
65
+ status_code=status.HTTP_400_BAD_REQUEST,
66
+ detail=f"Invalid JSON format: {str(e)}"
67
+ )
68
+ except Exception as e:
69
+ logger.error(f"❌ Request validation failed: {str(e)}")
70
+ raise HTTPException(
71
+ status_code=status.HTTP_400_BAD_REQUEST,
72
+ detail=f"Invalid request data: {str(e)}"
73
+ )
74
+
75
+ logger.info(f"βœ… Request validated for: {task_data.email}")
76
+
77
+ # ================================================================
78
+ # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
79
+ # ================================================================
80
+ logger.info("πŸ” Verifying authentication")
81
+ try:
82
+ verify_authentication(task_data.secret)
83
+ except AuthenticationError as e:
84
+ logger.error(f"❌ Authentication failed: {str(e)}")
85
+ raise HTTPException(
86
+ status_code=status.HTTP_403_FORBIDDEN,
87
+ detail="Invalid secret. Authentication failed."
88
+ )
89
+
90
+ logger.info("βœ… Authentication successful")
91
+
92
+ # ================================================================
93
+ # STEP 3: RESPOND IMMEDIATELY WITH HTTP 200
94
+ # ================================================================
95
+ logger.info("βœ… Request accepted - processing in background")
96
+
97
+ # Add task processing to background
98
+ background_tasks.add_task(
99
+ process_task_background,
100
+ task_data=task_data,
101
+ start_time=start_time
102
+ )
103
+
104
+ # Immediate response
105
+ response = ImmediateResponse(
106
+ success=True,
107
+ message="Task accepted and processing started",
108
+ email=task_data.email,
109
+ task_url=str(task_data.url),
110
+ status="processing",
111
+ timestamp=datetime.now().isoformat()
112
+ )
113
+
114
+ logger.info(f"πŸ“€ Sent immediate response to client")
115
+
116
+ return response
117
 
118
+ except HTTPException:
119
+ # Re-raise HTTP exceptions (400, 403)
120
+ raise
121
 
122
+ except Exception as e:
123
+ logger.error(f"❌ Unexpected error: {str(e)}", exc_info=True)
124
+ raise HTTPException(
125
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
126
+ detail=f"Internal server error: {str(e)}"
127
+ )
128
+
129
+
130
+ async def process_task_background(
131
+ task_data: TaskRequest,
132
+ start_time: datetime
133
+ ):
134
+ """
135
+ Process task in background after sending immediate response
136
 
137
+ This runs asynchronously after the HTTP 200 response is sent.
138
+ Results are logged but not returned to client.
 
139
 
140
+ Args:
141
+ task_data: Validated task request
142
+ start_time: Request start time for metrics
143
+ """
144
+ logger.info("=" * 80)
145
+ logger.info("πŸ”„ BACKGROUND TASK PROCESSING STARTED")
146
+ logger.info("=" * 80)
147
+ logger.info(f"πŸ”— URL: {task_data.url}")
148
 
149
+ try:
150
+ # Process the task
151
+ result_data = await task_processor.process(task_data)
152
+
153
+ # Calculate execution time
154
+ execution_time = (datetime.now() - start_time).total_seconds()
155
+
156
+
157
+
158
+ # Optional: Store result in database/cache for later retrieval
159
+ # await store_result(task_data.email, result_data)
160
+
161
+ except TaskProcessingError as e:
162
+ logger.error("=" * 80)
163
+ logger.error("❌ BACKGROUND TASK FAILED")
164
+ logger.error("=" * 80)
165
+ logger.error(f"Error: {str(e)}")
166
+ logger.error("=" * 80)
167
+
168
+ # Optional: Store error for later retrieval or send notification
169
+ # await store_error(task_data.email, str(e))
170
 
171
+ except Exception as e:
172
+ logger.error("=" * 80)
173
+ logger.error("❌ BACKGROUND TASK UNEXPECTED ERROR")
174
+ logger.error("=" * 80)
175
+ logger.error(f"Error: {str(e)}", exc_info=True)
176
+ logger.error("=" * 80)
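
For reference, a minimal client-side sketch of the new flow (not part of the diff): the field names email, secret and url come from how the handler reads TaskRequest; the host and example values are placeholders.

import httpx

payload = {
    "email": "student@example.com",           # TaskRequest.email
    "secret": "my-api-secret",                # checked by verify_authentication (403 on mismatch)
    "url": "https://example.com/quiz-834",    # TaskRequest.url, the task to fetch and process
}

resp = httpx.post("http://localhost:8000/task", json=payload, timeout=30)
resp.raise_for_status()

data = resp.json()
# The endpoint now answers immediately; the real work continues in a BackgroundTask.
print(data["status"])   # "processing"
print(data["message"])  # "Task accepted and processing started"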
app/core/__pycache__/security.cpython-313.pyc CHANGED
Binary files a/app/core/__pycache__/security.cpython-313.pyc and b/app/core/__pycache__/security.cpython-313.pyc differ
 
app/core/security.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
- Security Utilities
3
- Handles authentication and authorization
4
  """
5
 
6
  from app.core.config import settings
@@ -9,49 +8,47 @@ from app.core.logging import get_logger
9
  logger = get_logger(__name__)
10
 
11
 
12
- def verify_secret(provided_secret: str) -> bool:
13
  """
14
- Verify the provided secret against environment configuration
15
 
16
  Args:
17
- provided_secret: Secret from request
18
 
19
  Returns:
20
- bool: True if secret matches, False otherwise
21
  """
22
- if not settings.is_secret_configured():
23
- logger.error("⚠️ API_SECRET not configured in environment")
24
- return False
25
 
26
- is_valid = provided_secret == settings.API_SECRET
 
27
 
28
- if is_valid:
29
- logger.info("βœ… Secret verification successful")
30
- else:
31
- logger.warning("🚫 Secret verification failed")
32
- logger.debug(
33
- f"Expected length: {len(settings.API_SECRET)}, "
34
- f"Got length: {len(provided_secret)}"
35
- )
36
 
37
  return is_valid
38
 
39
 
40
- def mask_secret(secret: str, visible_chars: int = 4) -> str:
41
  """
42
- Mask secret for logging purposes
43
 
44
  Args:
45
- secret: Secret to mask
46
- visible_chars: Number of characters to show at the end
47
 
48
  Returns:
49
- str: Masked secret
 
 
 
50
  """
51
- if not secret:
52
- return ""
53
-
54
- if len(secret) <= visible_chars:
55
- return "*" * len(secret)
56
 
57
- return "*" * (len(secret) - visible_chars) + secret[-visible_chars:]
 
1
  """
2
+ Security and Authentication
 
3
  """
4
 
5
  from app.core.config import settings
 
8
  logger = get_logger(__name__)
9
 
10
 
11
+ class AuthenticationError(Exception):
12
+ """Raised when authentication fails"""
13
+ pass
14
+
15
+
16
+ def verify_secret(secret: str) -> bool:
17
  """
18
+ Verify if provided secret is valid
19
 
20
  Args:
21
+ secret: Secret from request
22
 
23
  Returns:
24
+ bool: True if valid, False otherwise
25
  """
26
+ # Get expected secret from config/env
27
+ expected_secret = settings.API_SECRET
 
28
 
29
+ # Simple comparison (use constant-time comparison in production)
30
+ is_valid = secret == expected_secret
31
 
32
+ if not is_valid:
33
+ logger.warning(f"❌ Invalid secret attempt")
34
 
35
  return is_valid
36
 
37
 
38
+ def verify_authentication(secret: str) -> bool:
39
  """
40
+ Verify request authentication
41
 
42
  Args:
43
+ secret: Secret from request
 
44
 
45
  Returns:
46
+ bool: True if authenticated
47
+
48
+ Raises:
49
+ AuthenticationError: If authentication fails (HTTP 403)
50
  """
51
+ if not verify_secret(secret):
52
+ raise AuthenticationError("Invalid secret. Authentication failed.")
 
 
 
53
 
54
+ return True
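
The inline comment above notes that a constant-time comparison should be used in production; a minimal sketch of that variant with the standard library, keeping the same interface (illustrative only, not part of this commit):

import hmac

def verify_secret(secret: str) -> bool:
    """Same check as above, but resistant to timing side channels."""
    expected_secret = settings.API_SECRET
    # hmac.compare_digest compares in constant time regardless of where the strings differ
    is_valid = hmac.compare_digest(secret.encode("utf-8"), expected_secret.encode("utf-8"))
    if not is_valid:
        logger.warning("Invalid secret attempt")
    return is_valid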
app/models/__pycache__/response.cpython-313.pyc CHANGED
Binary files a/app/models/__pycache__/response.cpython-313.pyc and b/app/models/__pycache__/response.cpython-313.pyc differ
 
app/models/response.py CHANGED
@@ -7,6 +7,47 @@ from typing import Optional, Dict, Any
7
  from datetime import datetime
8
  from pydantic import BaseModel, Field
9
 
10
 
11
  class TaskResponse(BaseModel):
12
  """
 
7
  from datetime import datetime
8
  from pydantic import BaseModel, Field
9
 
10
+ class ImmediateResponse(BaseModel):
11
+ """
12
+ Immediate response sent after validation
13
+ HTTP 200 response before task processing
14
+ """
15
+ success: bool = Field(
16
+ description="Whether request was accepted"
17
+ )
18
+
19
+ message: str = Field(
20
+ description="Status message"
21
+ )
22
+
23
+ email: str = Field(
24
+ description="Student email from request"
25
+ )
26
+
27
+ task_url: str = Field(
28
+ description="Task URL from request"
29
+ )
30
+
31
+ status: str = Field(
32
+ description="Processing status: processing, completed, failed"
33
+ )
34
+
35
+ timestamp: str = Field(
36
+ description="Response timestamp (ISO format)"
37
+ )
38
+
39
+ class Config:
40
+ json_schema_extra = {
41
+ "example": {
42
+ "success": True,
43
+ "message": "Task accepted and processing started",
44
+ "email": "[email protected]",
45
+ "task_url": "https://example.com/quiz-834",
46
+ "status": "processing",
47
+ "timestamp": "2025-11-29T12:00:00"
48
+ }
49
+ }
50
+
51
 
52
  class TaskResponse(BaseModel):
53
  """
app/orchestrator/models.py CHANGED
@@ -48,6 +48,124 @@ class OutputFormat(str, Enum):
48
  UNKNOWN = "unknown"
49
 
50
 
51
  class TaskClassification(BaseModel):
52
  """
53
  Structured output for task classification
 
48
  UNKNOWN = "unknown"
49
 
50
 
51
+ class URLDetection(BaseModel):
52
+ """Result of URL detection analysis"""
53
+
54
+ is_redirect: bool = Field(
55
+ description="True if content redirects to another URL for the actual task"
56
+ )
57
+
58
+ question_url: Optional[str] = Field(
59
+ default=None,
60
+ description="The URL to visit for the actual task (if is_redirect is True)"
61
+ )
62
+
63
+ reasoning: str = Field(
64
+ description="Detailed explanation of why this is or isn't a redirect"
65
+ )
66
+
67
+ url_types: Dict[str, str] = Field(
68
+ default_factory=dict,
69
+ description="Classification of each URL found (e.g., 'question_url', 'data_url', 'submission_url')"
70
+ )
71
+
72
+ confidence: str = Field(
73
+ default="medium",
74
+ description="Confidence level: low, medium, high"
75
+ )
76
+
77
+
78
+ class InstructionStep(BaseModel):
79
+ """Single instruction step"""
80
+
81
+ step_number: int = Field(
82
+ description="Step number in sequence (1, 2, 3...)"
83
+ )
84
+
85
+ action: str = Field(
86
+ description="Primary action: scrape, extract, calculate, submit, download, transcribe, analyze, visit"
87
+ )
88
+
89
+ description: str = Field(
90
+ description="Clear description of what to do in this step"
91
+ )
92
+
93
+ target: Optional[str] = Field(
94
+ default=None,
95
+ description="Target of the action (URL, field name, file, etc.)"
96
+ )
97
+
98
+ dependencies: List[int] = Field(
99
+ default_factory=list,
100
+ description="Step numbers this step depends on"
101
+ )
102
+
103
+
104
+ class UnifiedTaskAnalysis(BaseModel):
105
+ """
106
+ Unified analysis for task fetching
107
+ Combines redirect detection, submission URL extraction, and instruction parsing
108
+ """
109
+
110
+ # ========================================================================
111
+ # REDIRECT DETECTION
112
+ # ========================================================================
113
+ is_redirect: bool = Field(
114
+ description="True if this content redirects to another URL for the actual task"
115
+ )
116
+
117
+ question_url: Optional[str] = Field(
118
+ default=None,
119
+ description="URL to visit for the actual task (if is_redirect=True)"
120
+ )
121
+
122
+ redirect_reasoning: str = Field(
123
+ default="",
124
+ description="Why this is or isn't a redirect"
125
+ )
126
+
127
+ # ========================================================================
128
+ # SUBMISSION URL EXTRACTION
129
+ # ========================================================================
130
+ submission_url: Optional[str] = Field(
131
+ default=None,
132
+ description="URL where the final answer should be POSTed"
133
+ )
134
+
135
+ submission_url_is_relative: bool = Field(
136
+ default=False,
137
+ description="True if submission URL is relative and needs base URL resolution"
138
+ )
139
+
140
+ submission_reasoning: str = Field(
141
+ default="",
142
+ description="How the submission URL was identified"
143
+ )
144
+
145
+ # ========================================================================
146
+ # INSTRUCTION PARSING
147
+ # ========================================================================
148
+ instructions: List[InstructionStep] = Field(
149
+ default_factory=list,
150
+ description="Parsed step-by-step instructions (empty if redirect)"
151
+ )
152
+
153
+ overall_goal: str = Field(
154
+ description="High-level summary of what needs to be accomplished"
155
+ )
156
+
157
+ complexity: str = Field(
158
+ description="Task complexity: trivial, simple, moderate, complex"
159
+ )
160
+
161
+ # ========================================================================
162
+ # CONFIDENCE
163
+ # ========================================================================
164
+ confidence: float = Field(
165
+ ge=0.0,
166
+ le=1.0,
167
+ description="Overall confidence (0.0-1.0)"
168
+ )
169
  class TaskClassification(BaseModel):
170
  """
171
  Structured output for task classification
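
For illustration, a populated UnifiedTaskAnalysis for a simple redirect page might look like the sketch below; the values are made up, not produced by this commit.

analysis = UnifiedTaskAnalysis(
    is_redirect=True,
    question_url="https://example.com/actual-task",
    redirect_reasoning="Page only contains a link pointing to the real task",
    submission_url=None,
    submission_url_is_relative=False,
    submission_reasoning="No POST target found on the redirect page",
    instructions=[],          # left empty when is_redirect is True
    overall_goal="Follow the redirect to reach the real task",
    complexity="trivial",
    confidence=0.8,
)
assert analysis.is_redirect and not analysis.instructions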
app/services/__pycache__/task_processor.cpython-313.pyc CHANGED
Binary files a/app/services/__pycache__/task_processor.cpython-313.pyc and b/app/services/__pycache__/task_processor.cpython-313.pyc differ
 
app/services/task_fetcher.py CHANGED
@@ -1,42 +1,53 @@
1
  """
2
- Task Fetcher Service - Enhanced Version
3
- Fetches and extracts task descriptions from URLs with intelligent content detection
4
  """
5
 
6
  import httpx
7
  import json
8
- import base64
9
  import re
10
  from typing import Optional, Dict, Any, List
11
- from urllib.parse import urlparse, urljoin
12
  from bs4 import BeautifulSoup
13
 
14
  from app.core.config import settings
15
  from app.core.logging import get_logger
16
  from app.core.exceptions import TaskProcessingError
 
 
17
 
18
  logger = get_logger(__name__)
19
 
20
 
21
  class TaskFetcher:
22
  """
23
- Enhanced service for fetching and extracting task descriptions from URLs
24
- Handles multiple content types and detects special elements requiring processing
 
 
 
25
  """
26
 
27
  def __init__(self, timeout: int = 30):
28
- """
29
- Initialize TaskFetcher
30
-
31
- Args:
32
- timeout: Request timeout in seconds
33
- """
34
  self.timeout = timeout
35
- self.client = None
36
- logger.debug("TaskFetcher initialized")
37
 
38
  async def __aenter__(self):
39
- """Async context manager entry"""
40
  self.client = httpx.AsyncClient(
41
  timeout=self.timeout,
42
  follow_redirects=True,
@@ -48,795 +59,326 @@ class TaskFetcher:
48
  return self
49
 
50
  async def __aexit__(self, exc_type, exc_val, exc_tb):
51
- """Async context manager exit"""
52
  if self.client:
53
  await self.client.aclose()
54
-
55
- async def fetch_task(self, url: str) -> Dict[str, Any]:
 
 
 
 
56
  """
57
- Fetch and extract task description from URL with intelligent detection
58
 
59
- Args:
60
- url: URL to fetch task from
61
-
62
- Returns:
63
- Dict containing task information:
64
- {
65
- "task_description": str,
66
- "raw_content": str,
67
- "content_type": str,
68
- "url": str,
69
- "needs_llm_analysis": bool,
70
- "metadata": dict with special_elements, etc.
71
  }
72
-
73
- Raises:
74
- TaskProcessingError: If fetching or extraction fails
75
- """
76
- logger.info(f"πŸ“₯ Fetching task from URL: {url}")
77
 
78
- # Validate URL
79
  if not self._is_valid_url(url):
80
- logger.error(f"❌ Invalid URL format: {url}")
81
  raise TaskProcessingError(f"Invalid URL format: {url}")
82
 
83
  try:
84
- # Fetch content
85
  response = await self._fetch_url(url)
86
-
87
- # Detect content type
88
  content_type = self._detect_content_type(response)
89
- logger.info(f"πŸ“„ Content type detected: {content_type}")
90
-
91
- # Extract task based on content type
92
- task_info = await self._extract_task(response, content_type, url)
93
-
94
- # Determine if LLM analysis is needed
95
- task_info['needs_llm_analysis'] = self._needs_llm_analysis(task_info)
96
-
97
- if task_info['needs_llm_analysis']:
98
- logger.warning("πŸ€– Content requires LLM analysis for complete extraction")
99
 
100
- logger.info(f"βœ… Task fetched successfully")
101
- logger.debug(f"Task description length: {len(task_info['task_description'])} chars")
 
102
 
103
- return task_info
104
 
105
- except httpx.TimeoutException:
106
- logger.error(f"⏱️ Timeout fetching URL: {url}")
107
- raise TaskProcessingError(f"Request timeout for URL: {url}")
108
-
109
- except httpx.HTTPStatusError as e:
110
- logger.error(f"❌ HTTP error {e.response.status_code}: {url}")
111
- raise TaskProcessingError(
112
- f"HTTP {e.response.status_code} error fetching URL: {url}"
113
- )
114
-
115
- except Exception as e:
116
- logger.error(f"❌ Unexpected error fetching task: {str(e)}", exc_info=True)
117
- raise TaskProcessingError(f"Failed to fetch task from URL: {str(e)}")
118
-
119
- def _is_valid_url(self, url: str) -> bool:
120
- """
121
- Validate URL format
122
 
123
- Args:
124
- url: URL to validate
125
-
126
- Returns:
127
- bool: True if valid, False otherwise
128
- """
129
- try:
130
- result = urlparse(url)
131
- is_valid = all([result.scheme in ['http', 'https'], result.netloc])
132
-
133
- if not is_valid:
134
- logger.warning(f"Invalid URL structure: {url}")
135
-
136
- return is_valid
137
-
138
  except Exception as e:
139
- logger.warning(f"URL validation error: {str(e)}")
140
- return False
141
 
142
  async def _fetch_url(self, url: str) -> httpx.Response:
143
- """
144
- Fetch content from URL with retry logic
145
-
146
- Args:
147
- url: URL to fetch
148
-
149
- Returns:
150
- httpx.Response: HTTP response
151
-
152
- Raises:
153
- httpx.HTTPStatusError: If HTTP error occurs
154
- httpx.TimeoutException: If request times out
155
- """
156
- max_retries = settings.MAX_RETRIES
157
 
158
  for attempt in range(max_retries):
159
  try:
160
- logger.debug(f"Attempt {attempt + 1}/{max_retries} to fetch URL")
161
-
162
  response = await self.client.get(url)
163
  response.raise_for_status()
164
-
165
- logger.debug(
166
- f"βœ“ Fetch successful | Status: {response.status_code} | "
167
- f"Size: {len(response.content)} bytes"
168
- )
169
-
170
  return response
171
-
172
  except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
 
173
  if attempt == max_retries - 1:
174
- # Last attempt, raise error
175
  raise
176
-
177
- logger.warning(
178
- f"⚠️ Attempt {attempt + 1} failed: {str(e)} | Retrying..."
179
- )
180
  continue
181
-
182
- def _detect_content_type(self, response: httpx.Response) -> str:
183
- """
184
- Detect content type from response
185
-
186
- Args:
187
- response: HTTP response
188
-
189
- Returns:
190
- str: Content type (json, html, text, pdf, csv, etc.)
191
- """
192
- content_type_header = response.headers.get('content-type', '').lower()
193
-
194
- # Check content-type header first
195
- if 'application/json' in content_type_header:
196
- return 'json'
197
- elif 'text/html' in content_type_header:
198
- return 'html'
199
- elif 'application/pdf' in content_type_header:
200
- return 'pdf'
201
- elif 'text/csv' in content_type_header or 'application/csv' in content_type_header:
202
- return 'csv'
203
- elif 'audio/' in content_type_header:
204
- return 'audio'
205
- elif 'video/' in content_type_header:
206
- return 'video'
207
- elif 'image/' in content_type_header:
208
- return 'image'
209
-
210
- # Try to detect from content
211
- try:
212
- json.loads(response.text)
213
- return 'json'
214
- except:
215
- if '<html' in response.text.lower()[:100]:
216
- return 'html'
217
- return 'text'
218
-
219
- async def _extract_task(
220
- self,
221
- response: httpx.Response,
222
- content_type: str,
223
- url: str
224
- ) -> Dict[str, Any]:
225
- """
226
- Extract task description based on content type
227
-
228
- Args:
229
- response: HTTP response
230
- content_type: Detected content type
231
- url: Original URL
232
-
233
- Returns:
234
- Dict with task information
235
- """
236
- extractors = {
237
- 'json': self._extract_from_json,
238
- 'html': self._extract_from_html,
239
- 'text': self._extract_from_text,
240
- 'pdf': self._extract_from_binary,
241
- 'csv': self._extract_from_binary,
242
- 'audio': self._extract_from_binary,
243
- 'video': self._extract_from_binary,
244
- 'image': self._extract_from_binary,
245
- }
246
-
247
- extractor = extractors.get(content_type, self._extract_from_text)
248
-
249
- logger.debug(f"Using extractor: {extractor.__name__}")
250
-
251
- return await extractor(response, url)
252
-
253
- async def _extract_from_json(
254
- self,
255
- response: httpx.Response,
256
- url: str
257
- ) -> Dict[str, Any]:
258
- """
259
- Extract task from JSON response with base64 detection
260
-
261
- Args:
262
- response: HTTP response
263
- url: Original URL
264
-
265
- Returns:
266
- Dict with task information
267
- """
268
- logger.debug("Extracting task from JSON")
269
-
270
- try:
271
- data = response.json()
272
-
273
- # Try common field names for task description
274
- task_fields = [
275
- 'task', 'task_description', 'description',
276
- 'question', 'prompt', 'instruction',
277
- 'task_text', 'content', 'message', 'text'
278
- ]
279
-
280
- task_description = None
281
- found_field = None
282
-
283
- # Search for task description in JSON
284
- for field in task_fields:
285
- if field in data:
286
- task_description = str(data[field])
287
- found_field = field
288
- logger.debug(f"Found task in field: {field}")
289
- break
290
-
291
- # If not found in root, try nested
292
- if not task_description:
293
- task_description = self._search_nested_json(data, task_fields)
294
- found_field = "nested"
295
-
296
- # Fallback: use entire JSON as string
297
- if not task_description:
298
- logger.warning("No task field found, using entire JSON")
299
- task_description = json.dumps(data, indent=2)
300
- found_field = "full_json"
301
-
302
- # Check for base64 encoding
303
- original_description = task_description
304
- task_description = self._detect_and_decode_base64(task_description)
305
- was_base64_decoded = (original_description != task_description)
306
-
307
- return {
308
- 'task_description': task_description.strip(),
309
- 'raw_content': response.text,
310
- 'content_type': 'json',
311
- 'url': url,
312
- 'metadata': {
313
- 'json_structure': list(data.keys()) if isinstance(data, dict) else [],
314
- 'data_type': type(data).__name__,
315
- 'found_in_field': found_field,
316
- 'was_base64_decoded': was_base64_decoded,
317
- 'special_elements': {} # No special elements in JSON
318
- }
319
- }
320
-
321
- except json.JSONDecodeError as e:
322
- logger.error(f"Failed to parse JSON: {str(e)}")
323
- # Fallback to text extraction
324
- return await self._extract_from_text(response, url)
325
-
326
- def _search_nested_json(self, data: Any, fields: List[str], max_depth: int = 3) -> Optional[str]:
327
  """
328
- Recursively search for task description in nested JSON
329
-
330
- Args:
331
- data: JSON data to search
332
- fields: Field names to look for
333
- max_depth: Maximum recursion depth
334
-
335
- Returns:
336
- Task description if found, None otherwise
337
  """
338
- if max_depth <= 0:
339
- return None
340
 
341
- if isinstance(data, dict):
342
- for field in fields:
343
- if field in data:
344
- return str(data[field])
345
-
346
- # Search nested dicts
347
- for value in data.values():
348
- result = self._search_nested_json(value, fields, max_depth - 1)
349
- if result:
350
- return result
351
 
352
- elif isinstance(data, list) and len(data) > 0:
353
- # Search first item in list
354
- return self._search_nested_json(data[0], fields, max_depth - 1)
 
355
 
356
- return None
357
-
358
- async def _extract_from_html(
359
- self,
360
- response: httpx.Response,
361
- url: str
362
- ) -> Dict[str, Any]:
363
  """
364
- Extract task from HTML response with comprehensive element detection
365
-
366
- Args:
367
- response: HTTP response
368
- url: Original URL
369
-
370
- Returns:
371
- Dict with task information
372
  """
373
- logger.debug("Extracting task from HTML")
374
 
 
 
375
  try:
376
- soup = BeautifulSoup(response.text, 'html.parser')
377
-
378
- # FIRST: Detect special elements
379
- special_elements = self._detect_special_elements(soup, url)
380
- has_special = any(special_elements.values())
381
-
382
- if has_special:
383
- detected_types = [k for k, v in special_elements.items() if v]
384
- logger.info(f"πŸ” Detected special elements: {', '.join(detected_types)}")
385
-
386
- # Strategy 1: Look for common task containers
387
- task_selectors = [
388
- {'id': 'task'},
389
- {'id': 'question'},
390
- {'id': 'instruction'},
391
- {'id': 'quiz'},
392
- {'class_': 'task'},
393
- {'class_': 'question'},
394
- {'class_': 'instruction'},
395
- {'class_': 'task-description'},
396
- {'class_': 'quiz-question'},
397
- {'data-task': True}
398
- ]
399
-
400
- task_description = None
401
- extraction_method = None
402
-
403
- for selector in task_selectors:
404
- element = soup.find(**selector)
405
- if element:
406
- task_description = element.get_text(strip=True, separator=' ')
407
- extraction_method = f"selector_{list(selector.keys())[0]}"
408
- logger.debug(f"Found task using selector: {selector}")
409
- break
410
-
411
- # Strategy 2: Look for main content area
412
- if not task_description:
413
- main_content = (
414
- soup.find('main') or
415
- soup.find('article') or
416
- soup.find('div', class_='content') or
417
- soup.find('div', id='content') or
418
- soup.find('section', class_='main')
419
- )
420
-
421
- if main_content:
422
- task_description = main_content.get_text(strip=True, separator=' ')
423
- extraction_method = "main_content"
424
- logger.debug("Found task in main content area")
425
-
426
- # Strategy 3: Look for pre/code/textarea blocks (often contain base64 or instructions)
427
- if not task_description:
428
- code_blocks = soup.find_all(['pre', 'code', 'textarea'])
429
- if code_blocks:
430
- task_description = '\n'.join(
431
- block.get_text(strip=True) for block in code_blocks
432
- )
433
- extraction_method = "code_blocks"
434
- logger.debug(f"Found task in {len(code_blocks)} code blocks")
435
-
436
- # Strategy 4: Use body text (last resort)
437
- if not task_description:
438
- # Remove script and style tags
439
- for script in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
440
- script.decompose()
441
-
442
- task_description = soup.get_text(strip=True, separator=' ')
443
- extraction_method = "body_fallback"
444
- logger.warning("Using body text as task description")
445
 
446
- # Check for base64 encoding
447
- original_description = task_description
448
- task_description = self._detect_and_decode_base64(task_description)
449
- was_base64_decoded = (original_description != task_description)
 
 
450
 
451
- # Extract metadata
452
- title = soup.find('title')
453
- title_text = title.get_text(strip=True) if title else ''
454
 
455
- # Check for meta description
456
- meta_desc = soup.find('meta', attrs={'name': 'description'})
457
- meta_description = meta_desc.get('content', '') if meta_desc else ''
458
 
 
459
  return {
460
- 'task_description': task_description.strip(),
461
- 'raw_content': response.text,
462
- 'content_type': 'html',
463
- 'url': url,
464
- 'metadata': {
465
- 'title': title_text,
466
- 'meta_description': meta_description,
467
- 'extraction_method': extraction_method,
468
- 'has_forms': bool(soup.find('form')),
469
- 'has_tables': bool(soup.find('table')),
470
- 'was_base64_decoded': was_base64_decoded,
471
- 'special_elements': special_elements,
472
- 'page_size_kb': len(response.content) / 1024
473
- }
474
- }
475
-
476
- except Exception as e:
477
- logger.error(f"Failed to parse HTML: {str(e)}", exc_info=True)
478
- # Fallback to text extraction
479
- return await self._extract_from_text(response, url)
480
-
481
- def _detect_special_elements(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
482
- """
483
- Detect special elements that might contain or lead to the task
484
-
485
- Args:
486
- soup: BeautifulSoup parsed HTML
487
- base_url: Base URL for resolving relative URLs
488
-
489
- Returns:
490
- Dict of detected elements with absolute URLs
491
- """
492
- elements = {
493
- 'audio_urls': [],
494
- 'video_urls': [],
495
- 'image_urls': [],
496
- 'download_links': [],
497
- 'iframe_sources': [],
498
- 'external_links': [],
499
- 'form_actions': [],
500
- 'javascript_files': []
501
- }
502
-
503
- # Audio elements
504
- for audio in soup.find_all(['audio', 'source']):
505
- src = audio.get('src')
506
- if src:
507
- # Check for audio extensions or content type
508
- is_audio = any(ext in src.lower() for ext in ['.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac'])
509
- audio_type = audio.get('type', '')
510
- if is_audio or 'audio/' in audio_type:
511
- absolute_url = urljoin(base_url, src)
512
- elements['audio_urls'].append(absolute_url)
513
- logger.debug(f"Found audio: {absolute_url}")
514
-
515
- # Video elements
516
- for video in soup.find_all(['video', 'source']):
517
- src = video.get('src')
518
- if src:
519
- is_video = any(ext in src.lower() for ext in ['.mp4', '.webm', '.avi', '.mov', '.mkv'])
520
- video_type = video.get('type', '')
521
- if is_video or 'video/' in video_type:
522
- absolute_url = urljoin(base_url, src)
523
- elements['video_urls'].append(absolute_url)
524
- logger.debug(f"Found video: {absolute_url}")
525
-
526
- # YouTube/Vimeo iframes
527
- for iframe in soup.find_all('iframe'):
528
- src = iframe.get('src', '')
529
- if 'youtube.com' in src or 'vimeo.com' in src:
530
- elements['video_urls'].append(src)
531
- logger.debug(f"Found video iframe: {src}")
532
- elif src:
533
- absolute_url = urljoin(base_url, src)
534
- elements['iframe_sources'].append(absolute_url)
535
- logger.debug(f"Found iframe: {absolute_url}")
536
-
537
- # Image elements (might contain QR codes, screenshots with tasks, etc.)
538
- for img in soup.find_all('img'):
539
- src = img.get('src')
540
- if src:
541
- absolute_url = urljoin(base_url, src)
542
- # Only include if it looks like it might contain data (not decorative)
543
- alt_text = img.get('alt', '').lower()
544
- if any(keyword in alt_text for keyword in ['task', 'question', 'instruction', 'qr', 'code']):
545
- elements['image_urls'].append(absolute_url)
546
- logger.debug(f"Found relevant image: {absolute_url}")
547
-
548
- # Download links
549
- for link in soup.find_all('a', href=True):
550
- href = link.get('href', '')
551
- link_text = link.get_text().strip().lower()
552
-
553
- # Check for downloadable files
554
- download_extensions = [
555
- '.pdf', '.csv', '.xlsx', '.xls', '.zip', '.txt',
556
- '.json', '.xml', '.doc', '.docx', '.tsv'
557
- ]
558
-
559
- is_download = any(ext in href.lower() for ext in download_extensions)
560
-
561
- # Check for text indicating download or external task
562
- download_keywords = ['download', 'get task', 'click here', 'task file', 'see task']
563
- has_download_text = any(keyword in link_text for keyword in download_keywords)
564
-
565
- if is_download:
566
- absolute_url = urljoin(base_url, href)
567
- elements['download_links'].append(absolute_url)
568
- logger.debug(f"Found download link: {absolute_url}")
569
- elif has_download_text:
570
- absolute_url = urljoin(base_url, href)
571
- elements['external_links'].append(absolute_url)
572
- logger.debug(f"Found external link: {absolute_url}")
573
-
574
- # Forms (might need to submit to get task)
575
- for form in soup.find_all('form'):
576
- action = form.get('action')
577
- if action:
578
- absolute_url = urljoin(base_url, action)
579
- elements['form_actions'].append(absolute_url)
580
- logger.debug(f"Found form action: {absolute_url}")
581
-
582
- # JavaScript files (might load task dynamically)
583
- for script in soup.find_all('script', src=True):
584
- src = script.get('src')
585
- if src and not any(cdn in src for cdn in ['google', 'cdn', 'jquery']):
586
- absolute_url = urljoin(base_url, src)
587
- elements['javascript_files'].append(absolute_url)
588
-
589
- # Remove duplicates
590
- for key in elements:
591
- elements[key] = list(set(elements[key]))
592
-
593
- return elements
594
-
595
- async def _extract_from_text(
596
- self,
597
- response: httpx.Response,
598
- url: str
599
- ) -> Dict[str, Any]:
600
- """
601
- Extract task from plain text response with base64 detection
602
-
603
- Args:
604
- response: HTTP response
605
- url: Original URL
606
-
607
- Returns:
608
- Dict with task information
609
- """
610
- logger.debug("Extracting task from plain text")
611
-
612
- text = response.text.strip()
613
-
614
- # Check for base64 encoding
615
- original_text = text
616
- text = self._detect_and_decode_base64(text)
617
- was_base64_decoded = (original_text != text)
618
-
619
- return {
620
- 'task_description': text,
621
- 'raw_content': response.text,
622
- 'content_type': 'text',
623
- 'url': url,
624
- 'metadata': {
625
- 'length': len(text),
626
- 'lines': text.count('\n') + 1,
627
- 'was_base64_decoded': was_base64_decoded,
628
- 'special_elements': {}
629
  }
630
- }
631
-
632
- async def _extract_from_binary(
633
- self,
634
- response: httpx.Response,
635
- url: str
636
- ) -> Dict[str, Any]:
637
- """
638
- Handle binary content (PDF, audio, video, images, CSV)
639
- Returns info that LLM will need to process
640
-
641
- Args:
642
- response: HTTP response
643
- url: Original URL
644
-
645
- Returns:
646
- Dict with task information indicating processing needed
647
- """
648
- content_type = response.headers.get('content-type', 'unknown')
649
-
650
- logger.warning(f"⚠️ Binary content detected: {content_type}")
651
-
652
- task_description = f"Binary content detected. Type: {content_type}. URL: {url}. "
653
-
654
- if 'pdf' in content_type:
655
- task_description += "This is a PDF file that needs to be downloaded and parsed."
656
- elif 'audio' in content_type:
657
- task_description += "This is an audio file that needs to be transcribed."
658
- elif 'video' in content_type:
659
- task_description += "This is a video file that might need processing or transcription."
660
- elif 'image' in content_type:
661
- task_description += "This is an image that might need OCR or vision analysis."
662
- elif 'csv' in content_type:
663
- task_description += "This is a CSV file that needs to be downloaded and parsed."
664
- else:
665
- task_description += "This needs to be downloaded and processed."
666
-
667
- return {
668
- 'task_description': task_description,
669
- 'raw_content': '', # Don't include binary in raw_content
670
- 'content_type': content_type,
671
- 'url': url,
672
- 'metadata': {
673
- 'is_binary': True,
674
- 'content_length': len(response.content),
675
- 'requires_download': True,
676
- 'special_elements': {
677
- 'download_links': [url]
678
- }
679
- }
680
- }
681
-
682
- def _detect_and_decode_base64(self, text: str) -> str:
683
- """
684
- Detect and decode base64 content in text
685
-
686
- Args:
687
- text: Text that might contain base64
688
-
689
- Returns:
690
- Decoded text if base64 found, original text otherwise
691
- """
692
- # Pattern to detect base64 strings (at least 20 chars, typical base64 chars)
693
- # Must be fairly long to avoid false positives
694
- base64_pattern = r'([A-Za-z0-9+/]{40,}={0,2})'
695
-
696
- matches = re.findall(base64_pattern, text)
697
-
698
- if not matches:
699
- return text
700
-
701
- logger.debug(f"Found {len(matches)} potential base64 strings")
702
-
703
- decoded_parts = []
704
-
705
- for match in matches:
706
  try:
707
- # Try to decode
708
- decoded_bytes = base64.b64decode(match, validate=True)
709
- decoded_text = decoded_bytes.decode('utf-8', errors='ignore')
710
-
711
- # Check if decoded text is readable (not binary)
712
- if self._is_readable_text(decoded_text):
713
- logger.info(
714
- f"βœ“ Decoded base64 string "
715
- f"(length: {len(match)} β†’ {len(decoded_text)} chars)"
716
- )
717
- decoded_parts.append(decoded_text)
718
- else:
719
- logger.debug("Base64 decoded to binary/unreadable data")
720
-
721
  except Exception as e:
722
- logger.debug(f"Not valid base64: {str(e)}")
723
- continue
724
 
725
- # If we successfully decoded anything, return the best candidate
726
- if decoded_parts:
727
- # Use the longest decoded string as it's likely the main content
728
- result = max(decoded_parts, key=len)
729
- logger.info(f"βœ… Using decoded base64 content ({len(result)} chars)")
730
- return result
731
-
732
- return text
733
-
734
- def _is_readable_text(self, text: str, min_printable_ratio: float = 0.7) -> bool:
735
- """
736
- Check if decoded text is human-readable
737
-
738
- Args:
739
- text: Text to check
740
- min_printable_ratio: Minimum ratio of printable characters
741
-
742
- Returns:
743
- bool: True if text appears readable
744
- """
745
- if not text or len(text) < 10:
746
  return False
747
-
748
- # Count printable characters (letters, numbers, punctuation, spaces)
749
- printable_count = sum(c.isprintable() or c.isspace() for c in text)
750
- ratio = printable_count / len(text)
751
-
752
- # Also check for some common words to confirm it's text
753
- has_common_words = any(
754
- word in text.lower()
755
- for word in ['the', 'and', 'task', 'data', 'file', 'http']
756
  )
757
 
758
- return ratio >= min_printable_ratio and (ratio > 0.9 or has_common_words)
759
-
760
- def _needs_llm_analysis(self, task_info: Dict[str, Any]) -> bool:
761
- """
762
- Determine if fetched content needs LLM analysis to extract actual task
763
-
764
- Args:
765
- task_info: Fetched task information
766
-
767
- Returns:
768
- bool: True if LLM analysis needed
769
- """
770
- metadata = task_info.get('metadata', {})
771
- task_desc = task_info.get('task_description', '').lower()
772
 
773
- # Check 1: Binary content always needs LLM
774
- if metadata.get('is_binary'):
775
- logger.info("πŸ€– Binary content detected - LLM analysis required")
776
- return True
777
-
778
- # Check 2: Special elements present
779
- special_elements = metadata.get('special_elements', {})
780
- has_audio = bool(special_elements.get('audio_urls'))
781
- has_video = bool(special_elements.get('video_urls'))
782
- has_downloads = bool(special_elements.get('download_links'))
783
- has_iframes = bool(special_elements.get('iframe_sources'))
784
- has_images = bool(special_elements.get('image_urls'))
785
- has_forms = bool(special_elements.get('form_actions'))
786
-
787
- if any([has_audio, has_video, has_downloads, has_iframes, has_images, has_forms]):
788
- logger.info(
789
- f"πŸ€– Special elements detected - LLM analysis recommended "
790
- f"(audio:{has_audio}, video:{has_video}, downloads:{has_downloads}, "
791
- f"iframes:{has_iframes}, images:{has_images}, forms:{has_forms})"
792
  )
793
- return True
794
-
795
- # Check 3: Very short content (likely incomplete)
796
- if len(task_desc.strip()) < 30:
797
- logger.info("πŸ€– Very short content - LLM analysis recommended")
798
- return True
799
-
800
- # Check 4: Indirect language suggesting further action needed
801
- indirect_keywords = [
802
- 'click here', 'download', 'visit', 'listen to',
803
- 'watch', 'see attached', 'refer to', 'check the',
804
- 'navigate to', 'go to', 'follow the link',
805
- 'open the file', 'play the', 'view the'
806
- ]
807
-
808
- has_indirect_language = any(keyword in task_desc for keyword in indirect_keywords)
809
-
810
- if has_indirect_language:
811
- logger.info("πŸ€– Indirect language detected - LLM analysis recommended")
812
- return True
813
-
814
- # Check 5: Multiple URLs in content (might need to visit them)
815
- url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
816
- urls_in_content = re.findall(url_pattern, task_desc)
817
-
818
- if len(urls_in_content) > 1:
819
- logger.info(f"πŸ€– Multiple URLs found ({len(urls_in_content)}) - LLM analysis recommended")
820
- return True
821
-
822
- # Content seems straightforward
823
- logger.info("βœ“ Content appears straightforward - LLM analysis not required")
824
- return False
825
 
826
 
827
- # Convenience function for quick usage
828
- async def fetch_task_from_url(url: str) -> Dict[str, Any]:
829
- """
830
- Convenience function to fetch task from URL
831
-
832
- Args:
833
- url: URL to fetch task from
834
-
835
- Returns:
836
- Dict with task information
837
-
838
- Raises:
839
- TaskProcessingError: If fetching fails
840
- """
841
- async with TaskFetcher() as fetcher:
842
- return await fetcher.fetch_task(url)
 
1
  """
2
+ Task Fetcher Service - with Static/Dynamic Scraper fallback
3
+ Fetches and extracts task descriptions from URLs
4
  """
5
 
6
  import httpx
7
  import json
 
8
  import re
9
  from typing import Optional, Dict, Any, List
10
+ from urllib.parse import urlparse
11
  from bs4 import BeautifulSoup
12
 
13
  from app.core.config import settings
14
  from app.core.logging import get_logger
15
  from app.core.exceptions import TaskProcessingError
16
+ from app.utils.llm_client import get_llm_client
17
+ from app.utils.prompts import AnalysisPrompts
18
 
19
  logger = get_logger(__name__)
20
 
21
 
22
  class TaskFetcher:
23
  """
24
+ Enhanced service for fetching and extracting task descriptions from URLs.
25
+ Strategy:
26
+ 1. httpx (fast)
27
+ 2. If content looks JS-only/empty β†’ DynamicScraper
28
+ 3. Let orchestrator use Static/Dynamic scrapers later for real data pages
29
  """
30
 
31
  def __init__(self, timeout: int = 30):
32
  self.timeout = timeout
33
+ self.client: Optional[httpx.AsyncClient] = None
34
+ self.llm_client = get_llm_client()
35
+
36
+ # Import here to avoid circular imports
37
+ from app.orchestrator.models import UnifiedTaskAnalysis
38
+
39
+ self._content_analyzer_agent = self.llm_client.create_agent(
40
+ output_type=UnifiedTaskAnalysis,
41
+ system_prompt=(
42
+ "You are an expert at analyzing task content. "
43
+ "You detect redirects, extract submission URLs, and parse instructions."
44
+ ),
45
+ retries=2
46
+ )
47
+
48
+ logger.debug("TaskFetcher initialized with unified LLM analysis")
49
 
50
  async def __aenter__(self):
 
51
  self.client = httpx.AsyncClient(
52
  timeout=self.timeout,
53
  follow_redirects=True,
 
59
  return self
60
 
61
  async def __aexit__(self, exc_type, exc_val, exc_tb):
 
62
  if self.client:
63
  await self.client.aclose()
64
+
65
+ # ======================================================================
66
+ # PUBLIC ENTRY POINT
67
+ # ======================================================================
68
+
69
+ async def fetch_and_analyze(self, url: str, base_url: Optional[str] = None) -> Dict[str, Any]:
70
  """
71
+ Fetch URL and perform unified LLM analysis.
72
+ 1. httpx + basic extraction
73
+ 2. If JS-only / empty β†’ DynamicScraper
74
+ 3. LLM: redirect + submission_url + instructions
75
+ """
76
+ logger.info(f"πŸ“₯ Fetching and analyzing URL: {url}")
77
+ if base_url is None:
78
+ base_url = url
79
 
80
+ # Step 1: Fetch visible content (with fallback)
81
+ content = await self._fetch_content(url)
82
+
83
+ logger.debug(f"Task description length after fetch: {len(content['task_description'])}")
84
+
85
+ # Step 2: Unified LLM analysis
86
+ analysis = await self._analyze_content_with_llm(
87
+ task_description=content['task_description'],
88
+ raw_content=content['raw_content'],
89
+ url=url,
90
+ base_url=base_url
91
+ )
92
+
93
+ # Merge content + analysis
94
+ result = {
95
+ **content,
96
+ 'is_redirect': analysis.is_redirect,
97
+ 'question_url': analysis.question_url,
98
+ 'submission_url': analysis.submission_url,
99
+ 'instructions': self._format_instructions(analysis.instructions),
100
+ 'overall_goal': analysis.overall_goal,
101
+ 'complexity': analysis.complexity,
102
+ 'llm_analysis': {
103
+ 'redirect_reasoning': analysis.redirect_reasoning,
104
+ 'submission_reasoning': analysis.submission_reasoning,
105
+ 'confidence': analysis.confidence,
106
  }
107
+ }
 
 
 
 
108
 
109
+ # Resolve relative submission URL if needed
110
+ if analysis.submission_url and analysis.submission_url_is_relative:
111
+ absolute = str(httpx.URL(base_url).join(analysis.submission_url))
112
+ logger.info(f"βœ“ Resolved relative submission URL: {analysis.submission_url} β†’ {absolute}")
113
+ result['submission_url'] = absolute
114
+
115
+ # Resolve relative question URL if needed
116
+ if analysis.question_url and analysis.question_url.startswith('/'):
117
+ absolute_q = str(httpx.URL(base_url).join(analysis.question_url))
118
+ logger.info(f"βœ“ Resolved relative question URL: {analysis.question_url} β†’ {absolute_q}")
119
+ result['question_url'] = absolute_q
120
+
121
+ logger.info("βœ… Analysis complete:")
122
+ logger.info(f" Is Redirect: {result['is_redirect']}")
123
+ logger.info(f" Submission URL: {result['submission_url']}")
124
+ logger.info(f" Instructions: {len(result['instructions'])} steps")
125
+ logger.info(f" Complexity: {result['complexity']}")
126
+
127
+ return result
128
+
129
+ # ======================================================================
130
+ # FETCHING WITH FALLBACK TO DYNAMIC SCRAPER
131
+ # ======================================================================
132
+
133
+ async def _fetch_content(self, url: str) -> Dict[str, Any]:
134
+ """
135
+ Fetch content from URL.
136
+ - Try httpx first
137
+ - If JS-only/empty β†’ fallback to DynamicScraper
138
+ """
139
  if not self._is_valid_url(url):
 
140
  raise TaskProcessingError(f"Invalid URL format: {url}")
141
 
142
  try:
 
143
  response = await self._fetch_url(url)
 
 
144
  content_type = self._detect_content_type(response)
145
 
146
+ # Basic extraction
147
+ task_description = await self._extract_basic_content(response, content_type)
148
+ raw_content = response.text[:5000]
149
 
150
+ # Heuristic: if nothing useful, try dynamic scraper
151
+ if self._looks_js_only(task_description, raw_content):
152
+ logger.warning("⚠️ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
153
+ dyn = await self._fetch_with_dynamic_scraper(url)
154
+ task_description = dyn['task_description']
155
+ raw_content = dyn['raw_content']
156
 
157
+ return {
158
+ 'task_description': task_description,
159
+ 'raw_content': raw_content,
160
+ 'content_type': content_type,
161
+ 'url': url,
162
+ 'metadata': {
163
+ 'content_length': len(response.content),
164
+ 'status_code': response.status_code,
165
+ }
166
+ }
167
168
  except Exception as e:
169
+ logger.error(f"❌ Failed to fetch content: {e}", exc_info=True)
170
+ raise TaskProcessingError(f"Failed to fetch URL: {str(e)}")
171
 
172
  async def _fetch_url(self, url: str) -> httpx.Response:
173
+ """Fetch with httpx and retries."""
174
+ max_retries = getattr(settings, "MAX_RETRIES", 3)
175
 
176
  for attempt in range(max_retries):
177
  try:
178
+ logger.debug(f"HTTPX fetch attempt {attempt + 1}/{max_retries} for {url}")
 
179
  response = await self.client.get(url)
180
  response.raise_for_status()
181
  return response
 
182
  except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
183
+ logger.warning(f"Attempt {attempt + 1} failed: {e}")
184
  if attempt == max_retries - 1:
 
185
  raise
 
 
 
 
186
  continue
187
+
188
+ def _looks_js_only(self, task_description: str, html: str) -> bool:
189
  """
190
+ Detect JS-only / empty pages that need dynamic rendering.
191
+ - Empty or tiny text
192
+ - Has <script> that uses atob/innerHTML/URLSearchParams
193
  """
194
+ if task_description and len(task_description.strip()) > 50:
195
+ return False
196
 
197
+ # Strong JS signals
198
+ js_markers = ['atob(', 'innerHTML', 'URLSearchParams', 'document.querySelector']
199
+ if any(marker in html for marker in js_markers):
200
+ return True
201
 
202
+ # Very little visible text after stripping scripts
203
+ cleaned = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
204
+ if len(cleaned.strip()) < 100:
205
+ return True
206
 
207
+ return False
208
+
209
+ async def _fetch_with_dynamic_scraper(self, url: str) -> Dict[str, Any]:
 
 
 
 
210
  """
211
+ Use DynamicScraper to render the page and extract visible text
212
+ for instruction pages.
213
  """
214
+ from app.modules.scrapers.dynamic_scraper import DynamicScraper
215
 
216
+ scraper = DynamicScraper(use_pool=True)
217
+ await scraper.initialize()
218
  try:
219
+ # Auto-extract text blocks
220
+ result = await scraper.scrape_url(url)
221
+ if not result.success:
222
+ raise RuntimeError(result.error or "Dynamic scraping failed")
223
 
224
+ # DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
225
+ texts: List[str] = []
226
+ if isinstance(result.data, list):
227
+ for row in result.data:
228
+ if isinstance(row, dict) and 'text' in row:
229
+ texts.append(str(row['text']))
230
 
231
+ task_text = "\n".join(texts) if texts else ""
 
 
232
 
233
+ logger.info(f"βœ“ Got {len(texts)} text blocks via DynamicScraper")
 
 
234
 
235
+ # Best-effort raw_content: you could extend DynamicScraper to return page.content()
236
  return {
237
+ 'task_description': task_text,
238
+ 'raw_content': task_text[:5000], # at least something readable
239
  }
240
+ finally:
241
+ await scraper.cleanup()
242
+
243
+ # ======================================================================
244
+ # BASIC EXTRACTION (NO LLM)
245
+ # ======================================================================
246
+
247
+ async def _extract_basic_content(self, response: httpx.Response, content_type: str) -> str:
248
+ """Fast, no-JS extraction for instruction pages."""
249
+ if content_type == 'json':
250
+ try:
251
+ data = response.json()
252
+ for field in ['task', 'description', 'question', 'content', 'text']:
253
+ if isinstance(data, dict) and field in data:
254
+ return str(data[field])
255
+ return json.dumps(data)
256
+ except Exception:
257
+ return response.text
258
+
259
+ if content_type == 'html':
260
  try:
261
+ html = response.text
262
+ soup = BeautifulSoup(html, 'html.parser')
263
+ for script in soup(['script', 'style', 'nav', 'header', 'footer']):
264
+ script.decompose()
265
+ text = soup.get_text(strip=True, separator=' ')
266
+ return text
267
  except Exception as e:
268
+ logger.error(f"HTML basic extraction failed: {e}")
269
+ return response.text
270
 
271
+ return response.text
272
+
273
+ def _detect_content_type(self, response: httpx.Response) -> str:
274
+ ct = response.headers.get('content-type', '').lower()
275
+ if 'application/json' in ct:
276
+ return 'json'
277
+ if 'text/html' in ct or '<html' in response.text.lower()[:200]:
278
+ return 'html'
279
+ return 'text'
280
+
281
+ def _is_valid_url(self, url: str) -> bool:
282
+ try:
283
+ r = urlparse(url)
284
+ return r.scheme in ('http', 'https') and bool(r.netloc)
285
+ except Exception:
286
  return False
287
+
288
+ # ======================================================================
289
+ # LLM ANALYSIS
290
+ # ======================================================================
291
+
292
+ async def _analyze_content_with_llm(
293
+ self,
294
+ task_description: str,
295
+ raw_content: str,
296
+ url: str,
297
+ base_url: str
298
+ ):
299
+ """Unified LLM analysis."""
300
+ logger.info("πŸ€– Running unified LLM analysis...")
301
+
302
+ url_pattern = r'https?://[^\s<>"\']+(?:/[^\s<>"\']*)?'
303
+ all_urls = re.findall(url_pattern, task_description + raw_content[:1000])
304
+ all_urls = list({u.rstrip('.,;:)') for u in all_urls})
305
+
306
+ prompt = AnalysisPrompts.unified_content_analysis_prompt(
307
+ task_description=task_description[:2000],
308
+ found_urls=all_urls,
309
+ current_url=url,
310
+ base_url=base_url
311
  )
312
 
313
+ from app.orchestrator.models import UnifiedTaskAnalysis
 
 
 
 
 
314
 
315
+ try:
316
+ analysis: UnifiedTaskAnalysis = await self.llm_client.run_agent(
317
+ self._content_analyzer_agent,
318
+ prompt
 
 
 
 
319
  )
320
+ return analysis
321
+ except Exception as e:
322
+ logger.error(f"❌ LLM analysis failed: {e}", exc_info=True)
323
+ return self._fallback_analysis(task_description, all_urls, url, base_url)
 
 
 
 
324
 
325
+ def _fallback_analysis(
326
+ self,
327
+ task_description: str,
328
+ all_urls: List[str],
329
+ url: str,
330
+ base_url: str
331
+ ):
332
+ """Very simple fallback if LLM fails."""
333
+ from app.orchestrator.models import UnifiedTaskAnalysis, InstructionStep
334
+
335
+ logger.warning("⚠️ Using fallback pattern-based analysis")
336
+
337
+ is_redirect = False
338
+ submission_url = None
339
+
340
+ for pattern in [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']:
341
+ m = re.search(pattern, task_description, re.IGNORECASE)
342
+ if m:
343
+ submission_url = m.group(1).rstrip('.,;:)')
344
+ break
345
+
346
+ sentences = re.split(r'[.;\n]', task_description)
347
+ instructions = []
348
+ step = 1
349
+ for s in sentences:
350
+ s = s.strip()
351
+ if len(s) > 5:
352
+ instructions.append(InstructionStep(
353
+ step_number=step,
354
+ action='unknown',
355
+ description=s,
356
+ target=None,
357
+ dependencies=[]
358
+ ))
359
+ step += 1
360
+
361
+ return UnifiedTaskAnalysis(
362
+ is_redirect=is_redirect,
363
+ question_url=None,
364
+ redirect_reasoning="Fallback: no redirect detection",
365
+ submission_url=submission_url,
366
+ submission_url_is_relative=submission_url.startswith('/') if submission_url else False,
367
+ submission_reasoning="Fallback: simple regex match",
368
+ instructions=instructions,
369
+ overall_goal="Unknown (fallback)",
370
+ complexity="unknown",
371
+ confidence=0.3
372
+ )
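The fallback depends entirely on the two regex patterns above; a small check on an invented instruction sentence shows what they capture:

import re

patterns = [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']
task_description = "Find the secret code and POST to https://example.com/submit."

submission_url = None
for pattern in patterns:
    m = re.search(pattern, task_description, re.IGNORECASE)
    if m:
        # Same trailing-punctuation cleanup as the fallback above.
        submission_url = m.group(1).rstrip('.,;:)')
        break

print(submission_url)  # https://example.com/submit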
373
 
374
+ def _format_instructions(self, steps) -> List[Dict[str, Any]]:
375
+ return [
376
+ {
377
+ 'step': s.step_number,
378
+ 'action': s.action,
379
+ 'text': s.description,
380
+ 'target': s.target,
381
+ 'dependencies': s.dependencies,
382
+ }
383
+ for s in steps
384
+ ]
 
 
 
 
 
app/services/task_processor.py CHANGED
@@ -1,24 +1,25 @@
1
  """
2
  Task Processing Service
3
- Complete orchestration using OrchestratorEngine (Steps 1-8)
4
  """
5
 
6
- from typing import Dict, Any
7
-
8
  from app.models.request import TaskRequest
9
  from app.core.logging import get_logger
10
  from app.core.exceptions import TaskProcessingError
11
  from app.orchestrator.orchestrator_engine import OrchestratorEngine
12
  from app.modules.registry import ModuleRegistry
13
  from app.modules.mock_modules import register_mock_modules
 
 
14
 
15
  logger = get_logger(__name__)
16
 
17
 
18
  class TaskProcessor:
19
  """
20
- Service class for processing tasks
21
- Uses complete orchestration engine (Steps 1-8)
22
  """
23
 
24
  def __init__(
@@ -27,74 +28,177 @@ class TaskProcessor:
27
  enable_actions: bool = True,
28
  auto_register_modules: bool = True
29
  ):
30
- """
31
- Initialize task processor
32
-
33
- Args:
34
- enable_decomposition: Enable complex task decomposition
35
- enable_actions: Enable action execution (downloads, transcription, OCR)
36
- auto_register_modules: Auto-register mock modules
37
- """
38
- logger.info("πŸš€ Initializing TaskProcessor with OrchestratorEngine")
39
 
40
- # Setup module registry
41
  self.registry = ModuleRegistry()
 
42
 
43
  if auto_register_modules:
44
- logger.info("πŸ“¦ Registering mock modules...")
45
  register_mock_modules(self.registry)
46
  logger.info(f"βœ“ Registered {len(self.registry.get_all_modules())} modules")
47
 
48
- # Initialize orchestrator engine (Steps 1-8)
49
  self.orchestrator = OrchestratorEngine(
50
  module_registry=self.registry,
51
  enable_decomposition=enable_decomposition,
52
  enable_actions=enable_actions
53
  )
54
 
55
- logger.info("βœ… TaskProcessor initialized with complete orchestration")
56
 
57
  async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
58
  """
59
- Process a task based on the provided data
60
- Uses complete orchestration pipeline (Steps 1-8)
61
 
62
- Args:
63
- task_data: Validated task request
64
-
65
- Returns:
66
- Dict containing complete task results
67
-
68
- Raises:
69
- TaskProcessingError: If processing fails
70
  """
 
71
  logger.info(f"πŸ”„ Processing task for: {task_data.email}")
72
- logger.info(f"πŸ“‹ Task URL: {task_data.url}")
 
 
 
 
 
73
 
74
  try:
75
- # Execute complete orchestration
 
 
 
 
76
  logger.info("=" * 80)
77
- logger.info("EXECUTING COMPLETE ORCHESTRATION PIPELINE (Steps 1-8)")
 
 
 
 
 
 
78
  logger.info("=" * 80)
79
 
80
  orchestration_result = await self.orchestrator.execute_task(
81
- task_input=str(task_data.url),
82
- task_url=str(task_data.url),
83
- context={'email': task_data.email}
 
 
 
 
 
 
 
 
84
  )
85
 
86
- # Build response
87
- result = self._build_response(
88
- task_data=task_data,
89
- orchestration_result=orchestration_result
90
- )
91
 
92
- logger.info("=" * 80)
93
- logger.info(f"βœ… Task processing completed | Duration: {orchestration_result['duration']:.2f}s")
 
 
 
94
  logger.info("=" * 80)
95
 
96
- return result
97
-
 
 
 
 
 
 
 
98
  except TaskProcessingError:
99
  # Re-raise task processing errors
100
  raise
@@ -103,117 +207,158 @@ class TaskProcessor:
103
  logger.error(f"❌ Task processing failed: {str(e)}", exc_info=True)
104
  raise TaskProcessingError(f"Failed to process task: {str(e)}")
105
 
 
 
 
 
 
 
 
106
  def _build_response(
107
  self,
108
  task_data: TaskRequest,
109
- orchestration_result: Dict[str, Any]
 
 
 
 
 
 
110
  ) -> Dict[str, Any]:
111
  """
112
- Build API response from orchestration result
113
 
114
  Args:
115
  task_data: Original task request
 
 
 
 
116
  orchestration_result: Result from orchestrator
 
 
117
 
118
  Returns:
119
- Dict: Formatted response
120
  """
121
- # Extract key information from orchestration
122
- classification = orchestration_result.get('execution_details', {}).get('classification')
123
- parameters = orchestration_result.get('execution_details', {}).get('parameters')
 
124
 
125
- # Base response
126
- response = {
127
- 'status': 'completed' if orchestration_result['success'] else 'failed',
 
 
 
128
  'email': task_data.email,
129
- 'task_url': str(task_data.url),
130
- 'task_id': orchestration_result['task_id'],
131
- 'execution_id': orchestration_result['execution_id'],
132
- 'duration': orchestration_result['duration'],
133
- 'strategy': orchestration_result.get('strategy', 'unknown'),
134
- 'success': orchestration_result['success']
135
- }
136
-
137
- # Add execution details
138
- if orchestration_result['success']:
139
- response['result'] = {
140
- 'data': orchestration_result.get('data'),
141
- 'execution_details': orchestration_result.get('execution_details', {}),
142
- 'steps_completed': orchestration_result.get('steps', {})
143
- }
144
 
145
- # Add classification if available
146
- if classification:
147
- response['classification'] = self._format_classification(classification)
 
 
 
 
 
 
148
 
149
- # Add parameters if available
150
- if parameters:
151
- response['parameters'] = self._format_parameters(parameters)
 
 
 
 
 
152
 
153
- response['message'] = 'Task executed successfully through complete orchestration pipeline'
 
 
 
 
 
 
154
 
155
- else:
156
- response['error'] = orchestration_result.get('error', 'Unknown error')
157
- response['message'] = 'Task execution failed'
158
 
159
- # Add execution log
160
- response['execution_log'] = orchestration_result.get('execution_log', [])
 
161
 
162
- return response
163
-
164
- def _format_classification(self, classification: Any) -> Dict[str, Any]:
165
- """Format classification for API response"""
166
- try:
167
- return {
168
- 'primary_task': classification.primary_task.value,
169
- 'secondary_tasks': [t.value for t in classification.secondary_tasks],
170
- 'complexity': classification.complexity.value,
171
- 'estimated_steps': classification.estimated_steps,
172
- 'requires_javascript': classification.requires_javascript,
173
- 'requires_authentication': classification.requires_authentication,
174
- 'requires_external_data': classification.requires_external_data,
175
- 'output_format': classification.output_format.value,
176
- 'confidence': classification.confidence,
177
- 'reasoning': classification.reasoning,
178
- 'key_entities': classification.key_entities,
179
- 'suggested_tools': classification.suggested_tools
180
- }
181
- except Exception as e:
182
- logger.warning(f"Could not format classification: {e}")
183
- return {'error': 'Classification format error'}
184
 
185
- def _format_parameters(self, parameters: Any) -> Dict[str, Any]:
186
- """Format parameters for API response"""
187
- try:
188
- return {
189
- 'data_sources': [
190
- {
191
- 'type': ds.type,
192
- 'location': ds.location,
193
- 'format': ds.format,
194
- 'description': ds.description
195
- }
196
- for ds in parameters.data_sources
197
- ],
198
- 'filters': [
199
- {
200
- 'field': f.field,
201
- 'operator': f.operator,
202
- 'value': f.value,
203
- 'description': f.description
204
- }
205
- for f in parameters.filters
206
- ],
207
- 'columns': [col.name for col in parameters.columns],
208
- 'aggregations': len(parameters.aggregations),
209
- 'visualizations': len(parameters.visualizations),
210
- 'output_format': parameters.output.format if parameters.output else None,
211
- 'confidence': parameters.confidence,
212
- 'complexity_score': parameters.complexity_score
213
- }
214
- except Exception as e:
215
- logger.warning(f"Could not format parameters: {e}")
216
- return {'error': 'Parameters format error'}
217
 
218
  def get_registry(self) -> ModuleRegistry:
219
  """Get module registry for adding/removing modules"""
@@ -223,7 +368,16 @@ class TaskProcessor:
223
  """Get orchestrator engine for advanced usage"""
224
  return self.orchestrator
225
 
226
- def cleanup(self):
 
 
 
 
227
  """Clean up resources"""
228
- self.orchestrator.cleanup()
229
- logger.info("TaskProcessor cleanup complete")
 
 
 
 
 
 
1
  """
2
  Task Processing Service
3
+ Simplified with unified LLM analysis in task_fetcher
4
  """
5
 
6
+ from typing import Dict, Any, Optional
 
7
  from app.models.request import TaskRequest
8
  from app.core.logging import get_logger
9
  from app.core.exceptions import TaskProcessingError
10
  from app.orchestrator.orchestrator_engine import OrchestratorEngine
11
  from app.modules.registry import ModuleRegistry
12
  from app.modules.mock_modules import register_mock_modules
13
+ from app.services.task_fetcher import TaskFetcher
14
+ # from app.orchestrator.answer_submitter import AnswerSubmitter  # TODO: re-enable once answer submission (Steps 5-6) is restored
15
 
16
  logger = get_logger(__name__)
17
 
18
 
19
  class TaskProcessor:
20
  """
21
+ Service class for processing TDS quiz tasks
22
+ Uses unified LLM analysis from task_fetcher
23
  """
24
 
25
  def __init__(
 
28
  enable_actions: bool = True,
29
  auto_register_modules: bool = True
30
  ):
31
+ """Initialize task processor"""
32
+ logger.info("πŸš€ Initializing TaskProcessor")
 
 
 
 
 
 
 
33
 
34
+ # Setup components
35
  self.registry = ModuleRegistry()
36
+ # self.answer_submitter = AnswerSubmitter()  # TODO: re-enable once answer submission is restored
37
 
38
  if auto_register_modules:
39
+ logger.info("πŸ“¦ Registering modules...")
40
  register_mock_modules(self.registry)
41
  logger.info(f"βœ“ Registered {len(self.registry.get_all_modules())} modules")
42
 
43
+ # Initialize orchestrator engine
44
  self.orchestrator = OrchestratorEngine(
45
  module_registry=self.registry,
46
  enable_decomposition=enable_decomposition,
47
  enable_actions=enable_actions
48
  )
49
 
50
+ logger.info("βœ… TaskProcessor initialized")
51
 
52
  async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
53
  """
54
+ Process TDS quiz task
 
55
 
56
+ Flow:
57
+ 1. Fetch and analyze Request URL (unified LLM call in task_fetcher)
58
+ 2. If redirect, fetch Question URL (unified LLM call in task_fetcher)
59
+ 3. Execute orchestration
60
+ 4. Extract answer
61
+ 5. Submit to TDS
62
+ 6. Build response
 
63
  """
64
+ logger.info("=" * 80)
65
  logger.info(f"πŸ”„ Processing task for: {task_data.email}")
66
+ logger.info(f"πŸ“‹ Request URL: {task_data.url}")
67
+ logger.info("=" * 80)
68
+
69
+ request_url = str(task_data.url)
70
+ question_url = None
71
+ submission_url = None
72
 
73
  try:
74
+ # ===================================================================
75
+ # STEP 1: FETCH AND ANALYZE REQUEST URL (UNIFIED LLM CALL)
76
+ # ===================================================================
77
+ logger.info("\n" + "=" * 80)
78
+ logger.info("STEP 1: FETCHING & ANALYZING REQUEST URL")
79
  logger.info("=" * 80)
80
+
81
+ async with TaskFetcher() as fetcher:
82
+ analysis = await fetcher.fetch_and_analyze(url=request_url)
83
+
84
+ logger.info(f"βœ“ Request URL analyzed")
85
+ logger.info(f" Is Redirect: {analysis['is_redirect']}")
86
+ logger.info(f" Complexity: {analysis['complexity']}")
87
+
88
+ # ===================================================================
89
+ # STEP 2: IF REDIRECT, FETCH QUESTION URL (UNIFIED LLM CALL)
90
+ # ===================================================================
91
+ if analysis['is_redirect'] and analysis['question_url']:
92
+ logger.info("\n" + "=" * 80)
93
+ logger.info("STEP 2: FETCHING & ANALYZING QUESTION URL")
94
+ logger.info("=" * 80)
95
+
96
+ question_url = analysis['question_url']
97
+ logger.info(f"πŸ”— Detected redirect to: {question_url}")
98
+
99
+ async with TaskFetcher() as fetcher:
100
+ analysis = await fetcher.fetch_and_analyze(
101
+ url=question_url,
102
+ base_url=request_url # For resolving relative URLs
103
+ )
104
+
105
+ logger.info(f"βœ“ Question URL analyzed")
106
+ logger.info(f" Task: {analysis['task_description'][:100]}...")
107
+ else:
108
+ question_url = request_url
109
+ logger.info(f"βœ“ Request URL contains actual task (no redirect)")
110
+
111
+ # Extract key information
112
+ task_description = analysis['task_description']
113
+ submission_url = analysis.get('submission_url')
114
+ instructions = analysis.get('instructions', [])
115
+
116
+ # Log URL hierarchy
117
+ logger.info("\nπŸ“ URL Hierarchy:")
118
+ logger.info(f" Request URL: {request_url}")
119
+ logger.info(f" Question URL: {question_url}")
120
+ logger.info(f" Submission URL: {submission_url}")
121
+ logger.info(f" Instructions: {len(instructions)} steps")
122
+
123
+ # ===================================================================
124
+ # STEP 3: EXECUTE ORCHESTRATION
125
+ # ===================================================================
126
+ logger.info("\n" + "=" * 80)
127
+ logger.info("STEP 3: EXECUTING ORCHESTRATION")
128
  logger.info("=" * 80)
129
 
130
  orchestration_result = await self.orchestrator.execute_task(
131
+ task_input=task_description,
132
+ task_url=question_url,
133
+ context={
134
+ 'email': task_data.email,
135
+ 'request_url': request_url,
136
+ 'question_url': question_url,
137
+ 'submission_url': submission_url,
138
+ 'instructions': instructions,
139
+ 'complexity': analysis['complexity'],
140
+ 'overall_goal': analysis['overall_goal']
141
+ }
142
  )
143
 
144
+ logger.info(f"βœ“ Orchestration completed")
145
+ logger.info(f" Success: {orchestration_result['success']}")
146
+ logger.info(f" Duration: {orchestration_result['duration']:.2f}s")
 
 
147
 
148
+ # ===================================================================
149
+ # STEP 4: EXTRACT ANSWER
150
+ # ===================================================================
151
+ logger.info("\n" + "=" * 80)
152
+ logger.info("STEP 4: EXTRACTING ANSWER")
153
  logger.info("=" * 80)
154
 
155
+ answer = self._extract_answer(orchestration_result)
156
+ logger.info(f"βœ“ Answer extracted: {str(answer)[:200]}")
157
+
158
+ # # ===================================================================
159
+ # # STEP 5: SUBMIT ANSWER TO TDS
160
+ # # ===================================================================
161
+ # logger.info("\n" + "=" * 80)
162
+ # logger.info("STEP 5: SUBMITTING ANSWER TO TDS")
163
+ # logger.info("=" * 80)
164
+
165
+ # submission_result = await self.answer_submitter.submit_answer(
166
+ # email=task_data.email,
167
+ # secret=task_data.secret,
168
+ # url=question_url, # βœ… Use Question URL, not Request URL
169
+ # answer=answer,
170
+ # submission_url=submission_url
171
+ # )
172
+
173
+ # logger.info(f"βœ“ Submission completed")
174
+ # logger.info(f" Success: {submission_result['success']}")
175
+ # logger.info(f" Status Code: {submission_result.get('status_code')}")
176
+
177
+ # if submission_result.get('response'):
178
+ # logger.info(f" Response: {submission_result['response']}")
179
+
180
+ # # ===================================================================
181
+ # # STEP 6: BUILD RESPONSE
182
+ # # ===================================================================
183
+ # result = self._build_response(
184
+ # task_data=task_data,
185
+ # request_url=request_url,
186
+ # question_url=question_url,
187
+ # submission_url=submission_url,
188
+ # analysis=analysis, # βœ… Fixed: pass analysis, not task_content
189
+ # orchestration_result=orchestration_result,
190
+ # submission_result=submission_result,
191
+ # answer=answer
192
+ # )
193
+
194
+ # logger.info("\n" + "=" * 80)
195
+ # logger.info(f"βœ… TASK COMPLETED SUCCESSFULLY")
196
+ # logger.info(f" Total Duration: {orchestration_result['duration']:.2f}s")
197
+ # logger.info(f" Answer Submitted: {submission_result['success']}")
198
+ # logger.info("=" * 80)
199
+
200
+ # return result # βœ… Fixed: actually return the result
201
+ return  # NOTE: Steps 5-6 (submission and response build) are commented out above, so nothing is returned yet
202
  except TaskProcessingError:
203
  # Re-raise task processing errors
204
  raise
 
207
  logger.error(f"❌ Task processing failed: {str(e)}", exc_info=True)
208
  raise TaskProcessingError(f"Failed to process task: {str(e)}")
209
 
210
+ def _extract_answer(self, orchestration_result: Dict[str, Any]) -> Any:
211
+ """
212
+ Extract final answer from orchestration result
213
+
214
+ Tries multiple field names and strategies to find the answer
215
+ """
216
+ logger.debug("Extracting answer from orchestration result")
217
+
218
+ if not orchestration_result.get('success'):
219
+ logger.warning("Orchestration was not successful")
220
+ return None
221
+
222
+ data = orchestration_result.get('data', {})
223
+
224
+ # If data is not a dict, return it directly
225
+ if not isinstance(data, dict):
226
+ logger.debug(f"Data is {type(data).__name__}, returning as-is")
227
+ return data
228
+
229
+ # Try common answer field names
230
+ result_fields = [
231
+ 'answer', 'result', 'output', 'value', 'computed_value',
232
+ 'extracted_data', 'scraped_data', 'secret_code',
233
+ 'code', 'secret', 'solution', 'response'
234
+ ]
235
+
236
+ for field in result_fields:
237
+ if field in data:
238
+ logger.debug(f"Found answer in '{field}' field")
239
+ return data[field]
240
+
241
+ # If only one key, return its value
242
+ if len(data) == 1:
243
+ key = list(data.keys())[0]
244
+ logger.debug(f"Single key '{key}' in data, using its value")
245
+ return data[key]
246
+
247
+ # Return entire data dict as last resort
248
+ logger.debug("No specific answer field found, returning entire data")
249
+ return data
250
+
251
  def _build_response(
252
  self,
253
  task_data: TaskRequest,
254
+ request_url: str,
255
+ question_url: str,
256
+ submission_url: str,
257
+ analysis: Dict[str, Any], # βœ… Fixed: renamed from task_content
258
+ orchestration_result: Dict[str, Any],
259
+ submission_result: Dict[str, Any],
260
+ answer: Any
261
  ) -> Dict[str, Any]:
262
  """
263
+ Build comprehensive API response
264
 
265
  Args:
266
  task_data: Original task request
267
+ request_url: Original URL from API request
268
+ question_url: URL where actual task was found
269
+ submission_url: URL where answer was submitted
270
+ analysis: Unified analysis from task_fetcher
271
  orchestration_result: Result from orchestrator
272
+ submission_result: Result from TDS submission
273
+ answer: Extracted answer
274
 
275
  Returns:
276
+ Formatted response dict
277
  """
278
+ overall_success = (
279
+ orchestration_result['success'] and
280
+ submission_result['success']
281
+ )
282
 
283
+ return {
284
+ # Status
285
+ 'success': overall_success,
286
+ 'status': 'completed' if overall_success else 'failed',
287
+
288
+ # Request info
289
  'email': task_data.email,
 
 
 
 
290
 
291
+ # URL hierarchy
292
+ 'urls': {
293
+ 'request_url': request_url,
294
+ 'question_url': question_url,
295
+ 'submission_url': submission_url
296
+ },
297
+
298
+ # IDs and timing
299
+ 'task_id': orchestration_result.get('task_id'),
300
+ 'execution_id': orchestration_result.get('execution_id'),
301
+ 'duration': orchestration_result.get('duration'),
302
+ 'timestamp': orchestration_result.get('timestamp'),
303
+
304
+ # Answer
305
+ 'answer': answer,
306
+
307
+ # Submission details
308
+ 'submission': {
309
+ 'success': submission_result['success'],
310
+ 'status_code': submission_result.get('status_code'),
311
+ 'submitted_to': submission_url,
312
+ 'submitted_url': question_url, # URL included in payload
313
+ 'response': submission_result.get('response')
314
+ },
315
 
316
+ # Task details
317
+ 'task_details': {
318
+ 'task_description': analysis['task_description'][:500], # Truncate
319
+ 'complexity': analysis.get('complexity'),
320
+ 'overall_goal': analysis.get('overall_goal'),
321
+ 'instructions_count': len(analysis.get('instructions', [])),
322
+ 'was_redirect': analysis.get('is_redirect', False)
323
+ },
324
 
325
+ # Orchestration details
326
+ 'orchestration': {
327
+ 'success': orchestration_result['success'],
328
+ 'strategy': orchestration_result.get('strategy'),
329
+ 'steps_completed': list(orchestration_result.get('steps', {}).keys())
330
+ },
331
+
332
+ # LLM analysis metadata
333
+ 'llm_analysis': analysis.get('llm_analysis', {}),
334
+
335
+ # Message
336
+ 'message': self._build_message(overall_success, orchestration_result, submission_result)
337
+ }
338
+
339
+ def _build_message(
340
+ self,
341
+ overall_success: bool,
342
+ orchestration_result: Dict[str, Any],
343
+ submission_result: Dict[str, Any]
344
+ ) -> str:
345
+ """Build human-readable status message"""
346
+ if overall_success:
347
+ return "Task completed successfully and answer submitted to TDS"
348
 
349
+ if not orchestration_result['success']:
350
+ error = orchestration_result.get('error', 'Unknown error')
351
+ return f"Task execution failed: {error}"
352
 
353
+ if not submission_result['success']:
354
+ error = submission_result.get('error', 'Unknown error')
355
+ return f"Task completed but submission failed: {error}"
356
 
357
+ return "Task failed for unknown reason"
 
 
 
 
358
 
359
+ # ========================================================================
360
+ # UTILITY METHODS
361
+ # ========================================================================
 
 
 
 
362
 
363
  def get_registry(self) -> ModuleRegistry:
364
  """Get module registry for adding/removing modules"""
 
368
  """Get orchestrator engine for advanced usage"""
369
  return self.orchestrator
370
 
371
+ # def get_answer_submitter(self) -> AnswerSubmitter:
372
+ # """Get answer submitter for testing"""
373
+ # return self.answer_submitter
374
+
375
+ async def cleanup(self):
376
  """Clean up resources"""
377
+ try:
378
+ self.orchestrator.cleanup()
379
+ logger.info("βœ“ Orchestrator cleanup complete")
380
+ except Exception as e:
381
+ logger.warning(f"Orchestrator cleanup error: {e}")
382
+
383
+ logger.info("βœ… TaskProcessor cleanup complete")
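A minimal driver for the lifecycle above; the TaskRequest field values are placeholders, the orchestrator/LLM configuration is assumed to be in place, and process() currently returns None because Steps 5-6 are still commented out:

import asyncio

from app.models.request import TaskRequest
from app.services.task_processor import TaskProcessor

async def main():
    processor = TaskProcessor(enable_decomposition=True, enable_actions=True)
    task = TaskRequest(
        email="student@example.com",         # placeholder
        secret="not-a-real-secret",          # placeholder
        url="https://example.com/tds-task",  # placeholder
    )
    try:
        result = await processor.process(task)
        print(result)  # None until the submission/response steps are re-enabled
    finally:
        await processor.cleanup()

asyncio.run(main())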
app/utils/prompts.py CHANGED
@@ -54,7 +54,84 @@ Extract EVERYTHING that could be useful for task execution."""
54
 
55
  class AnalysisPrompts:
56
  """Prompts for data analysis and insight generation"""
57
-
 
 
 
 
58
  @staticmethod
59
  def analysis_planning_prompt(
60
  question: str,
@@ -543,3 +620,88 @@ Be detailed and actionable. Each step should be implementable."""
543
  lines.append(f" ... and {len(values) - 2} more")
544
 
545
  return "\n".join(lines)
 
 
 
 
54
 
55
  class AnalysisPrompts:
56
  """Prompts for data analysis and insight generation"""
57
+ @staticmethod
58
+ def unified_content_analysis_prompt(
59
+ task_description: str,
60
+ found_urls: List[str],
61
+ current_url: str,
62
+ base_url: str
63
+ ) -> str:
64
+ """Optimized unified analysis prompt"""
65
+ urls_text = "\n".join(f"- {url}" for url in found_urls) if found_urls else "None"
66
+
67
+ return f"""Analyze this quiz/task content and extract all critical information.
68
+
69
+ **Current URL:** {current_url}
70
+ **Base URL:** {base_url}
71
+
72
+ **Content:**
73
+ {task_description}
74
+
75
+ **URLs found:**
76
+ {urls_text}
77
+
78
+ ---
79
+
80
+ ## EXTRACT:
81
+
82
+ ### 1. SUBMISSION URL (Priority #1)
83
+ Where to POST the final answer.
84
+
85
+ **Search for:** "POST to", "submit to", "send to", "answer to"
86
+ **Extract from:** Text, markdown links `[text](URL)`, relative paths `/submit`
87
+ **Set submission_url_is_relative=True** if starts with `/`
88
+
89
+ ### 2. REDIRECT DETECTION
90
+ **is_redirect=True** if content says "visit URL" or "task at URL" (directs elsewhere)
91
+ **is_redirect=False** if content IS the task (has instructions)
92
+
93
+ Provide **question_url** if redirect detected.
94
+
95
+ ### 3. INSTRUCTION PARSING
96
+ Break into steps (ONLY if is_redirect=False).
97
+
98
+ **Actions:** scrape, extract, calculate, submit, download, transcribe, analyze, visit
99
+ **Each step:** step_number, action, description, target, dependencies
100
+
101
+ ### 4. ASSESSMENT
102
+ - **overall_goal**: One sentence
103
+ - **complexity**: trivial/simple/moderate/complex
104
+ - **confidence**: 0.0-1.0
105
+
106
+ ---
107
+
108
+ ## EXAMPLE:
109
+
110
+ **Input:**
111
+ "Scrape /data?email=... Get the secret code. POST code to [/submit](https://example.com/submit)"
112
+
113
+ **Output:**
114
+ {{
115
+ "is_redirect": false,
116
+ "question_url": null,
117
+ "redirect_reasoning": "Contains task instructions",
118
+ "submission_url": "/submit",
119
+ "submission_url_is_relative": true,
120
+ "submission_reasoning": "Found 'POST code to /submit'",
121
+ "instructions": [
122
+ {{"step_number": 1, "action": "scrape", "description": "Scrape /data page", "target": "/data?email=...", "dependencies": []}},
123
+ {{"step_number": 2, "action": "extract", "description": "Extract secret code", "target": "secret code", "dependencies": }},
124
+ {{"step_number": 3, "action": "submit", "description": "POST code to /submit", "target": "/submit", "dependencies": }}
125
+ ],
126
+ "overall_goal": "Scrape, extract, and submit secret code",
127
+ "complexity": "simple",
128
+ "confidence": 0.92
129
+ }}
130
+
131
+
132
+
133
+ Now analyze the content above."""
134
+
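The agent result is parsed into UnifiedTaskAnalysis from app.orchestrator.models; that module is not part of this diff, but judging from the fields populated in task_fetcher's _fallback_analysis it presumably looks roughly like this sketch (field names inferred, actual definitions may differ):

# Inferred sketch only; the real models live in app/orchestrator/models.py.
from typing import List, Optional
from pydantic import BaseModel

class InstructionStep(BaseModel):
    step_number: int
    action: str
    description: str
    target: Optional[str] = None
    dependencies: List[int] = []

class UnifiedTaskAnalysis(BaseModel):
    is_redirect: bool
    question_url: Optional[str] = None
    redirect_reasoning: str
    submission_url: Optional[str] = None
    submission_url_is_relative: bool = False
    submission_reasoning: str
    instructions: List[InstructionStep] = []
    overall_goal: str
    complexity: str
    confidence: float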
135
  @staticmethod
136
  def analysis_planning_prompt(
137
  question: str,
 
620
  lines.append(f" ... and {len(values) - 2} more")
621
 
622
  return "\n".join(lines)
623
+
624
+ @staticmethod
625
+ def url_detection_prompt(
626
+ content: str,
627
+ urls: list,
628
+ request_url: str
629
+ ) -> str:
630
+ """
631
+ Prompt for detecting Question URL from content
632
+
633
+ Args:
634
+ content: Fetched content
635
+ urls: List of URLs found in content
636
+ request_url: Original request URL
637
+
638
+ Returns:
639
+ Formatted prompt
640
+ """
641
+ urls_text = "\n".join(f"{i+1}. {url}" for i, url in enumerate(urls)) if urls else "None"
642
+
643
+ return f"""You are analyzing content from a TDS quiz/task system to determine URL relationships.
644
+
645
+ **Context:**
646
+ - We fetched content from: {request_url}
647
+ - This content might either:
648
+ 1. BE the actual task/question (return is_redirect=False)
649
+ 2. REDIRECT/POINT to another URL where the actual task is located (return is_redirect=True)
650
+
651
+ **URLs found in content:**
652
+ {urls_text}
653
+
654
+ **Content (truncated to 1500 chars):**
655
+ {content[:1500]}
656
+
657
+ **Your Task:**
658
+ Analyze if this content IS the actual task, or if it REDIRECTS to another URL to get the task.
659
+
660
+ **URL Type Definitions:**
661
+ - **question_url**: URL to visit to GET the actual task/question
662
+ - **data_url**: URL to GET/SCRAPE data from (as part of task instructions)
663
+ - **submission_url**: URL to POST the final answer to
664
+ - **reference_url**: URL for reference/documentation only
665
+
666
+ **Decision Rules:**
667
+
668
+ 1. **is_redirect=True** when:
669
+ - Content explicitly says "visit <url>", "your task is at <url>", "go to <url>"
670
+ - Content is very short (< 100 chars) with just a URL
671
+ - Content describes WHERE to find the task, not WHAT the task is
672
+ - Primary purpose is to direct you to another location
673
+
674
+ 2. **is_redirect=False** when:
675
+ - Content contains actual task instructions (scrape, analyze, calculate, etc.)
676
+ - URLs are mentioned as DATA SOURCES or SUBMISSION endpoints
677
+ - Content is the task itself, even if it references other URLs
678
+
679
+ **Examples:**
680
+
681
+ Example 1 (REDIRECT):
682
+ Content: "Your quiz is available at https://example.com/quiz-834"
683
+ β†’ is_redirect=True, question_url="https://example.com/quiz-834"
684
+ β†’ Reasoning: Content tells you WHERE to go, not WHAT to do
685
+
686
+ Example 2 (ACTUAL TASK):
687
+ Content: "Scrape https://example.com/data and extract the top 5 prices. Submit to https://example.com/submit"
688
+ β†’ is_redirect=False
689
+ β†’ Reasoning: This IS the task. URLs are for data source and submission, not for getting the task
690
+
691
+ Example 3 (REDIRECT with multiple URLs):
692
+ Content: "Visit https://example.com/quiz-834 to receive your assignment. You'll be asked to scrape another website."
693
+ β†’ is_redirect=True, question_url="https://example.com/quiz-834"
694
+ β†’ Reasoning: Content directs you to quiz-834 to GET the actual task
695
+
696
+ Example 4 (ACTUAL TASK with multiple URLs):
697
+ Content: "Download data from https://api.example.com/data and compare with https://example.com/reference. Calculate the difference."
698
+ β†’ is_redirect=False
699
+ β†’ Reasoning: This IS the task. Both URLs are data sources for completing the task
700
+
701
+ Example 5 (SHORT REDIRECT):
702
+ Content: "https://example.com/quiz-834"
703
+ β†’ is_redirect=True, question_url="https://example.com/quiz-834"
704
+ β†’ Reasoning: Only a URL, no task instructions
705
+
706
+ **Now analyze the content above and provide your analysis.**"""
707
+
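A sketch of how this prompt might be assembled; the content and URLs are invented, the URL regex mirrors the one used in _analyze_content_with_llm, and the assumption that url_detection_prompt ends up on AnalysisPrompts should be checked against the class it was actually added to:

import re

from app.utils.prompts import AnalysisPrompts  # assumed owner of url_detection_prompt

content = "Your quiz is available at https://example.com/quiz-834"   # invented
request_url = "https://example.com/start"                             # invented

# Same URL pattern as _analyze_content_with_llm in task_fetcher.py.
url_pattern = r'https?://[^\s<>"\']+(?:/[^\s<>"\']*)?'
urls = sorted({u.rstrip('.,;:)') for u in re.findall(url_pattern, content)})

prompt = AnalysisPrompts.url_detection_prompt(content=content, urls=urls, request_url=request_url)
print(prompt[:400])  # the LLM call itself is outside the scope of this sketch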