Commit dfe7fff · Parent: 308bf1a
task processing complete
Files changed:
- app/__pycache__/main.cpython-313.pyc (binary)
- app/api/routes/__pycache__/task.cpython-313.pyc (binary)
- app/api/routes/task.py (+148 -38)
- app/core/__pycache__/security.cpython-313.pyc (binary)
- app/core/security.py (+26 -29)
- app/models/__pycache__/response.cpython-313.pyc (binary)
- app/models/response.py (+41 -0)
- app/orchestrator/models.py (+118 -0)
- app/services/__pycache__/task_processor.cpython-313.pyc (binary)
- app/services/task_fetcher.py (+295 -753)
- app/services/task_processor.py (+290 -136)
- app/utils/prompts.py (+163 -1)
app/__pycache__/main.cpython-313.pyc CHANGED
Binary files differ.

app/api/routes/__pycache__/task.cpython-313.pyc CHANGED
Binary files differ.
app/api/routes/task.py CHANGED (+148 -38)

The endpoint previously awaited task_processor.process() inline and returned the result; it now validates, authenticates, returns HTTP 200 immediately, and defers processing to a FastAPI BackgroundTasks job. verify_authentication now comes from app.core.security (was app.api.dependencies) and raises AuthenticationError. New version:

"""
Task API Routes
Handles task submission and processing
"""

from fastapi import APIRouter, Request, status, BackgroundTasks, HTTPException
from datetime import datetime
from typing import Dict, Any

from app.models.request import TaskRequest
from app.models.response import TaskResponse, ImmediateResponse
from app.core.logging import get_logger
from app.core.security import verify_authentication, AuthenticationError
from app.core.exceptions import TaskProcessingError
from app.services.task_processor import TaskProcessor

logger = get_logger(__name__)

router = APIRouter()

# Initialize task processor (singleton)
task_processor = TaskProcessor()


@router.post(
    "/task",
    response_model=ImmediateResponse,
    status_code=status.HTTP_200_OK,
    responses={
        200: {"description": "Request accepted and processing started"},
        400: {"description": "Invalid JSON format or request data"},
        403: {"description": "Invalid secret - authentication failed"}
    }
)
async def handle_task(
    request: Request,
    background_tasks: BackgroundTasks
):
    """
    Main API endpoint for handling task requests

    Flow:
    1. Validate JSON format (HTTP 400 if invalid)
    2. Verify secret (HTTP 403 if invalid)
    3. Respond immediately with HTTP 200
    4. Process task in background

    Returns:
        Immediate HTTP 200 response with task accepted message
    """
    start_time = datetime.now()

    logger.info("📥 Task request received")

    try:
        # ================================================================
        # STEP 1: PARSE AND VALIDATE JSON (HTTP 400 if invalid)
        # ================================================================
        try:
            body = await request.json()
            task_data = TaskRequest(**body)
        except ValueError as e:
            logger.error(f"❌ Invalid JSON format: {str(e)}")
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid JSON format: {str(e)}"
            )
        except Exception as e:
            logger.error(f"❌ Request validation failed: {str(e)}")
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid request data: {str(e)}"
            )

        logger.info(f"✅ Request validated for: {task_data.email}")

        # ================================================================
        # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
        # ================================================================
        logger.info("🔐 Verifying authentication")
        try:
            verify_authentication(task_data.secret)
        except AuthenticationError as e:
            logger.error(f"❌ Authentication failed: {str(e)}")
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Invalid secret. Authentication failed."
            )

        logger.info("✅ Authentication successful")

        # ================================================================
        # STEP 3: RESPOND IMMEDIATELY WITH HTTP 200
        # ================================================================
        logger.info("✅ Request accepted - processing in background")

        # Add task processing to background
        background_tasks.add_task(
            process_task_background,
            task_data=task_data,
            start_time=start_time
        )

        # Immediate response
        response = ImmediateResponse(
            success=True,
            message="Task accepted and processing started",
            email=task_data.email,
            task_url=str(task_data.url),
            status="processing",
            timestamp=datetime.now().isoformat()
        )

        logger.info("📤 Sent immediate response to client")

        return response

    except HTTPException:
        # Re-raise HTTP exceptions (400, 403)
        raise

    except Exception as e:
        logger.error(f"❌ Unexpected error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}"
        )


async def process_task_background(
    task_data: TaskRequest,
    start_time: datetime
):
    """
    Process task in background after sending immediate response

    This runs asynchronously after the HTTP 200 response is sent.
    Results are logged but not returned to client.

    Args:
        task_data: Validated task request
        start_time: Request start time for metrics
    """
    logger.info("=" * 80)
    logger.info("🚀 BACKGROUND TASK PROCESSING STARTED")
    logger.info("=" * 80)
    logger.info(f"📍 URL: {task_data.url}")

    try:
        # Process the task
        result_data = await task_processor.process(task_data)

        # Calculate execution time
        execution_time = (datetime.now() - start_time).total_seconds()

        # Optional: Store result in database/cache for later retrieval
        # await store_result(task_data.email, result_data)

    except TaskProcessingError as e:
        logger.error("=" * 80)
        logger.error("❌ BACKGROUND TASK FAILED")
        logger.error("=" * 80)
        logger.error(f"Error: {str(e)}")
        logger.error("=" * 80)

        # Optional: Store error for later retrieval or send notification
        # await store_error(task_data.email, str(e))

    except Exception as e:
        logger.error("=" * 80)
        logger.error("❌ BACKGROUND TASK UNEXPECTED ERROR")
        logger.error("=" * 80)
        logger.error(f"Error: {str(e)}", exc_info=True)
        logger.error("=" * 80)
app/core/__pycache__/security.cpython-313.pyc CHANGED
Binary files differ.
app/core/security.py CHANGED (+26 -29)

verify_secret no longer logs the expected/provided secret lengths on failure, verify_authentication now raises a dedicated AuthenticationError instead of returning False, and the old secret-masking helper was removed. New version:

"""
Security and Authentication
"""

from app.core.config import settings
from app.core.logging import get_logger

logger = get_logger(__name__)


class AuthenticationError(Exception):
    """Raised when authentication fails"""
    pass


def verify_secret(secret: str) -> bool:
    """
    Verify if provided secret is valid

    Args:
        secret: Secret from request

    Returns:
        bool: True if valid, False otherwise
    """
    # Get expected secret from config/env
    expected_secret = settings.API_SECRET

    # Simple comparison (use constant-time comparison in production)
    is_valid = secret == expected_secret

    if not is_valid:
        logger.warning("❌ Invalid secret attempt")

    return is_valid


def verify_authentication(secret: str) -> bool:
    """
    Verify request authentication

    Args:
        secret: Secret from request

    Returns:
        bool: True if authenticated

    Raises:
        AuthenticationError: If authentication fails (HTTP 403)
    """
    if not verify_secret(secret):
        raise AuthenticationError("Invalid secret. Authentication failed.")

    return True
app/models/__pycache__/response.cpython-313.pyc CHANGED
Binary files differ.
app/models/response.py CHANGED (+41 -0)

Added ImmediateResponse above the existing TaskResponse (context: the module already imports Optional, Dict, Any, datetime, BaseModel, Field):

from datetime import datetime
from pydantic import BaseModel, Field


class ImmediateResponse(BaseModel):
    """
    Immediate response sent after validation
    HTTP 200 response before task processing
    """
    success: bool = Field(
        description="Whether request was accepted"
    )

    message: str = Field(
        description="Status message"
    )

    email: str = Field(
        description="Student email from request"
    )

    task_url: str = Field(
        description="Task URL from request"
    )

    status: str = Field(
        description="Processing status: processing, completed, failed"
    )

    timestamp: str = Field(
        description="Response timestamp (ISO format)"
    )

    class Config:
        json_schema_extra = {
            "example": {
                "success": True,
                "message": "Task accepted and processing started",
                "email": "[email protected]",
                "task_url": "https://example.com/quiz-834",
                "status": "processing",
                "timestamp": "2025-11-29T12:00:00"
            }
        }


class TaskResponse(BaseModel):
    """
app/orchestrator/models.py CHANGED (+118 -0)

Added URLDetection, InstructionStep, and UnifiedTaskAnalysis after the OutputFormat enum (context: UNKNOWN = "unknown"), ahead of the existing TaskClassification:

class URLDetection(BaseModel):
    """Result of URL detection analysis"""

    is_redirect: bool = Field(
        description="True if content redirects to another URL for the actual task"
    )

    question_url: Optional[str] = Field(
        default=None,
        description="The URL to visit for the actual task (if is_redirect is True)"
    )

    reasoning: str = Field(
        description="Detailed explanation of why this is or isn't a redirect"
    )

    url_types: Dict[str, str] = Field(
        default_factory=dict,
        description="Classification of each URL found (e.g., 'question_url', 'data_url', 'submission_url')"
    )

    confidence: str = Field(
        default="medium",
        description="Confidence level: low, medium, high"
    )


class InstructionStep(BaseModel):
    """Single instruction step"""

    step_number: int = Field(
        description="Step number in sequence (1, 2, 3...)"
    )

    action: str = Field(
        description="Primary action: scrape, extract, calculate, submit, download, transcribe, analyze, visit"
    )

    description: str = Field(
        description="Clear description of what to do in this step"
    )

    target: Optional[str] = Field(
        default=None,
        description="Target of the action (URL, field name, file, etc.)"
    )

    dependencies: List[int] = Field(
        default_factory=list,
        description="Step numbers this step depends on"
    )


class UnifiedTaskAnalysis(BaseModel):
    """
    Unified analysis for task fetching
    Combines redirect detection, submission URL extraction, and instruction parsing
    """

    # ========================================================================
    # REDIRECT DETECTION
    # ========================================================================
    is_redirect: bool = Field(
        description="True if this content redirects to another URL for the actual task"
    )

    question_url: Optional[str] = Field(
        default=None,
        description="URL to visit for the actual task (if is_redirect=True)"
    )

    redirect_reasoning: str = Field(
        default="",
        description="Why this is or isn't a redirect"
    )

    # ========================================================================
    # SUBMISSION URL EXTRACTION
    # ========================================================================
    submission_url: Optional[str] = Field(
        default=None,
        description="URL where the final answer should be POSTed"
    )

    submission_url_is_relative: bool = Field(
        default=False,
        description="True if submission URL is relative and needs base URL resolution"
    )

    submission_reasoning: str = Field(
        default="",
        description="How the submission URL was identified"
    )

    # ========================================================================
    # INSTRUCTION PARSING
    # ========================================================================
    instructions: List[InstructionStep] = Field(
        default_factory=list,
        description="Parsed step-by-step instructions (empty if redirect)"
    )

    overall_goal: str = Field(
        description="High-level summary of what needs to be accomplished"
    )

    complexity: str = Field(
        description="Task complexity: trivial, simple, moderate, complex"
    )

    # ========================================================================
    # CONFIDENCE
    # ========================================================================
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Overall confidence (0.0-1.0)"
    )


class TaskClassification(BaseModel):
    """
    Structured output for task classification
app/services/__pycache__/task_processor.cpython-313.pyc CHANGED
Binary files differ.
app/services/task_fetcher.py
CHANGED
|
@@ -1,42 +1,53 @@
|
|
| 1 |
"""
|
| 2 |
-
Task Fetcher Service -
|
| 3 |
-
Fetches and extracts task descriptions from URLs
|
| 4 |
"""
|
| 5 |
|
| 6 |
import httpx
|
| 7 |
import json
|
| 8 |
-
import base64
|
| 9 |
import re
|
| 10 |
from typing import Optional, Dict, Any, List
|
| 11 |
-
from urllib.parse import urlparse
|
| 12 |
from bs4 import BeautifulSoup
|
| 13 |
|
| 14 |
from app.core.config import settings
|
| 15 |
from app.core.logging import get_logger
|
| 16 |
from app.core.exceptions import TaskProcessingError
|
|
|
|
|
|
|
| 17 |
|
| 18 |
logger = get_logger(__name__)
|
| 19 |
|
| 20 |
|
| 21 |
class TaskFetcher:
|
| 22 |
"""
|
| 23 |
-
Enhanced service for fetching and extracting task descriptions from URLs
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
|
| 27 |
def __init__(self, timeout: int = 30):
|
| 28 |
-
"""
|
| 29 |
-
Initialize TaskFetcher
|
| 30 |
-
|
| 31 |
-
Args:
|
| 32 |
-
timeout: Request timeout in seconds
|
| 33 |
-
"""
|
| 34 |
self.timeout = timeout
|
| 35 |
-
self.client = None
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
async def __aenter__(self):
|
| 39 |
-
"""Async context manager entry"""
|
| 40 |
self.client = httpx.AsyncClient(
|
| 41 |
timeout=self.timeout,
|
| 42 |
follow_redirects=True,
|
|
@@ -48,795 +59,326 @@ class TaskFetcher:
|
|
| 48 |
return self
|
| 49 |
|
| 50 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
| 51 |
-
"""Async context manager exit"""
|
| 52 |
if self.client:
|
| 53 |
await self.client.aclose()
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
"""
|
| 57 |
-
Fetch and
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
}
|
| 72 |
-
|
| 73 |
-
Raises:
|
| 74 |
-
TaskProcessingError: If fetching or extraction fails
|
| 75 |
-
"""
|
| 76 |
-
logger.info(f"π₯ Fetching task from URL: {url}")
|
| 77 |
|
| 78 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
if not self._is_valid_url(url):
|
| 80 |
-
logger.error(f"β Invalid URL format: {url}")
|
| 81 |
raise TaskProcessingError(f"Invalid URL format: {url}")
|
| 82 |
|
| 83 |
try:
|
| 84 |
-
# Fetch content
|
| 85 |
response = await self._fetch_url(url)
|
| 86 |
-
|
| 87 |
-
# Detect content type
|
| 88 |
content_type = self._detect_content_type(response)
|
| 89 |
-
logger.info(f"π Content type detected: {content_type}")
|
| 90 |
-
|
| 91 |
-
# Extract task based on content type
|
| 92 |
-
task_info = await self._extract_task(response, content_type, url)
|
| 93 |
-
|
| 94 |
-
# Determine if LLM analysis is needed
|
| 95 |
-
task_info['needs_llm_analysis'] = self._needs_llm_analysis(task_info)
|
| 96 |
-
|
| 97 |
-
if task_info['needs_llm_analysis']:
|
| 98 |
-
logger.warning("π€ Content requires LLM analysis for complete extraction")
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
except Exception as e:
|
| 116 |
-
logger.error(f"β Unexpected error fetching task: {str(e)}", exc_info=True)
|
| 117 |
-
raise TaskProcessingError(f"Failed to fetch task from URL: {str(e)}")
|
| 118 |
-
|
| 119 |
-
def _is_valid_url(self, url: str) -> bool:
|
| 120 |
-
"""
|
| 121 |
-
Validate URL format
|
| 122 |
|
| 123 |
-
Args:
|
| 124 |
-
url: URL to validate
|
| 125 |
-
|
| 126 |
-
Returns:
|
| 127 |
-
bool: True if valid, False otherwise
|
| 128 |
-
"""
|
| 129 |
-
try:
|
| 130 |
-
result = urlparse(url)
|
| 131 |
-
is_valid = all([result.scheme in ['http', 'https'], result.netloc])
|
| 132 |
-
|
| 133 |
-
if not is_valid:
|
| 134 |
-
logger.warning(f"Invalid URL structure: {url}")
|
| 135 |
-
|
| 136 |
-
return is_valid
|
| 137 |
-
|
| 138 |
except Exception as e:
|
| 139 |
-
logger.
|
| 140 |
-
|
| 141 |
|
| 142 |
async def _fetch_url(self, url: str) -> httpx.Response:
|
| 143 |
-
"""
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
Args:
|
| 147 |
-
url: URL to fetch
|
| 148 |
-
|
| 149 |
-
Returns:
|
| 150 |
-
httpx.Response: HTTP response
|
| 151 |
-
|
| 152 |
-
Raises:
|
| 153 |
-
httpx.HTTPStatusError: If HTTP error occurs
|
| 154 |
-
httpx.TimeoutException: If request times out
|
| 155 |
-
"""
|
| 156 |
-
max_retries = settings.MAX_RETRIES
|
| 157 |
|
| 158 |
for attempt in range(max_retries):
|
| 159 |
try:
|
| 160 |
-
logger.debug(f"
|
| 161 |
-
|
| 162 |
response = await self.client.get(url)
|
| 163 |
response.raise_for_status()
|
| 164 |
-
|
| 165 |
-
logger.debug(
|
| 166 |
-
f"β Fetch successful | Status: {response.status_code} | "
|
| 167 |
-
f"Size: {len(response.content)} bytes"
|
| 168 |
-
)
|
| 169 |
-
|
| 170 |
return response
|
| 171 |
-
|
| 172 |
except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
|
|
|
|
| 173 |
if attempt == max_retries - 1:
|
| 174 |
-
# Last attempt, raise error
|
| 175 |
raise
|
| 176 |
-
|
| 177 |
-
logger.warning(
|
| 178 |
-
f"β οΈ Attempt {attempt + 1} failed: {str(e)} | Retrying..."
|
| 179 |
-
)
|
| 180 |
continue
|
| 181 |
-
|
| 182 |
-
def
|
| 183 |
-
"""
|
| 184 |
-
Detect content type from response
|
| 185 |
-
|
| 186 |
-
Args:
|
| 187 |
-
response: HTTP response
|
| 188 |
-
|
| 189 |
-
Returns:
|
| 190 |
-
str: Content type (json, html, text, pdf, csv, etc.)
|
| 191 |
-
"""
|
| 192 |
-
content_type_header = response.headers.get('content-type', '').lower()
|
| 193 |
-
|
| 194 |
-
# Check content-type header first
|
| 195 |
-
if 'application/json' in content_type_header:
|
| 196 |
-
return 'json'
|
| 197 |
-
elif 'text/html' in content_type_header:
|
| 198 |
-
return 'html'
|
| 199 |
-
elif 'application/pdf' in content_type_header:
|
| 200 |
-
return 'pdf'
|
| 201 |
-
elif 'text/csv' in content_type_header or 'application/csv' in content_type_header:
|
| 202 |
-
return 'csv'
|
| 203 |
-
elif 'audio/' in content_type_header:
|
| 204 |
-
return 'audio'
|
| 205 |
-
elif 'video/' in content_type_header:
|
| 206 |
-
return 'video'
|
| 207 |
-
elif 'image/' in content_type_header:
|
| 208 |
-
return 'image'
|
| 209 |
-
|
| 210 |
-
# Try to detect from content
|
| 211 |
-
try:
|
| 212 |
-
json.loads(response.text)
|
| 213 |
-
return 'json'
|
| 214 |
-
except:
|
| 215 |
-
if '<html' in response.text.lower()[:100]:
|
| 216 |
-
return 'html'
|
| 217 |
-
return 'text'
|
| 218 |
-
|
| 219 |
-
async def _extract_task(
|
| 220 |
-
self,
|
| 221 |
-
response: httpx.Response,
|
| 222 |
-
content_type: str,
|
| 223 |
-
url: str
|
| 224 |
-
) -> Dict[str, Any]:
|
| 225 |
-
"""
|
| 226 |
-
Extract task description based on content type
|
| 227 |
-
|
| 228 |
-
Args:
|
| 229 |
-
response: HTTP response
|
| 230 |
-
content_type: Detected content type
|
| 231 |
-
url: Original URL
|
| 232 |
-
|
| 233 |
-
Returns:
|
| 234 |
-
Dict with task information
|
| 235 |
-
"""
|
| 236 |
-
extractors = {
|
| 237 |
-
'json': self._extract_from_json,
|
| 238 |
-
'html': self._extract_from_html,
|
| 239 |
-
'text': self._extract_from_text,
|
| 240 |
-
'pdf': self._extract_from_binary,
|
| 241 |
-
'csv': self._extract_from_binary,
|
| 242 |
-
'audio': self._extract_from_binary,
|
| 243 |
-
'video': self._extract_from_binary,
|
| 244 |
-
'image': self._extract_from_binary,
|
| 245 |
-
}
|
| 246 |
-
|
| 247 |
-
extractor = extractors.get(content_type, self._extract_from_text)
|
| 248 |
-
|
| 249 |
-
logger.debug(f"Using extractor: {extractor.__name__}")
|
| 250 |
-
|
| 251 |
-
return await extractor(response, url)
|
| 252 |
-
|
| 253 |
-
async def _extract_from_json(
|
| 254 |
-
self,
|
| 255 |
-
response: httpx.Response,
|
| 256 |
-
url: str
|
| 257 |
-
) -> Dict[str, Any]:
|
| 258 |
-
"""
|
| 259 |
-
Extract task from JSON response with base64 detection
|
| 260 |
-
|
| 261 |
-
Args:
|
| 262 |
-
response: HTTP response
|
| 263 |
-
url: Original URL
|
| 264 |
-
|
| 265 |
-
Returns:
|
| 266 |
-
Dict with task information
|
| 267 |
-
"""
|
| 268 |
-
logger.debug("Extracting task from JSON")
|
| 269 |
-
|
| 270 |
-
try:
|
| 271 |
-
data = response.json()
|
| 272 |
-
|
| 273 |
-
# Try common field names for task description
|
| 274 |
-
task_fields = [
|
| 275 |
-
'task', 'task_description', 'description',
|
| 276 |
-
'question', 'prompt', 'instruction',
|
| 277 |
-
'task_text', 'content', 'message', 'text'
|
| 278 |
-
]
|
| 279 |
-
|
| 280 |
-
task_description = None
|
| 281 |
-
found_field = None
|
| 282 |
-
|
| 283 |
-
# Search for task description in JSON
|
| 284 |
-
for field in task_fields:
|
| 285 |
-
if field in data:
|
| 286 |
-
task_description = str(data[field])
|
| 287 |
-
found_field = field
|
| 288 |
-
logger.debug(f"Found task in field: {field}")
|
| 289 |
-
break
|
| 290 |
-
|
| 291 |
-
# If not found in root, try nested
|
| 292 |
-
if not task_description:
|
| 293 |
-
task_description = self._search_nested_json(data, task_fields)
|
| 294 |
-
found_field = "nested"
|
| 295 |
-
|
| 296 |
-
# Fallback: use entire JSON as string
|
| 297 |
-
if not task_description:
|
| 298 |
-
logger.warning("No task field found, using entire JSON")
|
| 299 |
-
task_description = json.dumps(data, indent=2)
|
| 300 |
-
found_field = "full_json"
|
| 301 |
-
|
| 302 |
-
# Check for base64 encoding
|
| 303 |
-
original_description = task_description
|
| 304 |
-
task_description = self._detect_and_decode_base64(task_description)
|
| 305 |
-
was_base64_decoded = (original_description != task_description)
|
| 306 |
-
|
| 307 |
-
return {
|
| 308 |
-
'task_description': task_description.strip(),
|
| 309 |
-
'raw_content': response.text,
|
| 310 |
-
'content_type': 'json',
|
| 311 |
-
'url': url,
|
| 312 |
-
'metadata': {
|
| 313 |
-
'json_structure': list(data.keys()) if isinstance(data, dict) else [],
|
| 314 |
-
'data_type': type(data).__name__,
|
| 315 |
-
'found_in_field': found_field,
|
| 316 |
-
'was_base64_decoded': was_base64_decoded,
|
| 317 |
-
'special_elements': {} # No special elements in JSON
|
| 318 |
-
}
|
| 319 |
-
}
|
| 320 |
-
|
| 321 |
-
except json.JSONDecodeError as e:
|
| 322 |
-
logger.error(f"Failed to parse JSON: {str(e)}")
|
| 323 |
-
# Fallback to text extraction
|
| 324 |
-
return await self._extract_from_text(response, url)
|
| 325 |
-
|
| 326 |
-
def _search_nested_json(self, data: Any, fields: List[str], max_depth: int = 3) -> Optional[str]:
|
| 327 |
"""
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
data: JSON data to search
|
| 332 |
-
fields: Field names to look for
|
| 333 |
-
max_depth: Maximum recursion depth
|
| 334 |
-
|
| 335 |
-
Returns:
|
| 336 |
-
Task description if found, None otherwise
|
| 337 |
"""
|
| 338 |
-
if
|
| 339 |
-
return
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
# Search nested dicts
|
| 347 |
-
for value in data.values():
|
| 348 |
-
result = self._search_nested_json(value, fields, max_depth - 1)
|
| 349 |
-
if result:
|
| 350 |
-
return result
|
| 351 |
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
|
|
|
| 355 |
|
| 356 |
-
return
|
| 357 |
-
|
| 358 |
-
async def
|
| 359 |
-
self,
|
| 360 |
-
response: httpx.Response,
|
| 361 |
-
url: str
|
| 362 |
-
) -> Dict[str, Any]:
|
| 363 |
"""
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
Args:
|
| 367 |
-
response: HTTP response
|
| 368 |
-
url: Original URL
|
| 369 |
-
|
| 370 |
-
Returns:
|
| 371 |
-
Dict with task information
|
| 372 |
"""
|
| 373 |
-
|
| 374 |
|
|
|
|
|
|
|
| 375 |
try:
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
has_special = any(special_elements.values())
|
| 381 |
-
|
| 382 |
-
if has_special:
|
| 383 |
-
detected_types = [k for k, v in special_elements.items() if v]
|
| 384 |
-
logger.info(f"π Detected special elements: {', '.join(detected_types)}")
|
| 385 |
-
|
| 386 |
-
# Strategy 1: Look for common task containers
|
| 387 |
-
task_selectors = [
|
| 388 |
-
{'id': 'task'},
|
| 389 |
-
{'id': 'question'},
|
| 390 |
-
{'id': 'instruction'},
|
| 391 |
-
{'id': 'quiz'},
|
| 392 |
-
{'class_': 'task'},
|
| 393 |
-
{'class_': 'question'},
|
| 394 |
-
{'class_': 'instruction'},
|
| 395 |
-
{'class_': 'task-description'},
|
| 396 |
-
{'class_': 'quiz-question'},
|
| 397 |
-
{'data-task': True}
|
| 398 |
-
]
|
| 399 |
-
|
| 400 |
-
task_description = None
|
| 401 |
-
extraction_method = None
|
| 402 |
-
|
| 403 |
-
for selector in task_selectors:
|
| 404 |
-
element = soup.find(**selector)
|
| 405 |
-
if element:
|
| 406 |
-
task_description = element.get_text(strip=True, separator=' ')
|
| 407 |
-
extraction_method = f"selector_{list(selector.keys())[0]}"
|
| 408 |
-
logger.debug(f"Found task using selector: {selector}")
|
| 409 |
-
break
|
| 410 |
-
|
| 411 |
-
# Strategy 2: Look for main content area
|
| 412 |
-
if not task_description:
|
| 413 |
-
main_content = (
|
| 414 |
-
soup.find('main') or
|
| 415 |
-
soup.find('article') or
|
| 416 |
-
soup.find('div', class_='content') or
|
| 417 |
-
soup.find('div', id='content') or
|
| 418 |
-
soup.find('section', class_='main')
|
| 419 |
-
)
|
| 420 |
-
|
| 421 |
-
if main_content:
|
| 422 |
-
task_description = main_content.get_text(strip=True, separator=' ')
|
| 423 |
-
extraction_method = "main_content"
|
| 424 |
-
logger.debug("Found task in main content area")
|
| 425 |
-
|
| 426 |
-
# Strategy 3: Look for pre/code/textarea blocks (often contain base64 or instructions)
|
| 427 |
-
if not task_description:
|
| 428 |
-
code_blocks = soup.find_all(['pre', 'code', 'textarea'])
|
| 429 |
-
if code_blocks:
|
| 430 |
-
task_description = '\n'.join(
|
| 431 |
-
block.get_text(strip=True) for block in code_blocks
|
| 432 |
-
)
|
| 433 |
-
extraction_method = "code_blocks"
|
| 434 |
-
logger.debug(f"Found task in {len(code_blocks)} code blocks")
|
| 435 |
-
|
| 436 |
-
# Strategy 4: Use body text (last resort)
|
| 437 |
-
if not task_description:
|
| 438 |
-
# Remove script and style tags
|
| 439 |
-
for script in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
|
| 440 |
-
script.decompose()
|
| 441 |
-
|
| 442 |
-
task_description = soup.get_text(strip=True, separator=' ')
|
| 443 |
-
extraction_method = "body_fallback"
|
| 444 |
-
logger.warning("Using body text as task description")
|
| 445 |
|
| 446 |
-
#
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
|
|
|
|
|
|
| 450 |
|
| 451 |
-
|
| 452 |
-
title = soup.find('title')
|
| 453 |
-
title_text = title.get_text(strip=True) if title else ''
|
| 454 |
|
| 455 |
-
|
| 456 |
-
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
| 457 |
-
meta_description = meta_desc.get('content', '') if meta_desc else ''
|
| 458 |
|
|
|
|
| 459 |
return {
|
| 460 |
-
'task_description':
|
| 461 |
-
'raw_content':
|
| 462 |
-
'content_type': 'html',
|
| 463 |
-
'url': url,
|
| 464 |
-
'metadata': {
|
| 465 |
-
'title': title_text,
|
| 466 |
-
'meta_description': meta_description,
|
| 467 |
-
'extraction_method': extraction_method,
|
| 468 |
-
'has_forms': bool(soup.find('form')),
|
| 469 |
-
'has_tables': bool(soup.find('table')),
|
| 470 |
-
'was_base64_decoded': was_base64_decoded,
|
| 471 |
-
'special_elements': special_elements,
|
| 472 |
-
'page_size_kb': len(response.content) / 1024
|
| 473 |
-
}
|
| 474 |
-
}
|
| 475 |
-
|
| 476 |
-
except Exception as e:
|
| 477 |
-
logger.error(f"Failed to parse HTML: {str(e)}", exc_info=True)
|
| 478 |
-
# Fallback to text extraction
|
| 479 |
-
return await self._extract_from_text(response, url)
|
| 480 |
-
|
| 481 |
-
def _detect_special_elements(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
|
| 482 |
-
"""
|
| 483 |
-
Detect special elements that might contain or lead to the task
|
| 484 |
-
|
| 485 |
-
Args:
|
| 486 |
-
soup: BeautifulSoup parsed HTML
|
| 487 |
-
base_url: Base URL for resolving relative URLs
|
| 488 |
-
|
| 489 |
-
Returns:
|
| 490 |
-
Dict of detected elements with absolute URLs
|
| 491 |
-
"""
|
| 492 |
-
elements = {
|
| 493 |
-
'audio_urls': [],
|
| 494 |
-
'video_urls': [],
|
| 495 |
-
'image_urls': [],
|
| 496 |
-
'download_links': [],
|
| 497 |
-
'iframe_sources': [],
|
| 498 |
-
'external_links': [],
|
| 499 |
-
'form_actions': [],
|
| 500 |
-
'javascript_files': []
|
| 501 |
-
}
|
| 502 |
-
|
| 503 |
-
# Audio elements
|
| 504 |
-
for audio in soup.find_all(['audio', 'source']):
|
| 505 |
-
src = audio.get('src')
|
| 506 |
-
if src:
|
| 507 |
-
# Check for audio extensions or content type
|
| 508 |
-
is_audio = any(ext in src.lower() for ext in ['.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac'])
|
| 509 |
-
audio_type = audio.get('type', '')
|
| 510 |
-
if is_audio or 'audio/' in audio_type:
|
| 511 |
-
absolute_url = urljoin(base_url, src)
|
| 512 |
-
elements['audio_urls'].append(absolute_url)
|
| 513 |
-
logger.debug(f"Found audio: {absolute_url}")
|
| 514 |
-
|
| 515 |
-
# Video elements
|
| 516 |
-
for video in soup.find_all(['video', 'source']):
|
| 517 |
-
src = video.get('src')
|
| 518 |
-
if src:
|
| 519 |
-
is_video = any(ext in src.lower() for ext in ['.mp4', '.webm', '.avi', '.mov', '.mkv'])
|
| 520 |
-
video_type = video.get('type', '')
|
| 521 |
-
if is_video or 'video/' in video_type:
|
| 522 |
-
absolute_url = urljoin(base_url, src)
|
| 523 |
-
elements['video_urls'].append(absolute_url)
|
| 524 |
-
logger.debug(f"Found video: {absolute_url}")
|
| 525 |
-
|
| 526 |
-
# YouTube/Vimeo iframes
|
| 527 |
-
for iframe in soup.find_all('iframe'):
|
| 528 |
-
src = iframe.get('src', '')
|
| 529 |
-
if 'youtube.com' in src or 'vimeo.com' in src:
|
| 530 |
-
elements['video_urls'].append(src)
|
| 531 |
-
logger.debug(f"Found video iframe: {src}")
|
| 532 |
-
elif src:
|
| 533 |
-
absolute_url = urljoin(base_url, src)
|
| 534 |
-
elements['iframe_sources'].append(absolute_url)
|
| 535 |
-
logger.debug(f"Found iframe: {absolute_url}")
|
| 536 |
-
|
| 537 |
-
# Image elements (might contain QR codes, screenshots with tasks, etc.)
|
| 538 |
-
for img in soup.find_all('img'):
|
| 539 |
-
src = img.get('src')
|
| 540 |
-
if src:
|
| 541 |
-
absolute_url = urljoin(base_url, src)
|
| 542 |
-
# Only include if it looks like it might contain data (not decorative)
|
| 543 |
-
alt_text = img.get('alt', '').lower()
|
| 544 |
-
if any(keyword in alt_text for keyword in ['task', 'question', 'instruction', 'qr', 'code']):
|
| 545 |
-
elements['image_urls'].append(absolute_url)
|
| 546 |
-
logger.debug(f"Found relevant image: {absolute_url}")
|
| 547 |
-
|
| 548 |
-
# Download links
|
| 549 |
-
for link in soup.find_all('a', href=True):
|
| 550 |
-
href = link.get('href', '')
|
| 551 |
-
link_text = link.get_text().strip().lower()
|
| 552 |
-
|
| 553 |
-
# Check for downloadable files
|
| 554 |
-
download_extensions = [
|
| 555 |
-
'.pdf', '.csv', '.xlsx', '.xls', '.zip', '.txt',
|
| 556 |
-
'.json', '.xml', '.doc', '.docx', '.tsv'
|
| 557 |
-
]
|
| 558 |
-
|
| 559 |
-
is_download = any(ext in href.lower() for ext in download_extensions)
|
| 560 |
-
|
| 561 |
-
# Check for text indicating download or external task
|
| 562 |
-
download_keywords = ['download', 'get task', 'click here', 'task file', 'see task']
|
| 563 |
-
has_download_text = any(keyword in link_text for keyword in download_keywords)
|
| 564 |
-
|
| 565 |
-
if is_download:
|
| 566 |
-
absolute_url = urljoin(base_url, href)
|
| 567 |
-
elements['download_links'].append(absolute_url)
|
| 568 |
-
logger.debug(f"Found download link: {absolute_url}")
|
| 569 |
-
elif has_download_text:
|
| 570 |
-
absolute_url = urljoin(base_url, href)
|
| 571 |
-
elements['external_links'].append(absolute_url)
|
| 572 |
-
logger.debug(f"Found external link: {absolute_url}")
|
| 573 |
-
|
| 574 |
-
# Forms (might need to submit to get task)
|
| 575 |
-
for form in soup.find_all('form'):
|
| 576 |
-
action = form.get('action')
|
| 577 |
-
if action:
|
| 578 |
-
absolute_url = urljoin(base_url, action)
|
| 579 |
-
elements['form_actions'].append(absolute_url)
|
| 580 |
-
logger.debug(f"Found form action: {absolute_url}")
|
| 581 |
-
|
| 582 |
-
# JavaScript files (might load task dynamically)
|
| 583 |
-
for script in soup.find_all('script', src=True):
|
| 584 |
-
src = script.get('src')
|
| 585 |
-
if src and not any(cdn in src for cdn in ['google', 'cdn', 'jquery']):
|
| 586 |
-
absolute_url = urljoin(base_url, src)
|
| 587 |
-
elements['javascript_files'].append(absolute_url)
|
| 588 |
-
|
| 589 |
-
# Remove duplicates
|
| 590 |
-
for key in elements:
|
| 591 |
-
elements[key] = list(set(elements[key]))
|
| 592 |
-
|
| 593 |
-
return elements
|
| 594 |
-
|
| 595 |
-
async def _extract_from_text(
|
| 596 |
-
self,
|
| 597 |
-
response: httpx.Response,
|
| 598 |
-
url: str
|
| 599 |
-
) -> Dict[str, Any]:
|
| 600 |
-
"""
|
| 601 |
-
Extract task from plain text response with base64 detection
|
| 602 |
-
|
| 603 |
-
Args:
|
| 604 |
-
response: HTTP response
|
| 605 |
-
url: Original URL
|
| 606 |
-
|
| 607 |
-
Returns:
|
| 608 |
-
Dict with task information
|
| 609 |
-
"""
|
| 610 |
-
logger.debug("Extracting task from plain text")
|
| 611 |
-
|
| 612 |
-
text = response.text.strip()
|
| 613 |
-
|
| 614 |
-
# Check for base64 encoding
|
| 615 |
-
original_text = text
|
| 616 |
-
text = self._detect_and_decode_base64(text)
|
| 617 |
-
was_base64_decoded = (original_text != text)
|
| 618 |
-
|
| 619 |
-
return {
|
| 620 |
-
'task_description': text,
|
| 621 |
-
'raw_content': response.text,
|
| 622 |
-
'content_type': 'text',
|
| 623 |
-
'url': url,
|
| 624 |
-
'metadata': {
|
| 625 |
-
'length': len(text),
|
| 626 |
-
'lines': text.count('\n') + 1,
|
| 627 |
-
'was_base64_decoded': was_base64_decoded,
|
| 628 |
-
'special_elements': {}
|
| 629 |
}
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
logger.warning(f"β οΈ Binary content detected: {content_type}")
|
| 651 |
-
|
| 652 |
-
task_description = f"Binary content detected. Type: {content_type}. URL: {url}. "
|
| 653 |
-
|
| 654 |
-
if 'pdf' in content_type:
|
| 655 |
-
task_description += "This is a PDF file that needs to be downloaded and parsed."
|
| 656 |
-
elif 'audio' in content_type:
|
| 657 |
-
task_description += "This is an audio file that needs to be transcribed."
|
| 658 |
-
elif 'video' in content_type:
|
| 659 |
-
task_description += "This is a video file that might need processing or transcription."
|
| 660 |
-
elif 'image' in content_type:
|
| 661 |
-
task_description += "This is an image that might need OCR or vision analysis."
|
| 662 |
-
elif 'csv' in content_type:
|
| 663 |
-
task_description += "This is a CSV file that needs to be downloaded and parsed."
|
| 664 |
-
else:
|
| 665 |
-
task_description += "This needs to be downloaded and processed."
|
| 666 |
-
|
| 667 |
-
return {
|
| 668 |
-
'task_description': task_description,
|
| 669 |
-
'raw_content': '', # Don't include binary in raw_content
|
| 670 |
-
'content_type': content_type,
|
| 671 |
-
'url': url,
|
| 672 |
-
'metadata': {
|
| 673 |
-
'is_binary': True,
|
| 674 |
-
'content_length': len(response.content),
|
| 675 |
-
'requires_download': True,
|
| 676 |
-
'special_elements': {
|
| 677 |
-
'download_links': [url]
|
| 678 |
-
}
|
| 679 |
-
}
|
| 680 |
-
}
|
| 681 |
-
|
| 682 |
-
def _detect_and_decode_base64(self, text: str) -> str:
|
| 683 |
-
"""
|
| 684 |
-
Detect and decode base64 content in text
|
| 685 |
-
|
| 686 |
-
Args:
|
| 687 |
-
text: Text that might contain base64
|
| 688 |
-
|
| 689 |
-
Returns:
|
| 690 |
-
Decoded text if base64 found, original text otherwise
|
| 691 |
-
"""
|
| 692 |
-
# Pattern to detect base64 strings (at least 20 chars, typical base64 chars)
|
| 693 |
-
# Must be fairly long to avoid false positives
|
| 694 |
-
base64_pattern = r'([A-Za-z0-9+/]{40,}={0,2})'
|
| 695 |
-
|
| 696 |
-
matches = re.findall(base64_pattern, text)
|
| 697 |
-
|
| 698 |
-
if not matches:
|
| 699 |
-
return text
|
| 700 |
-
|
| 701 |
-
logger.debug(f"Found {len(matches)} potential base64 strings")
|
| 702 |
-
|
| 703 |
-
decoded_parts = []
|
| 704 |
-
|
| 705 |
-
for match in matches:
|
| 706 |
try:
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
logger.info(
|
| 714 |
-
f"β Decoded base64 string "
|
| 715 |
-
f"(length: {len(match)} β {len(decoded_text)} chars)"
|
| 716 |
-
)
|
| 717 |
-
decoded_parts.append(decoded_text)
|
| 718 |
-
else:
|
| 719 |
-
logger.debug("Base64 decoded to binary/unreadable data")
|
| 720 |
-
|
| 721 |
except Exception as e:
|
| 722 |
-
logger.
|
| 723 |
-
|
| 724 |
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
return
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
min_printable_ratio: Minimum ratio of printable characters
|
| 741 |
-
|
| 742 |
-
Returns:
|
| 743 |
-
bool: True if text appears readable
|
| 744 |
-
"""
|
| 745 |
-
if not text or len(text) < 10:
|
| 746 |
return False
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
)
|
| 757 |
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
def _needs_llm_analysis(self, task_info: Dict[str, Any]) -> bool:
|
| 761 |
-
"""
|
| 762 |
-
Determine if fetched content needs LLM analysis to extract actual task
|
| 763 |
-
|
| 764 |
-
Args:
|
| 765 |
-
task_info: Fetched task information
|
| 766 |
-
|
| 767 |
-
Returns:
|
| 768 |
-
bool: True if LLM analysis needed
|
| 769 |
-
"""
|
| 770 |
-
metadata = task_info.get('metadata', {})
|
| 771 |
-
task_desc = task_info.get('task_description', '').lower()
|
| 772 |
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
# Check 2: Special elements present
|
| 779 |
-
special_elements = metadata.get('special_elements', {})
|
| 780 |
-
has_audio = bool(special_elements.get('audio_urls'))
|
| 781 |
-
has_video = bool(special_elements.get('video_urls'))
|
| 782 |
-
has_downloads = bool(special_elements.get('download_links'))
|
| 783 |
-
has_iframes = bool(special_elements.get('iframe_sources'))
|
| 784 |
-
has_images = bool(special_elements.get('image_urls'))
|
| 785 |
-
has_forms = bool(special_elements.get('form_actions'))
|
| 786 |
-
|
| 787 |
-
if any([has_audio, has_video, has_downloads, has_iframes, has_images, has_forms]):
|
| 788 |
-
logger.info(
|
| 789 |
-
f"π€ Special elements detected - LLM analysis recommended "
|
| 790 |
-
f"(audio:{has_audio}, video:{has_video}, downloads:{has_downloads}, "
|
| 791 |
-
f"iframes:{has_iframes}, images:{has_images}, forms:{has_forms})"
|
| 792 |
)
|
| 793 |
-
return
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
logger.info("π€ Very short content - LLM analysis recommended")
|
| 798 |
-
return True
|
| 799 |
-
|
| 800 |
-
# Check 4: Indirect language suggesting further action needed
|
| 801 |
-
indirect_keywords = [
|
| 802 |
-
'click here', 'download', 'visit', 'listen to',
|
| 803 |
-
'watch', 'see attached', 'refer to', 'check the',
|
| 804 |
-
'navigate to', 'go to', 'follow the link',
|
| 805 |
-
'open the file', 'play the', 'view the'
|
| 806 |
-
]
|
| 807 |
-
|
| 808 |
-
has_indirect_language = any(keyword in task_desc for keyword in indirect_keywords)
|
| 809 |
-
|
| 810 |
-
if has_indirect_language:
|
| 811 |
-
logger.info("π€ Indirect language detected - LLM analysis recommended")
|
| 812 |
-
return True
|
| 813 |
-
|
| 814 |
-
# Check 5: Multiple URLs in content (might need to visit them)
|
| 815 |
-
url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
|
| 816 |
-
urls_in_content = re.findall(url_pattern, task_desc)
|
| 817 |
-
|
| 818 |
-
if len(urls_in_content) > 1:
|
| 819 |
-
logger.info(f"π€ Multiple URLs found ({len(urls_in_content)}) - LLM analysis recommended")
|
| 820 |
-
return True
|
| 821 |
-
|
| 822 |
-
# Content seems straightforward
|
| 823 |
-
logger.info("β Content appears straightforward - LLM analysis not required")
|
| 824 |
-
return False
|
| 825 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 826 |
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
Raises:
|
| 839 |
-
TaskProcessingError: If fetching fails
|
| 840 |
-
"""
|
| 841 |
-
async with TaskFetcher() as fetcher:
|
| 842 |
-
return await fetcher.fetch_task(url)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Task Fetcher Service - with Static/Dynamic Scraper fallback
|
| 3 |
+
Fetches and extracts task descriptions from URLs
|
| 4 |
"""
|
| 5 |
|
| 6 |
import httpx
|
| 7 |
import json
|
|
|
|
| 8 |
import re
|
| 9 |
from typing import Optional, Dict, Any, List
|
| 10 |
+
from urllib.parse import urlparse
|
| 11 |
from bs4 import BeautifulSoup
|
| 12 |
|
| 13 |
from app.core.config import settings
|
| 14 |
from app.core.logging import get_logger
|
| 15 |
from app.core.exceptions import TaskProcessingError
|
| 16 |
+
from app.utils.llm_client import get_llm_client
|
| 17 |
+
from app.utils.prompts import AnalysisPrompts
|
| 18 |
|
| 19 |
logger = get_logger(__name__)
|
| 20 |
|
| 21 |
|
| 22 |
class TaskFetcher:
|
| 23 |
"""
|
| 24 |
+
Enhanced service for fetching and extracting task descriptions from URLs.
|
| 25 |
+
Strategy:
|
| 26 |
+
1. httpx (fast)
|
| 27 |
+
2. If content looks JS-only/empty β DynamicScraper
|
| 28 |
+
3. Let orchestrator use Static/Dynamic scrapers later for real data pages
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, timeout: int = 30):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
self.timeout = timeout
|
| 33 |
+
self.client: Optional[httpx.AsyncClient] = None
|
| 34 |
+
self.llm_client = get_llm_client()
|
| 35 |
+
|
| 36 |
+
# Import here to avoid circular imports
|
| 37 |
+
from app.orchestrator.models import UnifiedTaskAnalysis
|
| 38 |
+
|
| 39 |
+
self._content_analyzer_agent = self.llm_client.create_agent(
|
| 40 |
+
output_type=UnifiedTaskAnalysis,
|
| 41 |
+
system_prompt=(
|
| 42 |
+
"You are an expert at analyzing task content. "
|
| 43 |
+
"You detect redirects, extract submission URLs, and parse instructions."
|
| 44 |
+
),
|
| 45 |
+
retries=2
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
logger.debug("TaskFetcher initialized with unified LLM analysis")
|
| 49 |
|
| 50 |
async def __aenter__(self):
|
|
|
|
| 51 |
self.client = httpx.AsyncClient(
|
| 52 |
timeout=self.timeout,
|
| 53 |
follow_redirects=True,
|
|
|
|
| 59 |
return self
|
| 60 |
|
| 61 |
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
|
|
| 62 |
if self.client:
|
| 63 |
await self.client.aclose()
|
| 64 |
+
|
| 65 |
+
# ======================================================================
|
| 66 |
+
# PUBLIC ENTRY POINT
|
| 67 |
+
# ======================================================================
|
| 68 |
+
|
| 69 |
+
async def fetch_and_analyze(self, url: str, base_url: Optional[str] = None) -> Dict[str, Any]:
|
| 70 |
"""
|
| 71 |
+
Fetch URL and perform unified LLM analysis.
|
| 72 |
+
1. httpx + basic extraction
|
| 73 |
+
2. If JS-only / empty β DynamicScraper
|
| 74 |
+
3. LLM: redirect + submission_url + instructions
|
| 75 |
+
"""
|
| 76 |
+
logger.info(f"π₯ Fetching and analyzing URL: {url}")
|
| 77 |
+
if base_url is None:
|
| 78 |
+
base_url = url
|
| 79 |
|
| 80 |
+
# Step 1: Fetch visible content (with fallback)
|
| 81 |
+
content = await self._fetch_content(url)
|
| 82 |
+
|
| 83 |
+
logger.debug(f"Task description length after fetch: {len(content['task_description'])}")
|
| 84 |
+
|
| 85 |
+
# Step 2: Unified LLM analysis
|
| 86 |
+
analysis = await self._analyze_content_with_llm(
|
| 87 |
+
task_description=content['task_description'],
|
| 88 |
+
raw_content=content['raw_content'],
|
| 89 |
+
url=url,
|
| 90 |
+
base_url=base_url
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
# Merge content + analysis
|
| 94 |
+
result = {
|
| 95 |
+
**content,
|
| 96 |
+
'is_redirect': analysis.is_redirect,
|
| 97 |
+
'question_url': analysis.question_url,
|
| 98 |
+
'submission_url': analysis.submission_url,
|
| 99 |
+
'instructions': self._format_instructions(analysis.instructions),
|
| 100 |
+
'overall_goal': analysis.overall_goal,
|
| 101 |
+
'complexity': analysis.complexity,
|
| 102 |
+
'llm_analysis': {
|
| 103 |
+
'redirect_reasoning': analysis.redirect_reasoning,
|
| 104 |
+
'submission_reasoning': analysis.submission_reasoning,
|
| 105 |
+
'confidence': analysis.confidence,
|
| 106 |
}
|
| 107 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
+
# Resolve relative submission URL if needed
|
| 110 |
+
if analysis.submission_url and analysis.submission_url_is_relative:
|
| 111 |
+
absolute = str(httpx.URL(base_url).join(analysis.submission_url))
|
| 112 |
+
logger.info(f"β Resolved relative submission URL: {analysis.submission_url} β {absolute}")
|
| 113 |
+
result['submission_url'] = absolute
|
| 114 |
+
|
| 115 |
+
# Resolve relative question URL if needed
|
| 116 |
+
if analysis.question_url and analysis.question_url.startswith('/'):
|
| 117 |
+
absolute_q = str(httpx.URL(base_url).join(analysis.question_url))
|
| 118 |
+
logger.info(f"β Resolved relative question URL: {analysis.question_url} β {absolute_q}")
|
| 119 |
+
result['question_url'] = absolute_q
|
| 120 |
+
|
| 121 |
+
logger.info("β
Analysis complete:")
|
| 122 |
+
logger.info(f" Is Redirect: {result['is_redirect']}")
|
| 123 |
+
logger.info(f" Submission URL: {result['submission_url']}")
|
| 124 |
+
logger.info(f" Instructions: {len(result['instructions'])} steps")
|
| 125 |
+
logger.info(f" Complexity: {result['complexity']}")
|
| 126 |
+
|
| 127 |
+
return result
|
| 128 |
+
|
| 129 |
+
# ======================================================================
|
| 130 |
+
# FETCHING WITH FALLBACK TO DYNAMIC SCRAPER
|
| 131 |
+
# ======================================================================
|
| 132 |
+
|
| 133 |
+
async def _fetch_content(self, url: str) -> Dict[str, Any]:
|
| 134 |
+
"""
|
| 135 |
+
Fetch content from URL.
|
| 136 |
+
- Try httpx first
|
| 137 |
+
- If JS-only/empty β fallback to DynamicScraper
|
| 138 |
+
"""
|
| 139 |
if not self._is_valid_url(url):
|
|
|
|
| 140 |
raise TaskProcessingError(f"Invalid URL format: {url}")
|
| 141 |
|
| 142 |
try:
|
|
|
|
| 143 |
response = await self._fetch_url(url)
|
|
|
|
|
|
|
| 144 |
content_type = self._detect_content_type(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
+
# Basic extraction
|
| 147 |
+
task_description = await self._extract_basic_content(response, content_type)
|
| 148 |
+
raw_content = response.text[:5000]
|
| 149 |
|
| 150 |
+
# Heuristic: if nothing useful, try dynamic scraper
|
| 151 |
+
if self._looks_js_only(task_description, raw_content):
|
| 152 |
+
logger.warning("β οΈ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
|
| 153 |
+
dyn = await self._fetch_with_dynamic_scraper(url)
|
| 154 |
+
task_description = dyn['task_description']
|
| 155 |
+
raw_content = dyn['raw_content']
|
| 156 |
|
| 157 |
+
return {
|
| 158 |
+
'task_description': task_description,
|
| 159 |
+
'raw_content': raw_content,
|
| 160 |
+
'content_type': content_type,
|
| 161 |
+
'url': url,
|
| 162 |
+
'metadata': {
|
| 163 |
+
'content_length': len(response.content),
|
| 164 |
+
'status_code': response.status_code,
|
| 165 |
+
}
|
| 166 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
except Exception as e:
|
| 169 |
+
logger.error(f"β Failed to fetch content: {e}", exc_info=True)
|
| 170 |
+
raise TaskProcessingError(f"Failed to fetch URL: {str(e)}")
|
| 171 |
|
     async def _fetch_url(self, url: str) -> httpx.Response:
+        """Fetch with httpx and retries."""
+        max_retries = getattr(settings, "MAX_RETRIES", 3)

         for attempt in range(max_retries):
             try:
+                logger.debug(f"HTTPX fetch attempt {attempt + 1}/{max_retries} for {url}")
                 response = await self.client.get(url)
                 response.raise_for_status()
                 return response
             except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
+                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                 if attempt == max_retries - 1:
                     raise
                 continue
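
The retry loop above retries immediately on timeout or HTTP error. If rate limiting turns out to matter, an exponential backoff is a natural variant of the same loop body; a sketch, not part of this commit:

import asyncio

for attempt in range(max_retries):
    try:
        response = await self.client.get(url)
        response.raise_for_status()
        return response
    except (httpx.TimeoutException, httpx.HTTPStatusError):
        if attempt == max_retries - 1:
            raise
        await asyncio.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... between attempts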
+    def _looks_js_only(self, task_description: str, html: str) -> bool:
         """
+        Detect JS-only / empty pages that need dynamic rendering.
+        - Empty or tiny text
+        - Has <script> that uses atob/innerHTML/URLSearchParams
         """
+        if task_description and len(task_description.strip()) > 50:
+            return False

+        # Strong JS signals
+        js_markers = ['atob(', 'innerHTML', 'URLSearchParams', 'document.querySelector']
+        if any(marker in html for marker in js_markers):
+            return True

+        # Very little visible text after stripping scripts
+        cleaned = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
+        if len(cleaned.strip()) < 100:
+            return True

+        return False
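
To see what the heuristic catches, consider a page whose body is built entirely client-side (a contrived example): the no-JS extraction yields no text, and the `atob(`/`innerHTML` markers then force the DynamicScraper fallback.

html = ('<html><body><div id="q"></div>'
        '<script>document.querySelector("#q").innerHTML = atob("U2NyYXBlIC9kYXRh");</script>'
        '</body></html>')
# _extract_basic_content(...) returns almost nothing for this page,
# so _looks_js_only('', html) -> True and the dynamic path is taken.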
+    async def _fetch_with_dynamic_scraper(self, url: str) -> Dict[str, Any]:
         """
+        Use DynamicScraper to render the page and extract visible text
+        for instruction pages.
         """
+        from app.modules.scrapers.dynamic_scraper import DynamicScraper

+        scraper = DynamicScraper(use_pool=True)
+        await scraper.initialize()
         try:
+            # Auto-extract text blocks
+            result = await scraper.scrape_url(url)
+            if not result.success:
+                raise RuntimeError(result.error or "Dynamic scraping failed")

+            # DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
+            texts: List[str] = []
+            if isinstance(result.data, list):
+                for row in result.data:
+                    if isinstance(row, dict) and 'text' in row:
+                        texts.append(str(row['text']))

+            task_text = "\n".join(texts) if texts else ""

+            logger.info(f"β Got {len(texts)} text blocks via DynamicScraper")

+            # Best-effort raw_content: you could extend DynamicScraper to return page.content()
             return {
+                'task_description': task_text,
+                'raw_content': task_text[:5000],  # at least something readable
             }
+        finally:
+            await scraper.cleanup()
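
The `raw_content` comment above points at a concrete extension: assuming DynamicScraper drives a Playwright page (an assumption; the scraper's internals are outside this diff), the rendered HTML could be captured alongside the text blocks:

# Hypothetical addition inside DynamicScraper, after navigation settles:
rendered_html = await page.content()   # Playwright: serialized post-JS DOM
# ...expose it on the result object, then here:
# 'raw_content': (getattr(result, 'raw_html', '') or task_text)[:5000]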
+
+    # ======================================================================
+    # BASIC EXTRACTION (NO LLM)
+    # ======================================================================
+
+    async def _extract_basic_content(self, response: httpx.Response, content_type: str) -> str:
+        """Fast, no-JS extraction for instruction pages."""
+        if content_type == 'json':
+            try:
+                data = response.json()
+                for field in ['task', 'description', 'question', 'content', 'text']:
+                    if isinstance(data, dict) and field in data:
+                        return str(data[field])
+                return json.dumps(data)
+            except Exception:
+                return response.text
+
+        if content_type == 'html':
             try:
+                html = response.text
+                soup = BeautifulSoup(html, 'html.parser')
+                for script in soup(['script', 'style', 'nav', 'header', 'footer']):
+                    script.decompose()
+                text = soup.get_text(strip=True, separator=' ')
+                return text
             except Exception as e:
+                logger.error(f"HTML basic extraction failed: {e}")
+                return response.text

+        return response.text
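
For reference, the HTML branch reduces a page to its visible text. The same bs4 pattern on a toy page:

from bs4 import BeautifulSoup

html = "<html><nav>menu</nav><body><p>Scrape /data and submit the code.</p><script>x()</script></body></html>"
soup = BeautifulSoup(html, 'html.parser')
for tag in soup(['script', 'style', 'nav', 'header', 'footer']):
    tag.decompose()                                  # drop page chrome and code
print(soup.get_text(strip=True, separator=' '))      # -> Scrape /data and submit the code.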
+    def _detect_content_type(self, response: httpx.Response) -> str:
+        ct = response.headers.get('content-type', '').lower()
+        if 'application/json' in ct:
+            return 'json'
+        if 'text/html' in ct or '<html' in response.text.lower()[:200]:
+            return 'html'
+        return 'text'
+
+    def _is_valid_url(self, url: str) -> bool:
+        try:
+            r = urlparse(url)
+            return r.scheme in ('http', 'https') and bool(r.netloc)
+        except Exception:
             return False
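
`urlparse` does not raise on ordinary malformed strings, so the scheme/netloc check does the real filtering. A few cases for illustration:

from urllib.parse import urlparse

urlparse("https://example.com/q")   # scheme='https', netloc='example.com' -> accepted
urlparse("/submit")                 # scheme='', netloc=''                 -> rejected (relative path)
urlparse("ftp://host/file")         # scheme='ftp'                         -> rejected (not http/https)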
+
+    # ======================================================================
+    # LLM ANALYSIS
+    # ======================================================================
+
+    async def _analyze_content_with_llm(
+        self,
+        task_description: str,
+        raw_content: str,
+        url: str,
+        base_url: str
+    ):
+        """Unified LLM analysis."""
+        logger.info("π€ Running unified LLM analysis...")
+
+        url_pattern = r'https?://[^\s<>"\']+(?:/[^\s<>"\']*)?'
+        all_urls = re.findall(url_pattern, task_description + raw_content[:1000])
+        all_urls = list({u.rstrip('.,;:)') for u in all_urls})
+
+        prompt = AnalysisPrompts.unified_content_analysis_prompt(
+            task_description=task_description[:2000],
+            found_urls=all_urls,
+            current_url=url,
+            base_url=base_url
         )

+        from app.orchestrator.models import UnifiedTaskAnalysis

+        try:
+            analysis: UnifiedTaskAnalysis = await self.llm_client.run_agent(
+                self._content_analyzer_agent,
+                prompt
             )
+            return analysis
+        except Exception as e:
+            logger.error(f"β LLM analysis failed: {e}", exc_info=True)
+            return self._fallback_analysis(task_description, all_urls, url, base_url)

+    def _fallback_analysis(
+        self,
+        task_description: str,
+        all_urls: List[str],
+        url: str,
+        base_url: str
+    ):
+        """Very simple fallback if LLM fails."""
+        from app.orchestrator.models import UnifiedTaskAnalysis, InstructionStep
+
+        logger.warning("β οΈ Using fallback pattern-based analysis")
+
+        is_redirect = False
+        submission_url = None
+
+        for pattern in [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']:
+            m = re.search(pattern, task_description, re.IGNORECASE)
+            if m:
+                submission_url = m.group(1).rstrip('.,;:)')
+                break
+
+        sentences = re.split(r'[.;\n]', task_description)
+        instructions = []
+        step = 1
+        for s in sentences:
+            s = s.strip()
+            if len(s) > 5:
+                instructions.append(InstructionStep(
+                    step_number=step,
+                    action='unknown',
+                    description=s,
+                    target=None,
+                    dependencies=[]
+                ))
+                step += 1
+
+        return UnifiedTaskAnalysis(
+            is_redirect=is_redirect,
+            question_url=None,
+            redirect_reasoning="Fallback: no redirect detection",
+            submission_url=submission_url,
+            submission_url_is_relative=submission_url.startswith('/') if submission_url else False,
+            submission_reasoning="Fallback: simple regex match",
+            instructions=instructions,
+            overall_goal="Unknown (fallback)",
+            complexity="unknown",
+            confidence=0.3
+        )

+    def _format_instructions(self, steps) -> List[Dict[str, Any]]:
+        return [
+            {
+                'step': s.step_number,
+                'action': s.action,
+                'text': s.description,
+                'target': s.target,
+                'dependencies': s.dependencies,
+            }
+            for s in steps
+        ]
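
Both methods above build `UnifiedTaskAnalysis` / `InstructionStep` from app/orchestrator/models.py (added in this commit, not shown in this part of the diff). Judging only from the fields used here, the models are presumably Pydantic schemas along these lines (an inferred sketch, not the committed definitions):

from typing import List, Optional
from pydantic import BaseModel

class InstructionStep(BaseModel):
    step_number: int
    action: str                        # scrape / extract / calculate / submit / ...
    description: str
    target: Optional[str] = None
    dependencies: List[int] = []       # step_numbers this step waits on

class UnifiedTaskAnalysis(BaseModel):
    is_redirect: bool
    question_url: Optional[str] = None
    redirect_reasoning: str
    submission_url: Optional[str] = None
    submission_url_is_relative: bool = False
    submission_reasoning: str
    instructions: List[InstructionStep] = []
    overall_goal: str
    complexity: str                    # trivial / simple / moderate / complex
    confidence: float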
app/services/task_processor.py
CHANGED
@@ -1,24 +1,25 @@
 """
 Task Processing Service
-
 """

- from typing import Dict, Any
-
 from app.models.request import TaskRequest
 from app.core.logging import get_logger
 from app.core.exceptions import TaskProcessingError
 from app.orchestrator.orchestrator_engine import OrchestratorEngine
 from app.modules.registry import ModuleRegistry
 from app.modules.mock_modules import register_mock_modules

 logger = get_logger(__name__)


 class TaskProcessor:
     """
-    Service class for processing tasks
-    Uses
     """

     def __init__(
@@ -27,74 +28,177 @@ class TaskProcessor:
         enable_actions: bool = True,
         auto_register_modules: bool = True
     ):
-        """
-
-
-        Args:
-            enable_decomposition: Enable complex task decomposition
-            enable_actions: Enable action execution (downloads, transcription, OCR)
-            auto_register_modules: Auto-register mock modules
-        """
-        logger.info("π Initializing TaskProcessor with OrchestratorEngine")

-        # Setup
         self.registry = ModuleRegistry()

         if auto_register_modules:
-            logger.info("π¦ Registering
             register_mock_modules(self.registry)
             logger.info(f"β Registered {len(self.registry.get_all_modules())} modules")

-        # Initialize orchestrator engine
         self.orchestrator = OrchestratorEngine(
             module_registry=self.registry,
             enable_decomposition=enable_decomposition,
             enable_actions=enable_actions
         )

-        logger.info("β TaskProcessor initialized

     async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
         """
-        Process
-        Uses complete orchestration pipeline (Steps 1-8)

-
-
-
-
-
-
-
-            TaskProcessingError: If processing fails
         """
         logger.info(f"π Processing task for: {task_data.email}")
-        logger.info(f"π

         try:
-            #
             logger.info("=" * 80)
-
             logger.info("=" * 80)

             orchestration_result = await self.orchestrator.execute_task(
-                task_input=
-                task_url=
-                context={
             )

-
-
-
-                orchestration_result=orchestration_result
-            )

-
-
             logger.info("=" * 80)

-
-
         except TaskProcessingError:
             # Re-raise task processing errors
             raise
@@ -103,117 +207,158 @@ class TaskProcessor:
             logger.error(f"β Task processing failed: {str(e)}", exc_info=True)
             raise TaskProcessingError(f"Failed to process task: {str(e)}")

     def _build_response(
         self,
         task_data: TaskRequest,
-
     ) -> Dict[str, Any]:
         """
-        Build API response

         Args:
             task_data: Original task request
             orchestration_result: Result from orchestrator

         Returns:
-
         """
-
-
-

-
-
-            '
             'email': task_data.email,
-            'task_url': str(task_data.url),
-            'task_id': orchestration_result['task_id'],
-            'execution_id': orchestration_result['execution_id'],
-            'duration': orchestration_result['duration'],
-            'strategy': orchestration_result.get('strategy', 'unknown'),
-            'success': orchestration_result['success']
-        }
-
-        # Add execution details
-        if orchestration_result['success']:
-            response['result'] = {
-                'data': orchestration_result.get('data'),
-                'execution_details': orchestration_result.get('execution_details', {}),
-                'steps_completed': orchestration_result.get('steps', {})
-            }

-        #
-
-

-        #
-
-

-

-
-
-

-
-

-        return
-
-    def _format_classification(self, classification: Any) -> Dict[str, Any]:
-        """Format classification for API response"""
-        try:
-            return {
-                'primary_task': classification.primary_task.value,
-                'secondary_tasks': [t.value for t in classification.secondary_tasks],
-                'complexity': classification.complexity.value,
-                'estimated_steps': classification.estimated_steps,
-                'requires_javascript': classification.requires_javascript,
-                'requires_authentication': classification.requires_authentication,
-                'requires_external_data': classification.requires_external_data,
-                'output_format': classification.output_format.value,
-                'confidence': classification.confidence,
-                'reasoning': classification.reasoning,
-                'key_entities': classification.key_entities,
-                'suggested_tools': classification.suggested_tools
-            }
-        except Exception as e:
-            logger.warning(f"Could not format classification: {e}")
-            return {'error': 'Classification format error'}

-
-
-
-        return {
-            'data_sources': [
-                {
-                    'type': ds.type,
-                    'location': ds.location,
-                    'format': ds.format,
-                    'description': ds.description
-                }
-                for ds in parameters.data_sources
-            ],
-            'filters': [
-                {
-                    'field': f.field,
-                    'operator': f.operator,
-                    'value': f.value,
-                    'description': f.description
-                }
-                for f in parameters.filters
-            ],
-            'columns': [col.name for col in parameters.columns],
-            'aggregations': len(parameters.aggregations),
-            'visualizations': len(parameters.visualizations),
-            'output_format': parameters.output.format if parameters.output else None,
-            'confidence': parameters.confidence,
-            'complexity_score': parameters.complexity_score
-        }
-        except Exception as e:
-            logger.warning(f"Could not format parameters: {e}")
-            return {'error': 'Parameters format error'}

     def get_registry(self) -> ModuleRegistry:
         """Get module registry for adding/removing modules"""
@@ -223,7 +368,16 @@ class TaskProcessor:
         """Get orchestrator engine for advanced usage"""
         return self.orchestrator

-    def
         """Clean up resources"""
-
-
 """
 Task Processing Service
+Simplified with unified LLM analysis in task_fetcher
 """

+from typing import Dict, Any, Optional
 from app.models.request import TaskRequest
 from app.core.logging import get_logger
 from app.core.exceptions import TaskProcessingError
 from app.orchestrator.orchestrator_engine import OrchestratorEngine
 from app.modules.registry import ModuleRegistry
 from app.modules.mock_modules import register_mock_modules
+from app.services.task_fetcher import TaskFetcher
+# from app.orchestrator.answer_submitter import AnswerSubmitter  # β Fixed: uncommented

 logger = get_logger(__name__)


 class TaskProcessor:
     """
+    Service class for processing TDS quiz tasks
+    Uses unified LLM analysis from task_fetcher
     """

     def __init__(
         enable_actions: bool = True,
         auto_register_modules: bool = True
     ):
+        """Initialize task processor"""
+        logger.info("π Initializing TaskProcessor")

+        # Setup components
         self.registry = ModuleRegistry()
+        # self.answer_submitter = AnswerSubmitter()  # β Fixed: using this

         if auto_register_modules:
+            logger.info("π¦ Registering modules...")
             register_mock_modules(self.registry)
             logger.info(f"β Registered {len(self.registry.get_all_modules())} modules")

+        # Initialize orchestrator engine
         self.orchestrator = OrchestratorEngine(
             module_registry=self.registry,
             enable_decomposition=enable_decomposition,
             enable_actions=enable_actions
         )

+        logger.info("β TaskProcessor initialized")

     async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
         """
+        Process TDS quiz task

+        Flow:
+        1. Fetch and analyze Request URL (unified LLM call in task_fetcher)
+        2. If redirect, fetch Question URL (unified LLM call in task_fetcher)
+        3. Execute orchestration
+        4. Extract answer
+        5. Submit to TDS
+        6. Build response
         """
+        logger.info("=" * 80)
         logger.info(f"π Processing task for: {task_data.email}")
+        logger.info(f"π Request URL: {task_data.url}")
+        logger.info("=" * 80)
+
+        request_url = str(task_data.url)
+        question_url = None
+        submission_url = None

         try:
+            # ===================================================================
+            # STEP 1: FETCH AND ANALYZE REQUEST URL (UNIFIED LLM CALL)
+            # ===================================================================
+            logger.info("\n" + "=" * 80)
+            logger.info("STEP 1: FETCHING & ANALYZING REQUEST URL")
             logger.info("=" * 80)
+
+            async with TaskFetcher() as fetcher:
+                analysis = await fetcher.fetch_and_analyze(url=request_url)
+
+            logger.info(f"β Request URL analyzed")
+            logger.info(f"  Is Redirect: {analysis['is_redirect']}")
+            logger.info(f"  Complexity: {analysis['complexity']}")
+
+            # ===================================================================
+            # STEP 2: IF REDIRECT, FETCH QUESTION URL (UNIFIED LLM CALL)
+            # ===================================================================
+            if analysis['is_redirect'] and analysis['question_url']:
+                logger.info("\n" + "=" * 80)
+                logger.info("STEP 2: FETCHING & ANALYZING QUESTION URL")
+                logger.info("=" * 80)
+
+                question_url = analysis['question_url']
+                logger.info(f"π Detected redirect to: {question_url}")
+
+                async with TaskFetcher() as fetcher:
+                    analysis = await fetcher.fetch_and_analyze(
+                        url=question_url,
+                        base_url=request_url  # For resolving relative URLs
+                    )
+
+                logger.info(f"β Question URL analyzed")
+                logger.info(f"  Task: {analysis['task_description'][:100]}...")
+            else:
+                question_url = request_url
+                logger.info(f"β Request URL contains actual task (no redirect)")
+
+            # Extract key information
+            task_description = analysis['task_description']
+            submission_url = analysis.get('submission_url')
+            instructions = analysis.get('instructions', [])
+
+            # Log URL hierarchy
+            logger.info("\nπ URL Hierarchy:")
+            logger.info(f"  Request URL: {request_url}")
+            logger.info(f"  Question URL: {question_url}")
+            logger.info(f"  Submission URL: {submission_url}")
+            logger.info(f"  Instructions: {len(instructions)} steps")
+
+            # ===================================================================
+            # STEP 3: EXECUTE ORCHESTRATION
+            # ===================================================================
+            logger.info("\n" + "=" * 80)
+            logger.info("STEP 3: EXECUTING ORCHESTRATION")
             logger.info("=" * 80)

             orchestration_result = await self.orchestrator.execute_task(
+                task_input=task_description,
+                task_url=question_url,
+                context={
+                    'email': task_data.email,
+                    'request_url': request_url,
+                    'question_url': question_url,
+                    'submission_url': submission_url,
+                    'instructions': instructions,
+                    'complexity': analysis['complexity'],
+                    'overall_goal': analysis['overall_goal']
+                }
             )

+            logger.info(f"β Orchestration completed")
+            logger.info(f"  Success: {orchestration_result['success']}")
+            logger.info(f"  Duration: {orchestration_result['duration']:.2f}s")

+            # ===================================================================
+            # STEP 4: EXTRACT ANSWER
+            # ===================================================================
+            logger.info("\n" + "=" * 80)
+            logger.info("STEP 4: EXTRACTING ANSWER")
             logger.info("=" * 80)

+            answer = self._extract_answer(orchestration_result)
+            logger.info(f"β Answer extracted: {str(answer)[:200]}")
+
+            # # ===================================================================
+            # # STEP 5: SUBMIT ANSWER TO TDS
+            # # ===================================================================
+            # logger.info("\n" + "=" * 80)
+            # logger.info("STEP 5: SUBMITTING ANSWER TO TDS")
+            # logger.info("=" * 80)
+
+            # submission_result = await self.answer_submitter.submit_answer(
+            #     email=task_data.email,
+            #     secret=task_data.secret,
+            #     url=question_url,  # β Use Question URL, not Request URL
+            #     answer=answer,
+            #     submission_url=submission_url
+            # )
+
+            # logger.info(f"β Submission completed")
+            # logger.info(f"  Success: {submission_result['success']}")
+            # logger.info(f"  Status Code: {submission_result.get('status_code')}")
+
+            # if submission_result.get('response'):
+            #     logger.info(f"  Response: {submission_result['response']}")
+
+            # # ===================================================================
+            # # STEP 6: BUILD RESPONSE
+            # # ===================================================================
+            # result = self._build_response(
+            #     task_data=task_data,
+            #     request_url=request_url,
+            #     question_url=question_url,
+            #     submission_url=submission_url,
+            #     analysis=analysis,  # β Fixed: pass analysis, not task_content
+            #     orchestration_result=orchestration_result,
+            #     submission_result=submission_result,
+            #     answer=answer
+            # )
+
+            # logger.info("\n" + "=" * 80)
+            # logger.info(f"β TASK COMPLETED SUCCESSFULLY")
+            # logger.info(f"  Total Duration: {orchestration_result['duration']:.2f}s")
+            # logger.info(f"  Answer Submitted: {submission_result['success']}")
+            # logger.info("=" * 80)
+
+            # return result  # β Fixed: actually return the result
+            return
         except TaskProcessingError:
             # Re-raise task processing errors
             raise
             logger.error(f"β Task processing failed: {str(e)}", exc_info=True)
             raise TaskProcessingError(f"Failed to process task: {str(e)}")

+    def _extract_answer(self, orchestration_result: Dict[str, Any]) -> Any:
+        """
+        Extract final answer from orchestration result
+
+        Tries multiple field names and strategies to find the answer
+        """
+        logger.debug("Extracting answer from orchestration result")
+
+        if not orchestration_result.get('success'):
+            logger.warning("Orchestration was not successful")
+            return None
+
+        data = orchestration_result.get('data', {})
+
+        # If data is not a dict, return it directly
+        if not isinstance(data, dict):
+            logger.debug(f"Data is {type(data).__name__}, returning as-is")
+            return data
+
+        # Try common answer field names
+        result_fields = [
+            'answer', 'result', 'output', 'value', 'computed_value',
+            'extracted_data', 'scraped_data', 'secret_code',
+            'code', 'secret', 'solution', 'response'
+        ]
+
+        for field in result_fields:
+            if field in data:
+                logger.debug(f"Found answer in '{field}' field")
+                return data[field]
+
+        # If only one key, return its value
+        if len(data) == 1:
+            key = list(data.keys())[0]
+            logger.debug(f"Single key '{key}' in data, using its value")
+            return data[key]
+
+        # Return entire data dict as last resort
+        logger.debug("No specific answer field found, returning entire data")
+        return data
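
To make the fallback order concrete, here is how `_extract_answer` resolves a few shapes of orchestration output (illustrative values, on a processor instance `p`):

p = TaskProcessor()
p._extract_answer({'success': True, 'data': {'answer': 42, 'log': '...'}})  # -> 42 (named field wins)
p._extract_answer({'success': True, 'data': {'secret_word': 'mango'}})      # -> 'mango' (single key)
p._extract_answer({'success': True, 'data': [1, 2, 3]})                     # -> [1, 2, 3] (non-dict as-is)
p._extract_answer({'success': False})                                       # -> None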
     def _build_response(
         self,
         task_data: TaskRequest,
+        request_url: str,
+        question_url: str,
+        submission_url: str,
+        analysis: Dict[str, Any],  # β Fixed: renamed from task_content
+        orchestration_result: Dict[str, Any],
+        submission_result: Dict[str, Any],
+        answer: Any
     ) -> Dict[str, Any]:
         """
+        Build comprehensive API response

         Args:
             task_data: Original task request
+            request_url: Original URL from API request
+            question_url: URL where actual task was found
+            submission_url: URL where answer was submitted
+            analysis: Unified analysis from task_fetcher
             orchestration_result: Result from orchestrator
+            submission_result: Result from TDS submission
+            answer: Extracted answer

         Returns:
+            Formatted response dict
         """
+        overall_success = (
+            orchestration_result['success'] and
+            submission_result['success']
+        )

+        return {
+            # Status
+            'success': overall_success,
+            'status': 'completed' if overall_success else 'failed',
+
+            # Request info
             'email': task_data.email,

+            # URL hierarchy
+            'urls': {
+                'request_url': request_url,
+                'question_url': question_url,
+                'submission_url': submission_url
+            },
+
+            # IDs and timing
+            'task_id': orchestration_result.get('task_id'),
+            'execution_id': orchestration_result.get('execution_id'),
+            'duration': orchestration_result.get('duration'),
+            'timestamp': orchestration_result.get('timestamp'),
+
+            # Answer
+            'answer': answer,
+
+            # Submission details
+            'submission': {
+                'success': submission_result['success'],
+                'status_code': submission_result.get('status_code'),
+                'submitted_to': submission_url,
+                'submitted_url': question_url,  # URL included in payload
+                'response': submission_result.get('response')
+            },

+            # Task details
+            'task_details': {
+                'task_description': analysis['task_description'][:500],  # Truncate
+                'complexity': analysis.get('complexity'),
+                'overall_goal': analysis.get('overall_goal'),
+                'instructions_count': len(analysis.get('instructions', [])),
+                'was_redirect': analysis.get('is_redirect', False)
+            },

+            # Orchestration details
+            'orchestration': {
+                'success': orchestration_result['success'],
+                'strategy': orchestration_result.get('strategy'),
+                'steps_completed': list(orchestration_result.get('steps', {}).keys())
+            },
+
+            # LLM analysis metadata
+            'llm_analysis': analysis.get('llm_analysis', {}),
+
+            # Message
+            'message': self._build_message(overall_success, orchestration_result, submission_result)
+        }
+
+    def _build_message(
+        self,
+        overall_success: bool,
+        orchestration_result: Dict[str, Any],
+        submission_result: Dict[str, Any]
+    ) -> str:
+        """Build human-readable status message"""
+        if overall_success:
+            return "Task completed successfully and answer submitted to TDS"

+        if not orchestration_result['success']:
+            error = orchestration_result.get('error', 'Unknown error')
+            return f"Task execution failed: {error}"

+        if not submission_result['success']:
+            error = submission_result.get('error', 'Unknown error')
+            return f"Task completed but submission failed: {error}"

+        return "Task failed for unknown reason"

+    # ========================================================================
+    # UTILITY METHODS
+    # ========================================================================

     def get_registry(self) -> ModuleRegistry:
         """Get module registry for adding/removing modules"""
         """Get orchestrator engine for advanced usage"""
         return self.orchestrator

+    # def get_answer_submitter(self) -> AnswerSubmitter:
+    #     """Get answer submitter for testing"""
+    #     return self.answer_submitter
+
+    async def cleanup(self):
         """Clean up resources"""
+        try:
+            self.orchestrator.cleanup()
+            logger.info("β Orchestrator cleanup complete")
+        except Exception as e:
+            logger.warning(f"Orchestrator cleanup error: {e}")
+
+        logger.info("β TaskProcessor cleanup complete")
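
End to end, the service is driven roughly like this (a sketch; the TaskRequest values are placeholders, and as committed `process` stops after Step 4 and returns None until the submission steps are re-enabled):

from app.models.request import TaskRequest
from app.services.task_processor import TaskProcessor

processor = TaskProcessor()
task = TaskRequest(email="student@example.com", secret="...", url="https://tds.example.com/task")
result = await processor.process(task)   # fetch -> analyze -> orchestrate -> extract answer
await processor.cleanup()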
app/utils/prompts.py
CHANGED
@@ -54,7 +54,84 @@ Extract EVERYTHING that could be useful for task execution."""

 class AnalysisPrompts:
     """Prompts for data analysis and insight generation"""
-
     @staticmethod
     def analysis_planning_prompt(
         question: str,
@@ -543,3 +620,88 @@ Be detailed and actionable. Each step should be implementable."""
             lines.append(f"  ... and {len(values) - 2} more")

         return "\n".join(lines)

 class AnalysisPrompts:
     """Prompts for data analysis and insight generation"""
+    @staticmethod
+    def unified_content_analysis_prompt(
+        task_description: str,
+        found_urls: List[str],
+        current_url: str,
+        base_url: str
+    ) -> str:
+        """Optimized unified analysis prompt"""
+        urls_text = "\n".join(f"- {url}" for url in found_urls) if found_urls else "None"
+
+        return f"""Analyze this quiz/task content and extract all critical information.
+
+**Current URL:** {current_url}
+**Base URL:** {base_url}
+
+**Content:**
+{task_description}
+
+**URLs found:**
+{urls_text}
+
+---
+
+## EXTRACT:
+
+### 1. SUBMISSION URL (Priority #1)
+Where to POST the final answer.
+
+**Search for:** "POST to", "submit to", "send to", "answer to"
+**Extract from:** Text, markdown links `[text](URL)`, relative paths `/submit`
+**Set submission_url_is_relative=True** if starts with `/`
+
+### 2. REDIRECT DETECTION
+**is_redirect=True** if content says "visit URL" or "task at URL" (directs elsewhere)
+**is_redirect=False** if content IS the task (has instructions)
+
+Provide **question_url** if redirect detected.
+
+### 3. INSTRUCTION PARSING
+Break into steps (ONLY if is_redirect=False).
+
+**Actions:** scrape, extract, calculate, submit, download, transcribe, analyze, visit
+**Each step:** step_number, action, description, target, dependencies
+
+### 4. ASSESSMENT
+- **overall_goal**: One sentence
+- **complexity**: trivial/simple/moderate/complex
+- **confidence**: 0.0-1.0
+
+---
+
+## EXAMPLE:
+
+**Input:**
+"Scrape /data?email=... Get the secret code. POST code to [/submit](https://example.com/submit)"
+
+**Output:**
+{{
+  "is_redirect": false,
+  "question_url": null,
+  "redirect_reasoning": "Contains task instructions",
+  "submission_url": "/submit",
+  "submission_url_is_relative": true,
+  "submission_reasoning": "Found 'POST code to /submit'",
+  "instructions": [
+    {{"step_number": 1, "action": "scrape", "description": "Scrape /data page", "target": "/data?email=...", "dependencies": []}},
+    {{"step_number": 2, "action": "extract", "description": "Extract secret code", "target": "secret code", "dependencies": [1]}},
+    {{"step_number": 3, "action": "submit", "description": "POST code to /submit", "target": "/submit", "dependencies": [2]}}
+  ],
+  "overall_goal": "Scrape, extract, and submit secret code",
+  "complexity": "simple",
+  "confidence": 0.92
+}}
+
+Now analyze the content above."""
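
This prompt is consumed by `_analyze_content_with_llm` in task_fetcher.py, which truncates the description to 2000 characters and passes the de-duplicated URL list. A minimal rendering with placeholder inputs:

prompt = AnalysisPrompts.unified_content_analysis_prompt(
    task_description="Scrape /data and POST the code to /submit",
    found_urls=["https://example.com/data"],
    current_url="https://example.com/quiz-834",
    base_url="https://example.com/quiz-834",
)
# The agent bound to UnifiedTaskAnalysis then returns the structured fields
# (is_redirect, submission_url, instructions, ...) shown in the example above.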
     @staticmethod
     def analysis_planning_prompt(
         question: str,

             lines.append(f"  ... and {len(values) - 2} more")

         return "\n".join(lines)
+
+    @staticmethod
+    def url_detection_prompt(
+        content: str,
+        urls: list,
+        request_url: str
+    ) -> str:
+        """
+        Prompt for detecting Question URL from content
+
+        Args:
+            content: Fetched content
+            urls: List of URLs found in content
+            request_url: Original request URL
+
+        Returns:
+            Formatted prompt
+        """
+        urls_text = "\n".join(f"{i+1}. {url}" for i, url in enumerate(urls)) if urls else "None"
+
+        return f"""You are analyzing content from a TDS quiz/task system to determine URL relationships.
+
+**Context:**
+- We fetched content from: {request_url}
+- This content might either:
+  1. BE the actual task/question (return is_redirect=False)
+  2. REDIRECT/POINT to another URL where the actual task is located (return is_redirect=True)
+
+**URLs found in content:**
+{urls_text}
+
+**Content (truncated to 1500 chars):**
+{content[:1500]}
+
+**Your Task:**
+Analyze if this content IS the actual task, or if it REDIRECTS to another URL to get the task.
+
+**URL Type Definitions:**
+- **question_url**: URL to visit to GET the actual task/question
+- **data_url**: URL to GET/SCRAPE data from (as part of task instructions)
+- **submission_url**: URL to POST the final answer to
+- **reference_url**: URL for reference/documentation only
+
+**Decision Rules:**
+
+1. **is_redirect=True** when:
+   - Content explicitly says "visit <url>", "your task is at <url>", "go to <url>"
+   - Content is very short (< 100 chars) with just a URL
+   - Content describes WHERE to find the task, not WHAT the task is
+   - Primary purpose is to direct you to another location
+
+2. **is_redirect=False** when:
+   - Content contains actual task instructions (scrape, analyze, calculate, etc.)
+   - URLs are mentioned as DATA SOURCES or SUBMISSION endpoints
+   - Content is the task itself, even if it references other URLs
+
+**Examples:**
+
+Example 1 (REDIRECT):
+Content: "Your quiz is available at https://example.com/quiz-834"
+β is_redirect=True, question_url="https://example.com/quiz-834"
+β Reasoning: Content tells you WHERE to go, not WHAT to do
+
+Example 2 (ACTUAL TASK):
+Content: "Scrape https://example.com/data and extract the top 5 prices. Submit to https://example.com/submit"
+β is_redirect=False
+β Reasoning: This IS the task. URLs are for data source and submission, not for getting the task
+
+Example 3 (REDIRECT with multiple URLs):
+Content: "Visit https://example.com/quiz-834 to receive your assignment. You'll be asked to scrape another website."
+β is_redirect=True, question_url="https://example.com/quiz-834"
+β Reasoning: Content directs you to quiz-834 to GET the actual task
+
+Example 4 (ACTUAL TASK with multiple URLs):
+Content: "Download data from https://api.example.com/data and compare with https://example.com/reference. Calculate the difference."
+β is_redirect=False
+β Reasoning: This IS the task. Both URLs are data sources for completing the task
+
+Example 5 (SHORT REDIRECT):
+Content: "https://example.com/quiz-834"
+β is_redirect=True, question_url="https://example.com/quiz-834"
+β Reasoning: Only a URL, no task instructions
+
+**Now analyze the content above and provide your analysis.**"""