23f3003322 committed on
Commit
dfe7fff
·
1 Parent(s): 308bf1a

task processing complete

app/__pycache__/main.cpython-313.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ
 
app/api/routes/__pycache__/task.cpython-313.pyc CHANGED
Binary files a/app/api/routes/__pycache__/task.cpython-313.pyc and b/app/api/routes/__pycache__/task.cpython-313.pyc differ
 
app/api/routes/task.py CHANGED
@@ -1,66 +1,176 @@
1
  """
2
- Task Processing Routes
3
- Main API endpoint for handling task requests
4
  """
5
 
 
6
  from datetime import datetime
7
- from fastapi import APIRouter, Request, status
8
 
9
  from app.models.request import TaskRequest
10
- from app.models.response import TaskResponse
11
- from app.api.dependencies import verify_authentication
12
- from app.services.task_processor import TaskProcessor
13
  from app.core.logging import get_logger
14
-
15
- import requests
 
16
 
17
  logger = get_logger(__name__)
18
 
19
  router = APIRouter()
 
 
20
  task_processor = TaskProcessor()
21
 
22
 
23
- @router.post("/task", response_model=TaskResponse, status_code=status.HTTP_200_OK)
24
- async def handle_task(request: Request):
25
  """
26
  Main API endpoint for handling task requests
27
 
28
- - Validates request format (HTTP 400 if invalid)
29
- - Verifies secret (HTTP 403 if invalid)
30
- - Processes task and returns results (HTTP 200 if successful)
31
  """
32
  start_time = datetime.now()
33
 
34
  logger.info("πŸ“₯ Task request received")
35
 
36
- # Parse and validate request body with Pydantic
37
- body = await request.json()
38
- task_data = TaskRequest(**body)
39
 
40
- logger.info(f"βœ… Request validated for: {task_data.email}")
 
 
41
 
42
- # Verify authentication
43
- logger.info("πŸ” Verifying authentication")
44
- verify_authentication(task_data.secret)
45
- logger.info("βœ… Authentication successful")
46
 
47
- # Process the task
48
- logger.info("πŸš€ Starting task execution")
49
- result_data = await task_processor.process(task_data)
50
 
51
- # Calculate execution time
52
- execution_time = (datetime.now() - start_time).total_seconds()
53
- logger.info(f"⏱️ Task completed in {execution_time:.3f}s")
54
 
55
- # Prepare response
56
- response = TaskResponse(
57
- success=True,
58
- message="Task completed successfully",
59
- data=result_data,
60
- email=task_data.email,
61
- task_url=str(task_data.url),
62
- execution_time=execution_time
63
- )
64
 
65
- logger.info("βœ… Response prepared successfully")
66
- return response
1
  """
2
+ Task API Routes
3
+ Handles task submission and processing
4
  """
5
 
6
+ from fastapi import APIRouter, Request, status, BackgroundTasks, HTTPException
7
  from datetime import datetime
8
+ from typing import Dict, Any
9
 
10
  from app.models.request import TaskRequest
11
+ from app.models.response import TaskResponse, ImmediateResponse
 
 
12
  from app.core.logging import get_logger
13
+ from app.core.security import verify_authentication, AuthenticationError
14
+ from app.core.exceptions import TaskProcessingError
15
+ from app.services.task_processor import TaskProcessor
16
 
17
  logger = get_logger(__name__)
18
 
19
  router = APIRouter()
20
+
21
+ # Initialize task processor (singleton)
22
  task_processor = TaskProcessor()
23
 
24
 
25
+ @router.post(
26
+ "/task",
27
+ response_model=ImmediateResponse,
28
+ status_code=status.HTTP_200_OK,
29
+ responses={
30
+ 200: {"description": "Request accepted and processing started"},
31
+ 400: {"description": "Invalid JSON format or request data"},
32
+ 403: {"description": "Invalid secret - authentication failed"}
33
+ }
34
+ )
35
+ async def handle_task(
36
+ request: Request,
37
+ background_tasks: BackgroundTasks
38
+ ):
39
  """
40
  Main API endpoint for handling task requests
41
 
42
+ Flow:
43
+ 1. Validate JSON format (HTTP 400 if invalid)
44
+ 2. Verify secret (HTTP 403 if invalid)
45
+ 3. Respond immediately with HTTP 200
46
+ 4. Process task in background
47
+
48
+ Returns:
49
+ Immediate HTTP 200 response with task accepted message
50
  """
51
  start_time = datetime.now()
52
 
53
  logger.info("πŸ“₯ Task request received")
54
 
55
+ try:
56
+ # ================================================================
57
+ # STEP 1: PARSE AND VALIDATE JSON (HTTP 400 if invalid)
58
+ # ================================================================
59
+ try:
60
+ body = await request.json()
61
+ task_data = TaskRequest(**body)
62
+ except ValueError as e:
63
+ logger.error(f"❌ Invalid JSON format: {str(e)}")
64
+ raise HTTPException(
65
+ status_code=status.HTTP_400_BAD_REQUEST,
66
+ detail=f"Invalid JSON format: {str(e)}"
67
+ )
68
+ except Exception as e:
69
+ logger.error(f"❌ Request validation failed: {str(e)}")
70
+ raise HTTPException(
71
+ status_code=status.HTTP_400_BAD_REQUEST,
72
+ detail=f"Invalid request data: {str(e)}"
73
+ )
74
+
75
+ logger.info(f"βœ… Request validated for: {task_data.email}")
76
+
77
+ # ================================================================
78
+ # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
79
+ # ================================================================
80
+ logger.info("πŸ” Verifying authentication")
81
+ try:
82
+ verify_authentication(task_data.secret)
83
+ except AuthenticationError as e:
84
+ logger.error(f"❌ Authentication failed: {str(e)}")
85
+ raise HTTPException(
86
+ status_code=status.HTTP_403_FORBIDDEN,
87
+ detail="Invalid secret. Authentication failed."
88
+ )
89
+
90
+ logger.info("βœ… Authentication successful")
91
+
92
+ # ================================================================
93
+ # STEP 3: RESPOND IMMEDIATELY WITH HTTP 200
94
+ # ================================================================
95
+ logger.info("βœ… Request accepted - processing in background")
96
+
97
+ # Add task processing to background
98
+ background_tasks.add_task(
99
+ process_task_background,
100
+ task_data=task_data,
101
+ start_time=start_time
102
+ )
103
+
104
+ # Immediate response
105
+ response = ImmediateResponse(
106
+ success=True,
107
+ message="Task accepted and processing started",
108
+ email=task_data.email,
109
+ task_url=str(task_data.url),
110
+ status="processing",
111
+ timestamp=datetime.now().isoformat()
112
+ )
113
+
114
+ logger.info(f"πŸ“€ Sent immediate response to client")
115
+
116
+ return response
117
 
118
+ except HTTPException:
119
+ # Re-raise HTTP exceptions (400, 403)
120
+ raise
121
 
122
+ except Exception as e:
123
+ logger.error(f"❌ Unexpected error: {str(e)}", exc_info=True)
124
+ raise HTTPException(
125
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
126
+ detail=f"Internal server error: {str(e)}"
127
+ )
128
+
129
+
130
+ async def process_task_background(
131
+ task_data: TaskRequest,
132
+ start_time: datetime
133
+ ):
134
+ """
135
+ Process task in background after sending immediate response
136
 
137
+ This runs asynchronously after the HTTP 200 response is sent.
138
+ Results are logged but not returned to client.
 
139
 
140
+ Args:
141
+ task_data: Validated task request
142
+ start_time: Request start time for metrics
143
+ """
144
+ logger.info("=" * 80)
145
+ logger.info("πŸ”„ BACKGROUND TASK PROCESSING STARTED")
146
+ logger.info("=" * 80)
147
+ logger.info(f"πŸ”— URL: {task_data.url}")
148
 
149
+ try:
150
+ # Process the task
151
+ result_data = await task_processor.process(task_data)
152
+
153
+ # Calculate execution time
154
+ execution_time = (datetime.now() - start_time).total_seconds()
155
+
156
+
157
+
158
+ # Optional: Store result in database/cache for later retrieval
159
+ # await store_result(task_data.email, result_data)
160
+
161
+ except TaskProcessingError as e:
162
+ logger.error("=" * 80)
163
+ logger.error("❌ BACKGROUND TASK FAILED")
164
+ logger.error("=" * 80)
165
+ logger.error(f"Error: {str(e)}")
166
+ logger.error("=" * 80)
167
+
168
+ # Optional: Store error for later retrieval or send notification
169
+ # await store_error(task_data.email, str(e))
170
 
171
+ except Exception as e:
172
+ logger.error("=" * 80)
173
+ logger.error("❌ BACKGROUND TASK UNEXPECTED ERROR")
174
+ logger.error("=" * 80)
175
+ logger.error(f"Error: {str(e)}", exc_info=True)
176
+ logger.error("=" * 80)
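
For reference, a minimal client-side sketch of the new flow (not part of the diff): the field names email, secret and url come from how the handler reads TaskRequest; the host and example values are placeholders.

import httpx

payload = {
    "email": "student@example.com",           # TaskRequest.email
    "secret": "my-api-secret",                # checked by verify_authentication (403 on mismatch)
    "url": "https://example.com/quiz-834",    # TaskRequest.url, the task to fetch and process
}

resp = httpx.post("http://localhost:8000/task", json=payload, timeout=30)
resp.raise_for_status()

data = resp.json()
# The endpoint now answers immediately; the real work continues in a BackgroundTask.
print(data["status"])   # "processing"
print(data["message"])  # "Task accepted and processing started"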
app/core/__pycache__/security.cpython-313.pyc CHANGED
Binary files a/app/core/__pycache__/security.cpython-313.pyc and b/app/core/__pycache__/security.cpython-313.pyc differ
 
app/core/security.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
- Security Utilities
3
- Handles authentication and authorization
4
  """
5
 
6
  from app.core.config import settings
@@ -9,49 +8,47 @@ from app.core.logging import get_logger
9
  logger = get_logger(__name__)
10
 
11
 
12
- def verify_secret(provided_secret: str) -> bool:
13
  """
14
- Verify the provided secret against environment configuration
15
 
16
  Args:
17
- provided_secret: Secret from request
18
 
19
  Returns:
20
- bool: True if secret matches, False otherwise
21
  """
22
- if not settings.is_secret_configured():
23
- logger.error("⚠️ API_SECRET not configured in environment")
24
- return False
25
 
26
- is_valid = provided_secret == settings.API_SECRET
 
27
 
28
- if is_valid:
29
- logger.info("βœ… Secret verification successful")
30
- else:
31
- logger.warning("🚫 Secret verification failed")
32
- logger.debug(
33
- f"Expected length: {len(settings.API_SECRET)}, "
34
- f"Got length: {len(provided_secret)}"
35
- )
36
 
37
  return is_valid
38
 
39
 
40
- def mask_secret(secret: str, visible_chars: int = 4) -> str:
41
  """
42
- Mask secret for logging purposes
43
 
44
  Args:
45
- secret: Secret to mask
46
- visible_chars: Number of characters to show at the end
47
 
48
  Returns:
49
- str: Masked secret
 
 
 
50
  """
51
- if not secret:
52
- return ""
53
-
54
- if len(secret) <= visible_chars:
55
- return "*" * len(secret)
56
 
57
- return "*" * (len(secret) - visible_chars) + secret[-visible_chars:]
 
1
  """
2
+ Security and Authentication
 
3
  """
4
 
5
  from app.core.config import settings
 
8
  logger = get_logger(__name__)
9
 
10
 
11
+ class AuthenticationError(Exception):
12
+ """Raised when authentication fails"""
13
+ pass
14
+
15
+
16
+ def verify_secret(secret: str) -> bool:
17
  """
18
+ Verify if provided secret is valid
19
 
20
  Args:
21
+ secret: Secret from request
22
 
23
  Returns:
24
+ bool: True if valid, False otherwise
25
  """
26
+ # Get expected secret from config/env
27
+ expected_secret = settings.API_SECRET
 
28
 
29
+ # Simple comparison (use constant-time comparison in production)
30
+ is_valid = secret == expected_secret
31
 
32
+ if not is_valid:
33
+ logger.warning(f"❌ Invalid secret attempt")
34
 
35
  return is_valid
36
 
37
 
38
+ def verify_authentication(secret: str) -> bool:
39
  """
40
+ Verify request authentication
41
 
42
  Args:
43
+ secret: Secret from request
 
44
 
45
  Returns:
46
+ bool: True if authenticated
47
+
48
+ Raises:
49
+ AuthenticationError: If authentication fails (HTTP 403)
50
  """
51
+ if not verify_secret(secret):
52
+ raise AuthenticationError("Invalid secret. Authentication failed.")
 
 
 
53
 
54
+ return True
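
The inline comment above notes that a constant-time comparison should be used in production; a minimal sketch of that variant with the standard library, keeping the same interface (illustrative only, not part of this commit):

import hmac

def verify_secret(secret: str) -> bool:
    """Same check as above, but resistant to timing side channels."""
    expected_secret = settings.API_SECRET
    # hmac.compare_digest compares in constant time regardless of where the strings differ
    is_valid = hmac.compare_digest(secret.encode("utf-8"), expected_secret.encode("utf-8"))
    if not is_valid:
        logger.warning("Invalid secret attempt")
    return is_valid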
app/models/__pycache__/response.cpython-313.pyc CHANGED
Binary files a/app/models/__pycache__/response.cpython-313.pyc and b/app/models/__pycache__/response.cpython-313.pyc differ
 
app/models/response.py CHANGED
@@ -7,6 +7,47 @@ from typing import Optional, Dict, Any
7
  from datetime import datetime
8
  from pydantic import BaseModel, Field
9
 
10
 
11
  class TaskResponse(BaseModel):
12
  """
 
7
  from datetime import datetime
8
  from pydantic import BaseModel, Field
9
 
10
+ class ImmediateResponse(BaseModel):
11
+ """
12
+ Immediate response sent after validation
13
+ HTTP 200 response before task processing
14
+ """
15
+ success: bool = Field(
16
+ description="Whether request was accepted"
17
+ )
18
+
19
+ message: str = Field(
20
+ description="Status message"
21
+ )
22
+
23
+ email: str = Field(
24
+ description="Student email from request"
25
+ )
26
+
27
+ task_url: str = Field(
28
+ description="Task URL from request"
29
+ )
30
+
31
+ status: str = Field(
32
+ description="Processing status: processing, completed, failed"
33
+ )
34
+
35
+ timestamp: str = Field(
36
+ description="Response timestamp (ISO format)"
37
+ )
38
+
39
+ class Config:
40
+ json_schema_extra = {
41
+ "example": {
42
+ "success": True,
43
+ "message": "Task accepted and processing started",
44
+ "email": "[email protected]",
45
+ "task_url": "https://example.com/quiz-834",
46
+ "status": "processing",
47
+ "timestamp": "2025-11-29T12:00:00"
48
+ }
49
+ }
50
+
51
 
52
  class TaskResponse(BaseModel):
53
  """
app/orchestrator/models.py CHANGED
@@ -48,6 +48,124 @@ class OutputFormat(str, Enum):
48
  UNKNOWN = "unknown"
49
 
50
 
51
  class TaskClassification(BaseModel):
52
  """
53
  Structured output for task classification
 
48
  UNKNOWN = "unknown"
49
 
50
 
51
+ class URLDetection(BaseModel):
52
+ """Result of URL detection analysis"""
53
+
54
+ is_redirect: bool = Field(
55
+ description="True if content redirects to another URL for the actual task"
56
+ )
57
+
58
+ question_url: Optional[str] = Field(
59
+ default=None,
60
+ description="The URL to visit for the actual task (if is_redirect is True)"
61
+ )
62
+
63
+ reasoning: str = Field(
64
+ description="Detailed explanation of why this is or isn't a redirect"
65
+ )
66
+
67
+ url_types: Dict[str, str] = Field(
68
+ default_factory=dict,
69
+ description="Classification of each URL found (e.g., 'question_url', 'data_url', 'submission_url')"
70
+ )
71
+
72
+ confidence: str = Field(
73
+ default="medium",
74
+ description="Confidence level: low, medium, high"
75
+ )
76
+
77
+
78
+ class InstructionStep(BaseModel):
79
+ """Single instruction step"""
80
+
81
+ step_number: int = Field(
82
+ description="Step number in sequence (1, 2, 3...)"
83
+ )
84
+
85
+ action: str = Field(
86
+ description="Primary action: scrape, extract, calculate, submit, download, transcribe, analyze, visit"
87
+ )
88
+
89
+ description: str = Field(
90
+ description="Clear description of what to do in this step"
91
+ )
92
+
93
+ target: Optional[str] = Field(
94
+ default=None,
95
+ description="Target of the action (URL, field name, file, etc.)"
96
+ )
97
+
98
+ dependencies: List[int] = Field(
99
+ default_factory=list,
100
+ description="Step numbers this step depends on"
101
+ )
102
+
103
+
104
+ class UnifiedTaskAnalysis(BaseModel):
105
+ """
106
+ Unified analysis for task fetching
107
+ Combines redirect detection, submission URL extraction, and instruction parsing
108
+ """
109
+
110
+ # ========================================================================
111
+ # REDIRECT DETECTION
112
+ # ========================================================================
113
+ is_redirect: bool = Field(
114
+ description="True if this content redirects to another URL for the actual task"
115
+ )
116
+
117
+ question_url: Optional[str] = Field(
118
+ default=None,
119
+ description="URL to visit for the actual task (if is_redirect=True)"
120
+ )
121
+
122
+ redirect_reasoning: str = Field(
123
+ default="",
124
+ description="Why this is or isn't a redirect"
125
+ )
126
+
127
+ # ========================================================================
128
+ # SUBMISSION URL EXTRACTION
129
+ # ========================================================================
130
+ submission_url: Optional[str] = Field(
131
+ default=None,
132
+ description="URL where the final answer should be POSTed"
133
+ )
134
+
135
+ submission_url_is_relative: bool = Field(
136
+ default=False,
137
+ description="True if submission URL is relative and needs base URL resolution"
138
+ )
139
+
140
+ submission_reasoning: str = Field(
141
+ default="",
142
+ description="How the submission URL was identified"
143
+ )
144
+
145
+ # ========================================================================
146
+ # INSTRUCTION PARSING
147
+ # ========================================================================
148
+ instructions: List[InstructionStep] = Field(
149
+ default_factory=list,
150
+ description="Parsed step-by-step instructions (empty if redirect)"
151
+ )
152
+
153
+ overall_goal: str = Field(
154
+ description="High-level summary of what needs to be accomplished"
155
+ )
156
+
157
+ complexity: str = Field(
158
+ description="Task complexity: trivial, simple, moderate, complex"
159
+ )
160
+
161
+ # ========================================================================
162
+ # CONFIDENCE
163
+ # ========================================================================
164
+ confidence: float = Field(
165
+ ge=0.0,
166
+ le=1.0,
167
+ description="Overall confidence (0.0-1.0)"
168
+ )
169
  class TaskClassification(BaseModel):
170
  """
171
  Structured output for task classification
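
For illustration, a populated UnifiedTaskAnalysis for a simple redirect page might look like the sketch below; the values are made up, not produced by this commit.

analysis = UnifiedTaskAnalysis(
    is_redirect=True,
    question_url="https://example.com/actual-task",
    redirect_reasoning="Page only contains a link pointing to the real task",
    submission_url=None,
    submission_url_is_relative=False,
    submission_reasoning="No POST target found on the redirect page",
    instructions=[],          # left empty when is_redirect is True
    overall_goal="Follow the redirect to reach the real task",
    complexity="trivial",
    confidence=0.8,
)
assert analysis.is_redirect and not analysis.instructions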
app/services/__pycache__/task_processor.cpython-313.pyc CHANGED
Binary files a/app/services/__pycache__/task_processor.cpython-313.pyc and b/app/services/__pycache__/task_processor.cpython-313.pyc differ
 
app/services/task_fetcher.py CHANGED
@@ -1,42 +1,53 @@
1
  """
2
- Task Fetcher Service - Enhanced Version
3
- Fetches and extracts task descriptions from URLs with intelligent content detection
4
  """
5
 
6
  import httpx
7
  import json
8
- import base64
9
  import re
10
  from typing import Optional, Dict, Any, List
11
- from urllib.parse import urlparse, urljoin
12
  from bs4 import BeautifulSoup
13
 
14
  from app.core.config import settings
15
  from app.core.logging import get_logger
16
  from app.core.exceptions import TaskProcessingError
 
 
17
 
18
  logger = get_logger(__name__)
19
 
20
 
21
  class TaskFetcher:
22
  """
23
- Enhanced service for fetching and extracting task descriptions from URLs
24
- Handles multiple content types and detects special elements requiring processing
 
 
 
25
  """
26
 
27
  def __init__(self, timeout: int = 30):
28
- """
29
- Initialize TaskFetcher
30
-
31
- Args:
32
- timeout: Request timeout in seconds
33
- """
34
  self.timeout = timeout
35
- self.client = None
36
- logger.debug("TaskFetcher initialized")
37
 
38
  async def __aenter__(self):
39
- """Async context manager entry"""
40
  self.client = httpx.AsyncClient(
41
  timeout=self.timeout,
42
  follow_redirects=True,
@@ -48,795 +59,326 @@ class TaskFetcher:
48
  return self
49
 
50
  async def __aexit__(self, exc_type, exc_val, exc_tb):
51
- """Async context manager exit"""
52
  if self.client:
53
  await self.client.aclose()
54
-
55
- async def fetch_task(self, url: str) -> Dict[str, Any]:
 
 
 
 
56
  """
57
- Fetch and extract task description from URL with intelligent detection
58
 
59
- Args:
60
- url: URL to fetch task from
61
-
62
- Returns:
63
- Dict containing task information:
64
- {
65
- "task_description": str,
66
- "raw_content": str,
67
- "content_type": str,
68
- "url": str,
69
- "needs_llm_analysis": bool,
70
- "metadata": dict with special_elements, etc.
71
  }
72
-
73
- Raises:
74
- TaskProcessingError: If fetching or extraction fails
75
- """
76
- logger.info(f"πŸ“₯ Fetching task from URL: {url}")
77
 
78
- # Validate URL
79
  if not self._is_valid_url(url):
80
- logger.error(f"❌ Invalid URL format: {url}")
81
  raise TaskProcessingError(f"Invalid URL format: {url}")
82
 
83
  try:
84
- # Fetch content
85
  response = await self._fetch_url(url)
86
-
87
- # Detect content type
88
  content_type = self._detect_content_type(response)
89
- logger.info(f"πŸ“„ Content type detected: {content_type}")
90
-
91
- # Extract task based on content type
92
- task_info = await self._extract_task(response, content_type, url)
93
-
94
- # Determine if LLM analysis is needed
95
- task_info['needs_llm_analysis'] = self._needs_llm_analysis(task_info)
96
-
97
- if task_info['needs_llm_analysis']:
98
- logger.warning("πŸ€– Content requires LLM analysis for complete extraction")
99
 
100
- logger.info(f"βœ… Task fetched successfully")
101
- logger.debug(f"Task description length: {len(task_info['task_description'])} chars")
 
102
 
103
- return task_info
104
 
105
- except httpx.TimeoutException:
106
- logger.error(f"⏱️ Timeout fetching URL: {url}")
107
- raise TaskProcessingError(f"Request timeout for URL: {url}")
108
-
109
- except httpx.HTTPStatusError as e:
110
- logger.error(f"❌ HTTP error {e.response.status_code}: {url}")
111
- raise TaskProcessingError(
112
- f"HTTP {e.response.status_code} error fetching URL: {url}"
113
- )
114
-
115
- except Exception as e:
116
- logger.error(f"❌ Unexpected error fetching task: {str(e)}", exc_info=True)
117
- raise TaskProcessingError(f"Failed to fetch task from URL: {str(e)}")
118
-
119
- def _is_valid_url(self, url: str) -> bool:
120
- """
121
- Validate URL format
122
 
123
- Args:
124
- url: URL to validate
125
-
126
- Returns:
127
- bool: True if valid, False otherwise
128
- """
129
- try:
130
- result = urlparse(url)
131
- is_valid = all([result.scheme in ['http', 'https'], result.netloc])
132
-
133
- if not is_valid:
134
- logger.warning(f"Invalid URL structure: {url}")
135
-
136
- return is_valid
137
-
138
  except Exception as e:
139
- logger.warning(f"URL validation error: {str(e)}")
140
- return False
141
 
142
  async def _fetch_url(self, url: str) -> httpx.Response:
143
- """
144
- Fetch content from URL with retry logic
145
-
146
- Args:
147
- url: URL to fetch
148
-
149
- Returns:
150
- httpx.Response: HTTP response
151
-
152
- Raises:
153
- httpx.HTTPStatusError: If HTTP error occurs
154
- httpx.TimeoutException: If request times out
155
- """
156
- max_retries = settings.MAX_RETRIES
157
 
158
  for attempt in range(max_retries):
159
  try:
160
- logger.debug(f"Attempt {attempt + 1}/{max_retries} to fetch URL")
161
-
162
  response = await self.client.get(url)
163
  response.raise_for_status()
164
-
165
- logger.debug(
166
- f"βœ“ Fetch successful | Status: {response.status_code} | "
167
- f"Size: {len(response.content)} bytes"
168
- )
169
-
170
  return response
171
-
172
  except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
 
173
  if attempt == max_retries - 1:
174
- # Last attempt, raise error
175
  raise
176
-
177
- logger.warning(
178
- f"⚠️ Attempt {attempt + 1} failed: {str(e)} | Retrying..."
179
- )
180
  continue
181
-
182
- def _detect_content_type(self, response: httpx.Response) -> str:
183
- """
184
- Detect content type from response
185
-
186
- Args:
187
- response: HTTP response
188
-
189
- Returns:
190
- str: Content type (json, html, text, pdf, csv, etc.)
191
- """
192
- content_type_header = response.headers.get('content-type', '').lower()
193
-
194
- # Check content-type header first
195
- if 'application/json' in content_type_header:
196
- return 'json'
197
- elif 'text/html' in content_type_header:
198
- return 'html'
199
- elif 'application/pdf' in content_type_header:
200
- return 'pdf'
201
- elif 'text/csv' in content_type_header or 'application/csv' in content_type_header:
202
- return 'csv'
203
- elif 'audio/' in content_type_header:
204
- return 'audio'
205
- elif 'video/' in content_type_header:
206
- return 'video'
207
- elif 'image/' in content_type_header:
208
- return 'image'
209
-
210
- # Try to detect from content
211
- try:
212
- json.loads(response.text)
213
- return 'json'
214
- except:
215
- if '<html' in response.text.lower()[:100]:
216
- return 'html'
217
- return 'text'
218
-
219
- async def _extract_task(
220
- self,
221
- response: httpx.Response,
222
- content_type: str,
223
- url: str
224
- ) -> Dict[str, Any]:
225
- """
226
- Extract task description based on content type
227
-
228
- Args:
229
- response: HTTP response
230
- content_type: Detected content type
231
- url: Original URL
232
-
233
- Returns:
234
- Dict with task information
235
- """
236
- extractors = {
237
- 'json': self._extract_from_json,
238
- 'html': self._extract_from_html,
239
- 'text': self._extract_from_text,
240
- 'pdf': self._extract_from_binary,
241
- 'csv': self._extract_from_binary,
242
- 'audio': self._extract_from_binary,
243
- 'video': self._extract_from_binary,
244
- 'image': self._extract_from_binary,
245
- }
246
-
247
- extractor = extractors.get(content_type, self._extract_from_text)
248
-
249
- logger.debug(f"Using extractor: {extractor.__name__}")
250
-
251
- return await extractor(response, url)
252
-
253
- async def _extract_from_json(
254
- self,
255
- response: httpx.Response,
256
- url: str
257
- ) -> Dict[str, Any]:
258
- """
259
- Extract task from JSON response with base64 detection
260
-
261
- Args:
262
- response: HTTP response
263
- url: Original URL
264
-
265
- Returns:
266
- Dict with task information
267
- """
268
- logger.debug("Extracting task from JSON")
269
-
270
- try:
271
- data = response.json()
272
-
273
- # Try common field names for task description
274
- task_fields = [
275
- 'task', 'task_description', 'description',
276
- 'question', 'prompt', 'instruction',
277
- 'task_text', 'content', 'message', 'text'
278
- ]
279
-
280
- task_description = None
281
- found_field = None
282
-
283
- # Search for task description in JSON
284
- for field in task_fields:
285
- if field in data:
286
- task_description = str(data[field])
287
- found_field = field
288
- logger.debug(f"Found task in field: {field}")
289
- break
290
-
291
- # If not found in root, try nested
292
- if not task_description:
293
- task_description = self._search_nested_json(data, task_fields)
294
- found_field = "nested"
295
-
296
- # Fallback: use entire JSON as string
297
- if not task_description:
298
- logger.warning("No task field found, using entire JSON")
299
- task_description = json.dumps(data, indent=2)
300
- found_field = "full_json"
301
-
302
- # Check for base64 encoding
303
- original_description = task_description
304
- task_description = self._detect_and_decode_base64(task_description)
305
- was_base64_decoded = (original_description != task_description)
306
-
307
- return {
308
- 'task_description': task_description.strip(),
309
- 'raw_content': response.text,
310
- 'content_type': 'json',
311
- 'url': url,
312
- 'metadata': {
313
- 'json_structure': list(data.keys()) if isinstance(data, dict) else [],
314
- 'data_type': type(data).__name__,
315
- 'found_in_field': found_field,
316
- 'was_base64_decoded': was_base64_decoded,
317
- 'special_elements': {} # No special elements in JSON
318
- }
319
- }
320
-
321
- except json.JSONDecodeError as e:
322
- logger.error(f"Failed to parse JSON: {str(e)}")
323
- # Fallback to text extraction
324
- return await self._extract_from_text(response, url)
325
-
326
- def _search_nested_json(self, data: Any, fields: List[str], max_depth: int = 3) -> Optional[str]:
327
  """
328
- Recursively search for task description in nested JSON
329
-
330
- Args:
331
- data: JSON data to search
332
- fields: Field names to look for
333
- max_depth: Maximum recursion depth
334
-
335
- Returns:
336
- Task description if found, None otherwise
337
  """
338
- if max_depth <= 0:
339
- return None
340
 
341
- if isinstance(data, dict):
342
- for field in fields:
343
- if field in data:
344
- return str(data[field])
345
-
346
- # Search nested dicts
347
- for value in data.values():
348
- result = self._search_nested_json(value, fields, max_depth - 1)
349
- if result:
350
- return result
351
 
352
- elif isinstance(data, list) and len(data) > 0:
353
- # Search first item in list
354
- return self._search_nested_json(data[0], fields, max_depth - 1)
 
355
 
356
- return None
357
-
358
- async def _extract_from_html(
359
- self,
360
- response: httpx.Response,
361
- url: str
362
- ) -> Dict[str, Any]:
363
  """
364
- Extract task from HTML response with comprehensive element detection
365
-
366
- Args:
367
- response: HTTP response
368
- url: Original URL
369
-
370
- Returns:
371
- Dict with task information
372
  """
373
- logger.debug("Extracting task from HTML")
374
 
 
 
375
  try:
376
- soup = BeautifulSoup(response.text, 'html.parser')
377
-
378
- # FIRST: Detect special elements
379
- special_elements = self._detect_special_elements(soup, url)
380
- has_special = any(special_elements.values())
381
-
382
- if has_special:
383
- detected_types = [k for k, v in special_elements.items() if v]
384
- logger.info(f"πŸ” Detected special elements: {', '.join(detected_types)}")
385
-
386
- # Strategy 1: Look for common task containers
387
- task_selectors = [
388
- {'id': 'task'},
389
- {'id': 'question'},
390
- {'id': 'instruction'},
391
- {'id': 'quiz'},
392
- {'class_': 'task'},
393
- {'class_': 'question'},
394
- {'class_': 'instruction'},
395
- {'class_': 'task-description'},
396
- {'class_': 'quiz-question'},
397
- {'data-task': True}
398
- ]
399
-
400
- task_description = None
401
- extraction_method = None
402
-
403
- for selector in task_selectors:
404
- element = soup.find(**selector)
405
- if element:
406
- task_description = element.get_text(strip=True, separator=' ')
407
- extraction_method = f"selector_{list(selector.keys())[0]}"
408
- logger.debug(f"Found task using selector: {selector}")
409
- break
410
-
411
- # Strategy 2: Look for main content area
412
- if not task_description:
413
- main_content = (
414
- soup.find('main') or
415
- soup.find('article') or
416
- soup.find('div', class_='content') or
417
- soup.find('div', id='content') or
418
- soup.find('section', class_='main')
419
- )
420
-
421
- if main_content:
422
- task_description = main_content.get_text(strip=True, separator=' ')
423
- extraction_method = "main_content"
424
- logger.debug("Found task in main content area")
425
-
426
- # Strategy 3: Look for pre/code/textarea blocks (often contain base64 or instructions)
427
- if not task_description:
428
- code_blocks = soup.find_all(['pre', 'code', 'textarea'])
429
- if code_blocks:
430
- task_description = '\n'.join(
431
- block.get_text(strip=True) for block in code_blocks
432
- )
433
- extraction_method = "code_blocks"
434
- logger.debug(f"Found task in {len(code_blocks)} code blocks")
435
-
436
- # Strategy 4: Use body text (last resort)
437
- if not task_description:
438
- # Remove script and style tags
439
- for script in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
440
- script.decompose()
441
-
442
- task_description = soup.get_text(strip=True, separator=' ')
443
- extraction_method = "body_fallback"
444
- logger.warning("Using body text as task description")
445
 
446
- # Check for base64 encoding
447
- original_description = task_description
448
- task_description = self._detect_and_decode_base64(task_description)
449
- was_base64_decoded = (original_description != task_description)
 
 
450
 
451
- # Extract metadata
452
- title = soup.find('title')
453
- title_text = title.get_text(strip=True) if title else ''
454
 
455
- # Check for meta description
456
- meta_desc = soup.find('meta', attrs={'name': 'description'})
457
- meta_description = meta_desc.get('content', '') if meta_desc else ''
458
 
 
459
  return {
460
- 'task_description': task_description.strip(),
461
- 'raw_content': response.text,
462
- 'content_type': 'html',
463
- 'url': url,
464
- 'metadata': {
465
- 'title': title_text,
466
- 'meta_description': meta_description,
467
- 'extraction_method': extraction_method,
468
- 'has_forms': bool(soup.find('form')),
469
- 'has_tables': bool(soup.find('table')),
470
- 'was_base64_decoded': was_base64_decoded,
471
- 'special_elements': special_elements,
472
- 'page_size_kb': len(response.content) / 1024
473
- }
474
- }
475
-
476
- except Exception as e:
477
- logger.error(f"Failed to parse HTML: {str(e)}", exc_info=True)
478
- # Fallback to text extraction
479
- return await self._extract_from_text(response, url)
480
-
481
- def _detect_special_elements(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
482
- """
483
- Detect special elements that might contain or lead to the task
484
-
485
- Args:
486
- soup: BeautifulSoup parsed HTML
487
- base_url: Base URL for resolving relative URLs
488
-
489
- Returns:
490
- Dict of detected elements with absolute URLs
491
- """
492
- elements = {
493
- 'audio_urls': [],
494
- 'video_urls': [],
495
- 'image_urls': [],
496
- 'download_links': [],
497
- 'iframe_sources': [],
498
- 'external_links': [],
499
- 'form_actions': [],
500
- 'javascript_files': []
501
- }
502
-
503
- # Audio elements
504
- for audio in soup.find_all(['audio', 'source']):
505
- src = audio.get('src')
506
- if src:
507
- # Check for audio extensions or content type
508
- is_audio = any(ext in src.lower() for ext in ['.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac'])
509
- audio_type = audio.get('type', '')
510
- if is_audio or 'audio/' in audio_type:
511
- absolute_url = urljoin(base_url, src)
512
- elements['audio_urls'].append(absolute_url)
513
- logger.debug(f"Found audio: {absolute_url}")
514
-
515
- # Video elements
516
- for video in soup.find_all(['video', 'source']):
517
- src = video.get('src')
518
- if src:
519
- is_video = any(ext in src.lower() for ext in ['.mp4', '.webm', '.avi', '.mov', '.mkv'])
520
- video_type = video.get('type', '')
521
- if is_video or 'video/' in video_type:
522
- absolute_url = urljoin(base_url, src)
523
- elements['video_urls'].append(absolute_url)
524
- logger.debug(f"Found video: {absolute_url}")
525
-
526
- # YouTube/Vimeo iframes
527
- for iframe in soup.find_all('iframe'):
528
- src = iframe.get('src', '')
529
- if 'youtube.com' in src or 'vimeo.com' in src:
530
- elements['video_urls'].append(src)
531
- logger.debug(f"Found video iframe: {src}")
532
- elif src:
533
- absolute_url = urljoin(base_url, src)
534
- elements['iframe_sources'].append(absolute_url)
535
- logger.debug(f"Found iframe: {absolute_url}")
536
-
537
- # Image elements (might contain QR codes, screenshots with tasks, etc.)
538
- for img in soup.find_all('img'):
539
- src = img.get('src')
540
- if src:
541
- absolute_url = urljoin(base_url, src)
542
- # Only include if it looks like it might contain data (not decorative)
543
- alt_text = img.get('alt', '').lower()
544
- if any(keyword in alt_text for keyword in ['task', 'question', 'instruction', 'qr', 'code']):
545
- elements['image_urls'].append(absolute_url)
546
- logger.debug(f"Found relevant image: {absolute_url}")
547
-
548
- # Download links
549
- for link in soup.find_all('a', href=True):
550
- href = link.get('href', '')
551
- link_text = link.get_text().strip().lower()
552
-
553
- # Check for downloadable files
554
- download_extensions = [
555
- '.pdf', '.csv', '.xlsx', '.xls', '.zip', '.txt',
556
- '.json', '.xml', '.doc', '.docx', '.tsv'
557
- ]
558
-
559
- is_download = any(ext in href.lower() for ext in download_extensions)
560
-
561
- # Check for text indicating download or external task
562
- download_keywords = ['download', 'get task', 'click here', 'task file', 'see task']
563
- has_download_text = any(keyword in link_text for keyword in download_keywords)
564
-
565
- if is_download:
566
- absolute_url = urljoin(base_url, href)
567
- elements['download_links'].append(absolute_url)
568
- logger.debug(f"Found download link: {absolute_url}")
569
- elif has_download_text:
570
- absolute_url = urljoin(base_url, href)
571
- elements['external_links'].append(absolute_url)
572
- logger.debug(f"Found external link: {absolute_url}")
573
-
574
- # Forms (might need to submit to get task)
575
- for form in soup.find_all('form'):
576
- action = form.get('action')
577
- if action:
578
- absolute_url = urljoin(base_url, action)
579
- elements['form_actions'].append(absolute_url)
580
- logger.debug(f"Found form action: {absolute_url}")
581
-
582
- # JavaScript files (might load task dynamically)
583
- for script in soup.find_all('script', src=True):
584
- src = script.get('src')
585
- if src and not any(cdn in src for cdn in ['google', 'cdn', 'jquery']):
586
- absolute_url = urljoin(base_url, src)
587
- elements['javascript_files'].append(absolute_url)
588
-
589
- # Remove duplicates
590
- for key in elements:
591
- elements[key] = list(set(elements[key]))
592
-
593
- return elements
594
-
595
- async def _extract_from_text(
596
- self,
597
- response: httpx.Response,
598
- url: str
599
- ) -> Dict[str, Any]:
600
- """
601
- Extract task from plain text response with base64 detection
602
-
603
- Args:
604
- response: HTTP response
605
- url: Original URL
606
-
607
- Returns:
608
- Dict with task information
609
- """
610
- logger.debug("Extracting task from plain text")
611
-
612
- text = response.text.strip()
613
-
614
- # Check for base64 encoding
615
- original_text = text
616
- text = self._detect_and_decode_base64(text)
617
- was_base64_decoded = (original_text != text)
618
-
619
- return {
620
- 'task_description': text,
621
- 'raw_content': response.text,
622
- 'content_type': 'text',
623
- 'url': url,
624
- 'metadata': {
625
- 'length': len(text),
626
- 'lines': text.count('\n') + 1,
627
- 'was_base64_decoded': was_base64_decoded,
628
- 'special_elements': {}
629
  }
630
- }
631
-
632
- async def _extract_from_binary(
633
- self,
634
- response: httpx.Response,
635
- url: str
636
- ) -> Dict[str, Any]:
637
- """
638
- Handle binary content (PDF, audio, video, images, CSV)
639
- Returns info that LLM will need to process
640
-
641
- Args:
642
- response: HTTP response
643
- url: Original URL
644
-
645
- Returns:
646
- Dict with task information indicating processing needed
647
- """
648
- content_type = response.headers.get('content-type', 'unknown')
649
-
650
- logger.warning(f"⚠️ Binary content detected: {content_type}")
651
-
652
- task_description = f"Binary content detected. Type: {content_type}. URL: {url}. "
653
-
654
- if 'pdf' in content_type:
655
- task_description += "This is a PDF file that needs to be downloaded and parsed."
656
- elif 'audio' in content_type:
657
- task_description += "This is an audio file that needs to be transcribed."
658
- elif 'video' in content_type:
659
- task_description += "This is a video file that might need processing or transcription."
660
- elif 'image' in content_type:
661
- task_description += "This is an image that might need OCR or vision analysis."
662
- elif 'csv' in content_type:
663
- task_description += "This is a CSV file that needs to be downloaded and parsed."
664
- else:
665
- task_description += "This needs to be downloaded and processed."
666
-
667
- return {
668
- 'task_description': task_description,
669
- 'raw_content': '', # Don't include binary in raw_content
670
- 'content_type': content_type,
671
- 'url': url,
672
- 'metadata': {
673
- 'is_binary': True,
674
- 'content_length': len(response.content),
675
- 'requires_download': True,
676
- 'special_elements': {
677
- 'download_links': [url]
678
- }
679
- }
680
- }
681
-
682
- def _detect_and_decode_base64(self, text: str) -> str:
683
- """
684
- Detect and decode base64 content in text
685
-
686
- Args:
687
- text: Text that might contain base64
688
-
689
- Returns:
690
- Decoded text if base64 found, original text otherwise
691
- """
692
- # Pattern to detect base64 strings (at least 20 chars, typical base64 chars)
693
- # Must be fairly long to avoid false positives
694
- base64_pattern = r'([A-Za-z0-9+/]{40,}={0,2})'
695
-
696
- matches = re.findall(base64_pattern, text)
697
-
698
- if not matches:
699
- return text
700
-
701
- logger.debug(f"Found {len(matches)} potential base64 strings")
702
-
703
- decoded_parts = []
704
-
705
- for match in matches:
706
  try:
707
- # Try to decode
708
- decoded_bytes = base64.b64decode(match, validate=True)
709
- decoded_text = decoded_bytes.decode('utf-8', errors='ignore')
710
-
711
- # Check if decoded text is readable (not binary)
712
- if self._is_readable_text(decoded_text):
713
- logger.info(
714
- f"βœ“ Decoded base64 string "
715
- f"(length: {len(match)} β†’ {len(decoded_text)} chars)"
716
- )
717
- decoded_parts.append(decoded_text)
718
- else:
719
- logger.debug("Base64 decoded to binary/unreadable data")
720
-
721
  except Exception as e:
722
- logger.debug(f"Not valid base64: {str(e)}")
723
- continue
724
 
725
- # If we successfully decoded anything, return the best candidate
726
- if decoded_parts:
727
- # Use the longest decoded string as it's likely the main content
728
- result = max(decoded_parts, key=len)
729
- logger.info(f"βœ… Using decoded base64 content ({len(result)} chars)")
730
- return result
731
-
732
- return text
733
-
734
- def _is_readable_text(self, text: str, min_printable_ratio: float = 0.7) -> bool:
735
- """
736
- Check if decoded text is human-readable
737
-
738
- Args:
739
- text: Text to check
740
- min_printable_ratio: Minimum ratio of printable characters
741
-
742
- Returns:
743
- bool: True if text appears readable
744
- """
745
- if not text or len(text) < 10:
746
  return False
747
-
748
- # Count printable characters (letters, numbers, punctuation, spaces)
749
- printable_count = sum(c.isprintable() or c.isspace() for c in text)
750
- ratio = printable_count / len(text)
751
-
752
- # Also check for some common words to confirm it's text
753
- has_common_words = any(
754
- word in text.lower()
755
- for word in ['the', 'and', 'task', 'data', 'file', 'http']
756
  )
757
 
758
- return ratio >= min_printable_ratio and (ratio > 0.9 or has_common_words)
759
-
760
- def _needs_llm_analysis(self, task_info: Dict[str, Any]) -> bool:
761
- """
762
- Determine if fetched content needs LLM analysis to extract actual task
763
-
764
- Args:
765
- task_info: Fetched task information
766
-
767
- Returns:
768
- bool: True if LLM analysis needed
769
- """
770
- metadata = task_info.get('metadata', {})
771
- task_desc = task_info.get('task_description', '').lower()
772
 
773
- # Check 1: Binary content always needs LLM
774
- if metadata.get('is_binary'):
775
- logger.info("πŸ€– Binary content detected - LLM analysis required")
776
- return True
777
-
778
- # Check 2: Special elements present
779
- special_elements = metadata.get('special_elements', {})
780
- has_audio = bool(special_elements.get('audio_urls'))
781
- has_video = bool(special_elements.get('video_urls'))
782
- has_downloads = bool(special_elements.get('download_links'))
783
- has_iframes = bool(special_elements.get('iframe_sources'))
784
- has_images = bool(special_elements.get('image_urls'))
785
- has_forms = bool(special_elements.get('form_actions'))
786
-
787
- if any([has_audio, has_video, has_downloads, has_iframes, has_images, has_forms]):
788
- logger.info(
789
- f"πŸ€– Special elements detected - LLM analysis recommended "
790
- f"(audio:{has_audio}, video:{has_video}, downloads:{has_downloads}, "
791
- f"iframes:{has_iframes}, images:{has_images}, forms:{has_forms})"
792
  )
793
- return True
794
-
795
- # Check 3: Very short content (likely incomplete)
796
- if len(task_desc.strip()) < 30:
797
- logger.info("πŸ€– Very short content - LLM analysis recommended")
798
- return True
799
-
800
- # Check 4: Indirect language suggesting further action needed
801
- indirect_keywords = [
802
- 'click here', 'download', 'visit', 'listen to',
803
- 'watch', 'see attached', 'refer to', 'check the',
804
- 'navigate to', 'go to', 'follow the link',
805
- 'open the file', 'play the', 'view the'
806
- ]
807
-
808
- has_indirect_language = any(keyword in task_desc for keyword in indirect_keywords)
809
-
810
- if has_indirect_language:
811
- logger.info("πŸ€– Indirect language detected - LLM analysis recommended")
812
- return True
813
-
814
- # Check 5: Multiple URLs in content (might need to visit them)
815
- url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
816
- urls_in_content = re.findall(url_pattern, task_desc)
817
-
818
- if len(urls_in_content) > 1:
819
- logger.info(f"πŸ€– Multiple URLs found ({len(urls_in_content)}) - LLM analysis recommended")
820
- return True
821
-
822
- # Content seems straightforward
823
- logger.info("βœ“ Content appears straightforward - LLM analysis not required")
824
- return False
825
 
826
 
827
- # Convenience function for quick usage
828
- async def fetch_task_from_url(url: str) -> Dict[str, Any]:
829
- """
830
- Convenience function to fetch task from URL
831
-
832
- Args:
833
- url: URL to fetch task from
834
-
835
- Returns:
836
- Dict with task information
837
-
838
- Raises:
839
- TaskProcessingError: If fetching fails
840
- """
841
- async with TaskFetcher() as fetcher:
842
- return await fetcher.fetch_task(url)
 
1
  """
2
+ Task Fetcher Service - with Static/Dynamic Scraper fallback
3
+ Fetches and extracts task descriptions from URLs
4
  """
5
 
6
  import httpx
7
  import json
 
8
  import re
9
  from typing import Optional, Dict, Any, List
10
+ from urllib.parse import urlparse
11
  from bs4 import BeautifulSoup
12
 
13
  from app.core.config import settings
14
  from app.core.logging import get_logger
15
  from app.core.exceptions import TaskProcessingError
16
+ from app.utils.llm_client import get_llm_client
17
+ from app.utils.prompts import AnalysisPrompts
18
 
19
  logger = get_logger(__name__)
20
 
21
 
22
  class TaskFetcher:
23
  """
24
+ Enhanced service for fetching and extracting task descriptions from URLs.
25
+ Strategy:
26
+ 1. httpx (fast)
27
+ 2. If content looks JS-only/empty β†’ DynamicScraper
28
+ 3. Let orchestrator use Static/Dynamic scrapers later for real data pages
29
  """
30
 
31
  def __init__(self, timeout: int = 30):
32
  self.timeout = timeout
33
+ self.client: Optional[httpx.AsyncClient] = None
34
+ self.llm_client = get_llm_client()
35
+
36
+ # Import here to avoid circular imports
37
+ from app.orchestrator.models import UnifiedTaskAnalysis
38
+
39
+ self._content_analyzer_agent = self.llm_client.create_agent(
40
+ output_type=UnifiedTaskAnalysis,
41
+ system_prompt=(
42
+ "You are an expert at analyzing task content. "
43
+ "You detect redirects, extract submission URLs, and parse instructions."
44
+ ),
45
+ retries=2
46
+ )
47
+
48
+ logger.debug("TaskFetcher initialized with unified LLM analysis")
49
 
50
  async def __aenter__(self):
 
51
  self.client = httpx.AsyncClient(
52
  timeout=self.timeout,
53
  follow_redirects=True,
 
59
  return self
60
 
61
  async def __aexit__(self, exc_type, exc_val, exc_tb):
 
62
  if self.client:
63
  await self.client.aclose()
64
+
65
+ # ======================================================================
66
+ # PUBLIC ENTRY POINT
67
+ # ======================================================================
68
+
69
+ async def fetch_and_analyze(self, url: str, base_url: Optional[str] = None) -> Dict[str, Any]:
70
  """
71
+ Fetch URL and perform unified LLM analysis.
72
+ 1. httpx + basic extraction
73
+ 2. If JS-only / empty β†’ DynamicScraper
74
+ 3. LLM: redirect + submission_url + instructions
75
+ """
76
+ logger.info(f"πŸ“₯ Fetching and analyzing URL: {url}")
77
+ if base_url is None:
78
+ base_url = url
79
 
80
+ # Step 1: Fetch visible content (with fallback)
81
+ content = await self._fetch_content(url)
82
+
83
+ logger.debug(f"Task description length after fetch: {len(content['task_description'])}")
84
+
85
+ # Step 2: Unified LLM analysis
86
+ analysis = await self._analyze_content_with_llm(
87
+ task_description=content['task_description'],
88
+ raw_content=content['raw_content'],
89
+ url=url,
90
+ base_url=base_url
91
+ )
92
+
93
+ # Merge content + analysis
94
+ result = {
95
+ **content,
96
+ 'is_redirect': analysis.is_redirect,
97
+ 'question_url': analysis.question_url,
98
+ 'submission_url': analysis.submission_url,
99
+ 'instructions': self._format_instructions(analysis.instructions),
100
+ 'overall_goal': analysis.overall_goal,
101
+ 'complexity': analysis.complexity,
102
+ 'llm_analysis': {
103
+ 'redirect_reasoning': analysis.redirect_reasoning,
104
+ 'submission_reasoning': analysis.submission_reasoning,
105
+ 'confidence': analysis.confidence,
106
  }
107
+ }
 
 
 
 
108
 
109
+ # Resolve relative submission URL if needed
110
+ if analysis.submission_url and analysis.submission_url_is_relative:
111
+ absolute = str(httpx.URL(base_url).join(analysis.submission_url))
112
+ logger.info(f"βœ“ Resolved relative submission URL: {analysis.submission_url} β†’ {absolute}")
113
+ result['submission_url'] = absolute
114
+
115
+ # Resolve relative question URL if needed
116
+ if analysis.question_url and analysis.question_url.startswith('/'):
117
+ absolute_q = str(httpx.URL(base_url).join(analysis.question_url))
118
+ logger.info(f"βœ“ Resolved relative question URL: {analysis.question_url} β†’ {absolute_q}")
119
+ result['question_url'] = absolute_q
120
+
121
+ logger.info("βœ… Analysis complete:")
122
+ logger.info(f" Is Redirect: {result['is_redirect']}")
123
+ logger.info(f" Submission URL: {result['submission_url']}")
124
+ logger.info(f" Instructions: {len(result['instructions'])} steps")
125
+ logger.info(f" Complexity: {result['complexity']}")
126
+
127
+ return result
128
+
129
+ # ======================================================================
130
+ # FETCHING WITH FALLBACK TO DYNAMIC SCRAPER
131
+ # ======================================================================
132
+
133
+ async def _fetch_content(self, url: str) -> Dict[str, Any]:
134
+ """
135
+ Fetch content from URL.
136
+ - Try httpx first
137
+ - If JS-only/empty β†’ fallback to DynamicScraper
138
+ """
139
  if not self._is_valid_url(url):
 
140
  raise TaskProcessingError(f"Invalid URL format: {url}")
141
 
142
  try:
 
143
  response = await self._fetch_url(url)
 
 
144
  content_type = self._detect_content_type(response)
145
 
146
+ # Basic extraction
147
+ task_description = await self._extract_basic_content(response, content_type)
148
+ raw_content = response.text[:5000]
149
 
150
+ # Heuristic: if nothing useful, try dynamic scraper
151
+ if self._looks_js_only(task_description, raw_content):
152
+ logger.warning("⚠️ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
153
+ dyn = await self._fetch_with_dynamic_scraper(url)
154
+ task_description = dyn['task_description']
155
+ raw_content = dyn['raw_content']
156
 
157
+ return {
158
+ 'task_description': task_description,
159
+ 'raw_content': raw_content,
160
+ 'content_type': content_type,
161
+ 'url': url,
162
+ 'metadata': {
163
+ 'content_length': len(response.content),
164
+ 'status_code': response.status_code,
165
+ }
166
+ }
167
168
  except Exception as e:
169
+ logger.error(f"❌ Failed to fetch content: {e}", exc_info=True)
170
+ raise TaskProcessingError(f"Failed to fetch URL: {str(e)}")
171
 
172
  async def _fetch_url(self, url: str) -> httpx.Response:
173
+ """Fetch with httpx and retries."""
174
+ max_retries = getattr(settings, "MAX_RETRIES", 3)
175
 
176
  for attempt in range(max_retries):
177
  try:
178
+ logger.debug(f"HTTPX fetch attempt {attempt + 1}/{max_retries} for {url}")
 
179
  response = await self.client.get(url)
180
  response.raise_for_status()
181
  return response
 
182
  except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
183
+ logger.warning(f"Attempt {attempt + 1} failed: {e}")
184
  if attempt == max_retries - 1:
 
185
  raise
 
 
 
 
186
  continue
187
+
188
+ def _looks_js_only(self, task_description: str, html: str) -> bool:
189
  """
190
+ Detect JS-only / empty pages that need dynamic rendering.
191
+ - Empty or tiny text
192
+ - Has <script> that uses atob/innerHTML/URLSearchParams
193
  """
194
+ if task_description and len(task_description.strip()) > 50:
195
+ return False
196
 
197
+ # Strong JS signals
198
+ js_markers = ['atob(', 'innerHTML', 'URLSearchParams', 'document.querySelector']
199
+ if any(marker in html for marker in js_markers):
200
+ return True
201
 
202
+ # Very little visible text after stripping scripts
203
+ cleaned = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
204
+ if len(cleaned.strip()) < 100:
205
+ return True
206
 
207
+ return False
208
+
209
+ async def _fetch_with_dynamic_scraper(self, url: str) -> Dict[str, Any]:
 
 
 
 
210
  """
211
+ Use DynamicScraper to render the page and extract visible text
212
+ for instruction pages.
213
  """
214
+ from app.modules.scrapers.dynamic_scraper import DynamicScraper
215
 
216
+ scraper = DynamicScraper(use_pool=True)
217
+ await scraper.initialize()
218
  try:
219
+ # Auto-extract text blocks
220
+ result = await scraper.scrape_url(url)
221
+ if not result.success:
222
+ raise RuntimeError(result.error or "Dynamic scraping failed")
223
 
224
+ # DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
225
+ texts: List[str] = []
226
+ if isinstance(result.data, list):
227
+ for row in result.data:
228
+ if isinstance(row, dict) and 'text' in row:
229
+ texts.append(str(row['text']))
230
 
231
+ task_text = "\n".join(texts) if texts else ""
 
 
232
 
233
+ logger.info(f"βœ“ Got {len(texts)} text blocks via DynamicScraper")
 
 
234
 
235
+ # Best-effort raw_content: you could extend DynamicScraper to return page.content()
236
  return {
237
+ 'task_description': task_text,
238
+ 'raw_content': task_text[:5000], # at least something readable
239
  }
240
+ finally:
241
+ await scraper.cleanup()
242
+
243
+ # ======================================================================
244
+ # BASIC EXTRACTION (NO LLM)
245
+ # ======================================================================
246
+
247
+ async def _extract_basic_content(self, response: httpx.Response, content_type: str) -> str:
248
+ """Fast, no-JS extraction for instruction pages."""
249
+ if content_type == 'json':
250
+ try:
251
+ data = response.json()
252
+ for field in ['task', 'description', 'question', 'content', 'text']:
253
+ if isinstance(data, dict) and field in data:
254
+ return str(data[field])
255
+ return json.dumps(data)
256
+ except Exception:
257
+ return response.text
258
+
259
+ if content_type == 'html':
260
  try:
261
+ html = response.text
262
+ soup = BeautifulSoup(html, 'html.parser')
263
+ for script in soup(['script', 'style', 'nav', 'header', 'footer']):
264
+ script.decompose()
265
+ text = soup.get_text(strip=True, separator=' ')
266
+ return text
267
  except Exception as e:
268
+ logger.error(f"HTML basic extraction failed: {e}")
269
+ return response.text
270
 
271
+ return response.text
272
+
273
+ def _detect_content_type(self, response: httpx.Response) -> str:
274
+ ct = response.headers.get('content-type', '').lower()
275
+ if 'application/json' in ct:
276
+ return 'json'
277
+ if 'text/html' in ct or '<html' in response.text.lower()[:200]:
278
+ return 'html'
279
+ return 'text'
280
+
281
+ def _is_valid_url(self, url: str) -> bool:
282
+ try:
283
+ r = urlparse(url)
284
+ return r.scheme in ('http', 'https') and bool(r.netloc)
285
+ except Exception:
286
  return False
287
+
288
+ # ======================================================================
289
+ # LLM ANALYSIS
290
+ # ======================================================================
291
+
292
+ async def _analyze_content_with_llm(
293
+ self,
294
+ task_description: str,
295
+ raw_content: str,
296
+ url: str,
297
+ base_url: str
298
+ ):
299
+ """Unified LLM analysis."""
300
+ logger.info("πŸ€– Running unified LLM analysis...")
301
+
302
+ url_pattern = r'https?://[^\s<>"\']+(?:/[^\s<>"\']*)?'
303
+ all_urls = re.findall(url_pattern, task_description + raw_content[:1000])
304
+ all_urls = list({u.rstrip('.,;:)') for u in all_urls})
305
+
306
+ prompt = AnalysisPrompts.unified_content_analysis_prompt(
307
+ task_description=task_description[:2000],
308
+ found_urls=all_urls,
309
+ current_url=url,
310
+ base_url=base_url
311
  )
312
 
313
+ from app.orchestrator.models import UnifiedTaskAnalysis
 
 
 
 
 
314
 
315
+ try:
316
+ analysis: UnifiedTaskAnalysis = await self.llm_client.run_agent(
317
+ self._content_analyzer_agent,
318
+ prompt
 
 
 
 
319
  )
320
+ return analysis
321
+ except Exception as e:
322
+ logger.error(f"❌ LLM analysis failed: {e}", exc_info=True)
323
+ return self._fallback_analysis(task_description, all_urls, url, base_url)
 
 
 
 
324
 
325
+ def _fallback_analysis(
326
+ self,
327
+ task_description: str,
328
+ all_urls: List[str],
329
+ url: str,
330
+ base_url: str
331
+ ):
332
+ """Very simple fallback if LLM fails."""
333
+ from app.orchestrator.models import UnifiedTaskAnalysis, InstructionStep
334
+
335
+ logger.warning("⚠️ Using fallback pattern-based analysis")
336
+
337
+ is_redirect = False
338
+ submission_url = None
339
+
340
+ for pattern in [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']:
341
+ m = re.search(pattern, task_description, re.IGNORECASE)
342
+ if m:
343
+ submission_url = m.group(1).rstrip('.,;:)')
344
+ break
345
+
346
+ sentences = re.split(r'[.;\n]', task_description)
347
+ instructions = []
348
+ step = 1
349
+ for s in sentences:
350
+ s = s.strip()
351
+ if len(s) > 5:
352
+ instructions.append(InstructionStep(
353
+ step_number=step,
354
+ action='unknown',
355
+ description=s,
356
+ target=None,
357
+ dependencies=[]
358
+ ))
359
+ step += 1
360
+
361
+ return UnifiedTaskAnalysis(
362
+ is_redirect=is_redirect,
363
+ question_url=None,
364
+ redirect_reasoning="Fallback: no redirect detection",
365
+ submission_url=submission_url,
366
+ submission_url_is_relative=submission_url.startswith('/') if submission_url else False,
367
+ submission_reasoning="Fallback: simple regex match",
368
+ instructions=instructions,
369
+ overall_goal="Unknown (fallback)",
370
+ complexity="unknown",
371
+ confidence=0.3
372
+ )
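The fallback depends entirely on the two regex patterns above; a small check on an invented instruction sentence shows what they capture:

import re

patterns = [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']
task_description = "Find the secret code and POST to https://example.com/submit."

submission_url = None
for pattern in patterns:
    m = re.search(pattern, task_description, re.IGNORECASE)
    if m:
        # Same trailing-punctuation cleanup as the fallback above.
        submission_url = m.group(1).rstrip('.,;:)')
        break

print(submission_url)  # https://example.com/submit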
373
 
374
+ def _format_instructions(self, steps) -> List[Dict[str, Any]]:
375
+ return [
376
+ {
377
+ 'step': s.step_number,
378
+ 'action': s.action,
379
+ 'text': s.description,
380
+ 'target': s.target,
381
+ 'dependencies': s.dependencies,
382
+ }
383
+ for s in steps
384
+ ]
 
 
 
 
 
app/services/task_processor.py CHANGED
@@ -1,24 +1,25 @@
1
  """
2
  Task Processing Service
3
- Complete orchestration using OrchestratorEngine (Steps 1-8)
4
  """
5
 
6
- from typing import Dict, Any
7
-
8
  from app.models.request import TaskRequest
9
  from app.core.logging import get_logger
10
  from app.core.exceptions import TaskProcessingError
11
  from app.orchestrator.orchestrator_engine import OrchestratorEngine
12
  from app.modules.registry import ModuleRegistry
13
  from app.modules.mock_modules import register_mock_modules
 
 
14
 
15
  logger = get_logger(__name__)
16
 
17
 
18
  class TaskProcessor:
19
  """
20
- Service class for processing tasks
21
- Uses complete orchestration engine (Steps 1-8)
22
  """
23
 
24
  def __init__(
@@ -27,74 +28,177 @@ class TaskProcessor:
27
  enable_actions: bool = True,
28
  auto_register_modules: bool = True
29
  ):
30
- """
31
- Initialize task processor
32
-
33
- Args:
34
- enable_decomposition: Enable complex task decomposition
35
- enable_actions: Enable action execution (downloads, transcription, OCR)
36
- auto_register_modules: Auto-register mock modules
37
- """
38
- logger.info("πŸš€ Initializing TaskProcessor with OrchestratorEngine")
39
 
40
- # Setup module registry
41
  self.registry = ModuleRegistry()
 
42
 
43
  if auto_register_modules:
44
- logger.info("πŸ“¦ Registering mock modules...")
45
  register_mock_modules(self.registry)
46
  logger.info(f"βœ“ Registered {len(self.registry.get_all_modules())} modules")
47
 
48
- # Initialize orchestrator engine (Steps 1-8)
49
  self.orchestrator = OrchestratorEngine(
50
  module_registry=self.registry,
51
  enable_decomposition=enable_decomposition,
52
  enable_actions=enable_actions
53
  )
54
 
55
- logger.info("βœ… TaskProcessor initialized with complete orchestration")
56
 
57
  async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
58
  """
59
- Process a task based on the provided data
60
- Uses complete orchestration pipeline (Steps 1-8)
61
 
62
- Args:
63
- task_data: Validated task request
64
-
65
- Returns:
66
- Dict containing complete task results
67
-
68
- Raises:
69
- TaskProcessingError: If processing fails
70
  """
 
71
  logger.info(f"πŸ”„ Processing task for: {task_data.email}")
72
- logger.info(f"πŸ“‹ Task URL: {task_data.url}")
 
 
 
 
 
73
 
74
  try:
75
- # Execute complete orchestration
 
 
 
 
76
  logger.info("=" * 80)
77
- logger.info("EXECUTING COMPLETE ORCHESTRATION PIPELINE (Steps 1-8)")
 
 
 
 
 
 
78
  logger.info("=" * 80)
79
 
80
  orchestration_result = await self.orchestrator.execute_task(
81
- task_input=str(task_data.url),
82
- task_url=str(task_data.url),
83
- context={'email': task_data.email}
 
 
 
 
 
 
 
 
84
  )
85
 
86
- # Build response
87
- result = self._build_response(
88
- task_data=task_data,
89
- orchestration_result=orchestration_result
90
- )
91
 
92
- logger.info("=" * 80)
93
- logger.info(f"βœ… Task processing completed | Duration: {orchestration_result['duration']:.2f}s")
 
 
 
94
  logger.info("=" * 80)
95
 
96
- return result
97
-
 
 
 
 
 
 
 
98
  except TaskProcessingError:
99
  # Re-raise task processing errors
100
  raise
@@ -103,117 +207,158 @@ class TaskProcessor:
103
  logger.error(f"❌ Task processing failed: {str(e)}", exc_info=True)
104
  raise TaskProcessingError(f"Failed to process task: {str(e)}")
105
 
 
 
 
 
 
 
 
106
  def _build_response(
107
  self,
108
  task_data: TaskRequest,
109
- orchestration_result: Dict[str, Any]
 
 
 
 
 
 
110
  ) -> Dict[str, Any]:
111
  """
112
- Build API response from orchestration result
113
 
114
  Args:
115
  task_data: Original task request
 
 
 
 
116
  orchestration_result: Result from orchestrator
 
 
117
 
118
  Returns:
119
- Dict: Formatted response
120
  """
121
- # Extract key information from orchestration
122
- classification = orchestration_result.get('execution_details', {}).get('classification')
123
- parameters = orchestration_result.get('execution_details', {}).get('parameters')
 
124
 
125
- # Base response
126
- response = {
127
- 'status': 'completed' if orchestration_result['success'] else 'failed',
 
 
 
128
  'email': task_data.email,
129
- 'task_url': str(task_data.url),
130
- 'task_id': orchestration_result['task_id'],
131
- 'execution_id': orchestration_result['execution_id'],
132
- 'duration': orchestration_result['duration'],
133
- 'strategy': orchestration_result.get('strategy', 'unknown'),
134
- 'success': orchestration_result['success']
135
- }
136
-
137
- # Add execution details
138
- if orchestration_result['success']:
139
- response['result'] = {
140
- 'data': orchestration_result.get('data'),
141
- 'execution_details': orchestration_result.get('execution_details', {}),
142
- 'steps_completed': orchestration_result.get('steps', {})
143
- }
144
 
145
- # Add classification if available
146
- if classification:
147
- response['classification'] = self._format_classification(classification)
 
 
 
 
 
 
148
 
149
- # Add parameters if available
150
- if parameters:
151
- response['parameters'] = self._format_parameters(parameters)
 
 
 
 
 
152
 
153
- response['message'] = 'Task executed successfully through complete orchestration pipeline'
 
 
 
 
 
 
154
 
155
- else:
156
- response['error'] = orchestration_result.get('error', 'Unknown error')
157
- response['message'] = 'Task execution failed'
158
 
159
- # Add execution log
160
- response['execution_log'] = orchestration_result.get('execution_log', [])
 
161
 
162
- return response
163
-
164
- def _format_classification(self, classification: Any) -> Dict[str, Any]:
165
- """Format classification for API response"""
166
- try:
167
- return {
168
- 'primary_task': classification.primary_task.value,
169
- 'secondary_tasks': [t.value for t in classification.secondary_tasks],
170
- 'complexity': classification.complexity.value,
171
- 'estimated_steps': classification.estimated_steps,
172
- 'requires_javascript': classification.requires_javascript,
173
- 'requires_authentication': classification.requires_authentication,
174
- 'requires_external_data': classification.requires_external_data,
175
- 'output_format': classification.output_format.value,
176
- 'confidence': classification.confidence,
177
- 'reasoning': classification.reasoning,
178
- 'key_entities': classification.key_entities,
179
- 'suggested_tools': classification.suggested_tools
180
- }
181
- except Exception as e:
182
- logger.warning(f"Could not format classification: {e}")
183
- return {'error': 'Classification format error'}
184
 
185
- def _format_parameters(self, parameters: Any) -> Dict[str, Any]:
186
- """Format parameters for API response"""
187
- try:
188
- return {
189
- 'data_sources': [
190
- {
191
- 'type': ds.type,
192
- 'location': ds.location,
193
- 'format': ds.format,
194
- 'description': ds.description
195
- }
196
- for ds in parameters.data_sources
197
- ],
198
- 'filters': [
199
- {
200
- 'field': f.field,
201
- 'operator': f.operator,
202
- 'value': f.value,
203
- 'description': f.description
204
- }
205
- for f in parameters.filters
206
- ],
207
- 'columns': [col.name for col in parameters.columns],
208
- 'aggregations': len(parameters.aggregations),
209
- 'visualizations': len(parameters.visualizations),
210
- 'output_format': parameters.output.format if parameters.output else None,
211
- 'confidence': parameters.confidence,
212
- 'complexity_score': parameters.complexity_score
213
- }
214
- except Exception as e:
215
- logger.warning(f"Could not format parameters: {e}")
216
- return {'error': 'Parameters format error'}
217
 
218
  def get_registry(self) -> ModuleRegistry:
219
  """Get module registry for adding/removing modules"""
@@ -223,7 +368,16 @@ class TaskProcessor:
223
  """Get orchestrator engine for advanced usage"""
224
  return self.orchestrator
225
 
226
- def cleanup(self):
 
 
 
 
227
  """Clean up resources"""
228
- self.orchestrator.cleanup()
229
- logger.info("TaskProcessor cleanup complete")
 
 
 
 
 
 
1
  """
2
  Task Processing Service
3
+ Simplified with unified LLM analysis in task_fetcher
4
  """
5
 
6
+ from typing import Dict, Any, Optional
 
7
  from app.models.request import TaskRequest
8
  from app.core.logging import get_logger
9
  from app.core.exceptions import TaskProcessingError
10
  from app.orchestrator.orchestrator_engine import OrchestratorEngine
11
  from app.modules.registry import ModuleRegistry
12
  from app.modules.mock_modules import register_mock_modules
13
+ from app.services.task_fetcher import TaskFetcher
14
+ # from app.orchestrator.answer_submitter import AnswerSubmitter  # TODO: re-enable once answer submission (Steps 5-6) is restored
15
 
16
  logger = get_logger(__name__)
17
 
18
 
19
  class TaskProcessor:
20
  """
21
+ Service class for processing TDS quiz tasks
22
+ Uses unified LLM analysis from task_fetcher
23
  """
24
 
25
  def __init__(
 
28
  enable_actions: bool = True,
29
  auto_register_modules: bool = True
30
  ):
31
+ """Initialize task processor"""
32
+ logger.info("πŸš€ Initializing TaskProcessor")
 
 
 
 
 
 
 
33
 
34
+ # Setup components
35
  self.registry = ModuleRegistry()
36
+ # self.answer_submitter = AnswerSubmitter()  # TODO: re-enable once answer submission is restored
37
 
38
  if auto_register_modules:
39
+ logger.info("πŸ“¦ Registering modules...")
40
  register_mock_modules(self.registry)
41
  logger.info(f"βœ“ Registered {len(self.registry.get_all_modules())} modules")
42
 
43
+ # Initialize orchestrator engine
44
  self.orchestrator = OrchestratorEngine(
45
  module_registry=self.registry,
46
  enable_decomposition=enable_decomposition,
47
  enable_actions=enable_actions
48
  )
49
 
50
+ logger.info("βœ… TaskProcessor initialized")
51
 
52
  async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
53
  """
54
+ Process TDS quiz task
 
55
 
56
+ Flow:
57
+ 1. Fetch and analyze Request URL (unified LLM call in task_fetcher)
58
+ 2. If redirect, fetch Question URL (unified LLM call in task_fetcher)
59
+ 3. Execute orchestration
60
+ 4. Extract answer
61
+ 5. Submit to TDS
62
+ 6. Build response
 
63
  """
64
+ logger.info("=" * 80)
65
  logger.info(f"πŸ”„ Processing task for: {task_data.email}")
66
+ logger.info(f"πŸ“‹ Request URL: {task_data.url}")
67
+ logger.info("=" * 80)
68
+
69
+ request_url = str(task_data.url)
70
+ question_url = None
71
+ submission_url = None
72
 
73
  try:
74
+ # ===================================================================
75
+ # STEP 1: FETCH AND ANALYZE REQUEST URL (UNIFIED LLM CALL)
76
+ # ===================================================================
77
+ logger.info("\n" + "=" * 80)
78
+ logger.info("STEP 1: FETCHING & ANALYZING REQUEST URL")
79
  logger.info("=" * 80)
80
+
81
+ async with TaskFetcher() as fetcher:
82
+ analysis = await fetcher.fetch_and_analyze(url=request_url)
83
+
84
+ logger.info(f"βœ“ Request URL analyzed")
85
+ logger.info(f" Is Redirect: {analysis['is_redirect']}")
86
+ logger.info(f" Complexity: {analysis['complexity']}")
87
+
88
+ # ===================================================================
89
+ # STEP 2: IF REDIRECT, FETCH QUESTION URL (UNIFIED LLM CALL)
90
+ # ===================================================================
91
+ if analysis['is_redirect'] and analysis['question_url']:
92
+ logger.info("\n" + "=" * 80)
93
+ logger.info("STEP 2: FETCHING & ANALYZING QUESTION URL")
94
+ logger.info("=" * 80)
95
+
96
+ question_url = analysis['question_url']
97
+ logger.info(f"πŸ”— Detected redirect to: {question_url}")
98
+
99
+ async with TaskFetcher() as fetcher:
100
+ analysis = await fetcher.fetch_and_analyze(
101
+ url=question_url,
102
+ base_url=request_url # For resolving relative URLs
103
+ )
104
+
105
+ logger.info(f"βœ“ Question URL analyzed")
106
+ logger.info(f" Task: {analysis['task_description'][:100]}...")
107
+ else:
108
+ question_url = request_url
109
+ logger.info(f"βœ“ Request URL contains actual task (no redirect)")
110
+
111
+ # Extract key information
112
+ task_description = analysis['task_description']
113
+ submission_url = analysis.get('submission_url')
114
+ instructions = analysis.get('instructions', [])
115
+
116
+ # Log URL hierarchy
117
+ logger.info("\nπŸ“ URL Hierarchy:")
118
+ logger.info(f" Request URL: {request_url}")
119
+ logger.info(f" Question URL: {question_url}")
120
+ logger.info(f" Submission URL: {submission_url}")
121
+ logger.info(f" Instructions: {len(instructions)} steps")
122
+
123
+ # ===================================================================
124
+ # STEP 3: EXECUTE ORCHESTRATION
125
+ # ===================================================================
126
+ logger.info("\n" + "=" * 80)
127
+ logger.info("STEP 3: EXECUTING ORCHESTRATION")
128
  logger.info("=" * 80)
129
 
130
  orchestration_result = await self.orchestrator.execute_task(
131
+ task_input=task_description,
132
+ task_url=question_url,
133
+ context={
134
+ 'email': task_data.email,
135
+ 'request_url': request_url,
136
+ 'question_url': question_url,
137
+ 'submission_url': submission_url,
138
+ 'instructions': instructions,
139
+ 'complexity': analysis['complexity'],
140
+ 'overall_goal': analysis['overall_goal']
141
+ }
142
  )
143
 
144
+ logger.info(f"βœ“ Orchestration completed")
145
+ logger.info(f" Success: {orchestration_result['success']}")
146
+ logger.info(f" Duration: {orchestration_result['duration']:.2f}s")
 
 
147
 
148
+ # ===================================================================
149
+ # STEP 4: EXTRACT ANSWER
150
+ # ===================================================================
151
+ logger.info("\n" + "=" * 80)
152
+ logger.info("STEP 4: EXTRACTING ANSWER")
153
  logger.info("=" * 80)
154
 
155
+ answer = self._extract_answer(orchestration_result)
156
+ logger.info(f"βœ“ Answer extracted: {str(answer)[:200]}")
157
+
158
+ # # ===================================================================
159
+ # # STEP 5: SUBMIT ANSWER TO TDS
160
+ # # ===================================================================
161
+ # logger.info("\n" + "=" * 80)
162
+ # logger.info("STEP 5: SUBMITTING ANSWER TO TDS")
163
+ # logger.info("=" * 80)
164
+
165
+ # submission_result = await self.answer_submitter.submit_answer(
166
+ # email=task_data.email,
167
+ # secret=task_data.secret,
168
+ # url=question_url, # βœ… Use Question URL, not Request URL
169
+ # answer=answer,
170
+ # submission_url=submission_url
171
+ # )
172
+
173
+ # logger.info(f"βœ“ Submission completed")
174
+ # logger.info(f" Success: {submission_result['success']}")
175
+ # logger.info(f" Status Code: {submission_result.get('status_code')}")
176
+
177
+ # if submission_result.get('response'):
178
+ # logger.info(f" Response: {submission_result['response']}")
179
+
180
+ # # ===================================================================
181
+ # # STEP 6: BUILD RESPONSE
182
+ # # ===================================================================
183
+ # result = self._build_response(
184
+ # task_data=task_data,
185
+ # request_url=request_url,
186
+ # question_url=question_url,
187
+ # submission_url=submission_url,
188
+ # analysis=analysis, # βœ… Fixed: pass analysis, not task_content
189
+ # orchestration_result=orchestration_result,
190
+ # submission_result=submission_result,
191
+ # answer=answer
192
+ # )
193
+
194
+ # logger.info("\n" + "=" * 80)
195
+ # logger.info(f"βœ… TASK COMPLETED SUCCESSFULLY")
196
+ # logger.info(f" Total Duration: {orchestration_result['duration']:.2f}s")
197
+ # logger.info(f" Answer Submitted: {submission_result['success']}")
198
+ # logger.info("=" * 80)
199
+
200
+ # return result # βœ… Fixed: actually return the result
201
+ return  # NOTE: Steps 5-6 (submission and response build) are commented out above, so nothing is returned yet
202
  except TaskProcessingError:
203
  # Re-raise task processing errors
204
  raise
 
207
  logger.error(f"❌ Task processing failed: {str(e)}", exc_info=True)
208
  raise TaskProcessingError(f"Failed to process task: {str(e)}")
209
 
210
+ def _extract_answer(self, orchestration_result: Dict[str, Any]) -> Any:
211
+ """
212
+ Extract final answer from orchestration result
213
+
214
+ Tries multiple field names and strategies to find the answer
215
+ """
216
+ logger.debug("Extracting answer from orchestration result")
217
+
218
+ if not orchestration_result.get('success'):
219
+ logger.warning("Orchestration was not successful")
220
+ return None
221
+
222
+ data = orchestration_result.get('data', {})
223
+
224
+ # If data is not a dict, return it directly
225
+ if not isinstance(data, dict):
226
+ logger.debug(f"Data is {type(data).__name__}, returning as-is")
227
+ return data
228
+
229
+ # Try common answer field names
230
+ result_fields = [
231
+ 'answer', 'result', 'output', 'value', 'computed_value',
232
+ 'extracted_data', 'scraped_data', 'secret_code',
233
+ 'code', 'secret', 'solution', 'response'
234
+ ]
235
+
236
+ for field in result_fields:
237
+ if field in data:
238
+ logger.debug(f"Found answer in '{field}' field")
239
+ return data[field]
240
+
241
+ # If only one key, return its value
242
+ if len(data) == 1:
243
+ key = list(data.keys())[0]
244
+ logger.debug(f"Single key '{key}' in data, using its value")
245
+ return data[key]
246
+
247
+ # Return entire data dict as last resort
248
+ logger.debug("No specific answer field found, returning entire data")
249
+ return data
250
+
251
  def _build_response(
252
  self,
253
  task_data: TaskRequest,
254
+ request_url: str,
255
+ question_url: str,
256
+ submission_url: str,
257
+ analysis: Dict[str, Any], # βœ… Fixed: renamed from task_content
258
+ orchestration_result: Dict[str, Any],
259
+ submission_result: Dict[str, Any],
260
+ answer: Any
261
  ) -> Dict[str, Any]:
262
  """
263
+ Build comprehensive API response
264
 
265
  Args:
266
  task_data: Original task request
267
+ request_url: Original URL from API request
268
+ question_url: URL where actual task was found
269
+ submission_url: URL where answer was submitted
270
+ analysis: Unified analysis from task_fetcher
271
  orchestration_result: Result from orchestrator
272
+ submission_result: Result from TDS submission
273
+ answer: Extracted answer
274
 
275
  Returns:
276
+ Formatted response dict
277
  """
278
+ overall_success = (
279
+ orchestration_result['success'] and
280
+ submission_result['success']
281
+ )
282
 
283
+ return {
284
+ # Status
285
+ 'success': overall_success,
286
+ 'status': 'completed' if overall_success else 'failed',
287
+
288
+ # Request info
289
  'email': task_data.email,
 
 
 
 
290
 
291
+ # URL hierarchy
292
+ 'urls': {
293
+ 'request_url': request_url,
294
+ 'question_url': question_url,
295
+ 'submission_url': submission_url
296
+ },
297
+
298
+ # IDs and timing
299
+ 'task_id': orchestration_result.get('task_id'),
300
+ 'execution_id': orchestration_result.get('execution_id'),
301
+ 'duration': orchestration_result.get('duration'),
302
+ 'timestamp': orchestration_result.get('timestamp'),
303
+
304
+ # Answer
305
+ 'answer': answer,
306
+
307
+ # Submission details
308
+ 'submission': {
309
+ 'success': submission_result['success'],
310
+ 'status_code': submission_result.get('status_code'),
311
+ 'submitted_to': submission_url,
312
+ 'submitted_url': question_url, # URL included in payload
313
+ 'response': submission_result.get('response')
314
+ },
315
 
316
+ # Task details
317
+ 'task_details': {
318
+ 'task_description': analysis['task_description'][:500], # Truncate
319
+ 'complexity': analysis.get('complexity'),
320
+ 'overall_goal': analysis.get('overall_goal'),
321
+ 'instructions_count': len(analysis.get('instructions', [])),
322
+ 'was_redirect': analysis.get('is_redirect', False)
323
+ },
324
 
325
+ # Orchestration details
326
+ 'orchestration': {
327
+ 'success': orchestration_result['success'],
328
+ 'strategy': orchestration_result.get('strategy'),
329
+ 'steps_completed': list(orchestration_result.get('steps', {}).keys())
330
+ },
331
+
332
+ # LLM analysis metadata
333
+ 'llm_analysis': analysis.get('llm_analysis', {}),
334
+
335
+ # Message
336
+ 'message': self._build_message(overall_success, orchestration_result, submission_result)
337
+ }
338
+
339
+ def _build_message(
340
+ self,
341
+ overall_success: bool,
342
+ orchestration_result: Dict[str, Any],
343
+ submission_result: Dict[str, Any]
344
+ ) -> str:
345
+ """Build human-readable status message"""
346
+ if overall_success:
347
+ return "Task completed successfully and answer submitted to TDS"
348
 
349
+ if not orchestration_result['success']:
350
+ error = orchestration_result.get('error', 'Unknown error')
351
+ return f"Task execution failed: {error}"
352
 
353
+ if not submission_result['success']:
354
+ error = submission_result.get('error', 'Unknown error')
355
+ return f"Task completed but submission failed: {error}"
356
 
357
+ return "Task failed for unknown reason"
 
 
 
 
358
 
359
+ # ========================================================================
360
+ # UTILITY METHODS
361
+ # ========================================================================
 
 
 
 
362
 
363
  def get_registry(self) -> ModuleRegistry:
364
  """Get module registry for adding/removing modules"""
 
368
  """Get orchestrator engine for advanced usage"""
369
  return self.orchestrator
370
 
371
+ # def get_answer_submitter(self) -> AnswerSubmitter:
372
+ # """Get answer submitter for testing"""
373
+ # return self.answer_submitter
374
+
375
+ async def cleanup(self):
376
  """Clean up resources"""
377
+ try:
378
+ self.orchestrator.cleanup()
379
+ logger.info("βœ“ Orchestrator cleanup complete")
380
+ except Exception as e:
381
+ logger.warning(f"Orchestrator cleanup error: {e}")
382
+
383
+ logger.info("βœ… TaskProcessor cleanup complete")
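A minimal driver for the lifecycle above; the TaskRequest field values are placeholders, the orchestrator/LLM configuration is assumed to be in place, and process() currently returns None because Steps 5-6 are still commented out:

import asyncio

from app.models.request import TaskRequest
from app.services.task_processor import TaskProcessor

async def main():
    processor = TaskProcessor(enable_decomposition=True, enable_actions=True)
    task = TaskRequest(
        email="student@example.com",         # placeholder
        secret="not-a-real-secret",          # placeholder
        url="https://example.com/tds-task",  # placeholder
    )
    try:
        result = await processor.process(task)
        print(result)  # None until the submission/response steps are re-enabled
    finally:
        await processor.cleanup()

asyncio.run(main())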
app/utils/prompts.py CHANGED
@@ -54,7 +54,84 @@ Extract EVERYTHING that could be useful for task execution."""
54
 
55
  class AnalysisPrompts:
56
  """Prompts for data analysis and insight generation"""
57
-
 
 
 
 
58
  @staticmethod
59
  def analysis_planning_prompt(
60
  question: str,
@@ -543,3 +620,88 @@ Be detailed and actionable. Each step should be implementable."""
543
  lines.append(f" ... and {len(values) - 2} more")
544
 
545
  return "\n".join(lines)
 
 
 
 
54
 
55
  class AnalysisPrompts:
56
  """Prompts for data analysis and insight generation"""
57
+ @staticmethod
58
+ def unified_content_analysis_prompt(
59
+ task_description: str,
60
+ found_urls: List[str],
61
+ current_url: str,
62
+ base_url: str
63
+ ) -> str:
64
+ """Optimized unified analysis prompt"""
65
+ urls_text = "\n".join(f"- {url}" for url in found_urls) if found_urls else "None"
66
+
67
+ return f"""Analyze this quiz/task content and extract all critical information.
68
+
69
+ **Current URL:** {current_url}
70
+ **Base URL:** {base_url}
71
+
72
+ **Content:**
73
+ {task_description}
74
+
75
+ **URLs found:**
76
+ {urls_text}
77
+
78
+ ---
79
+
80
+ ## EXTRACT:
81
+
82
+ ### 1. SUBMISSION URL (Priority #1)
83
+ Where to POST the final answer.
84
+
85
+ **Search for:** "POST to", "submit to", "send to", "answer to"
86
+ **Extract from:** Text, markdown links `[text](URL)`, relative paths `/submit`
87
+ **Set submission_url_is_relative=True** if starts with `/`
88
+
89
+ ### 2. REDIRECT DETECTION
90
+ **is_redirect=True** if content says "visit URL" or "task at URL" (directs elsewhere)
91
+ **is_redirect=False** if content IS the task (has instructions)
92
+
93
+ Provide **question_url** if redirect detected.
94
+
95
+ ### 3. INSTRUCTION PARSING
96
+ Break into steps (ONLY if is_redirect=False).
97
+
98
+ **Actions:** scrape, extract, calculate, submit, download, transcribe, analyze, visit
99
+ **Each step:** step_number, action, description, target, dependencies
100
+
101
+ ### 4. ASSESSMENT
102
+ - **overall_goal**: One sentence
103
+ - **complexity**: trivial/simple/moderate/complex
104
+ - **confidence**: 0.0-1.0
105
+
106
+ ---
107
+
108
+ ## EXAMPLE:
109
+
110
+ **Input:**
111
+ "Scrape /data?email=... Get the secret code. POST code to [/submit](https://example.com/submit)"
112
+
113
+ **Output:**
114
+ {{
115
+ "is_redirect": false,
116
+ "question_url": null,
117
+ "redirect_reasoning": "Contains task instructions",
118
+ "submission_url": "/submit",
119
+ "submission_url_is_relative": true,
120
+ "submission_reasoning": "Found 'POST code to /submit'",
121
+ "instructions": [
122
+ {{"step_number": 1, "action": "scrape", "description": "Scrape /data page", "target": "/data?email=...", "dependencies": []}},
123
+ {{"step_number": 2, "action": "extract", "description": "Extract secret code", "target": "secret code", "dependencies": }},
124
+ {{"step_number": 3, "action": "submit", "description": "POST code to /submit", "target": "/submit", "dependencies": }}
125
+ ],
126
+ "overall_goal": "Scrape, extract, and submit secret code",
127
+ "complexity": "simple",
128
+ "confidence": 0.92
129
+ }}
130
+
131
+
132
+
133
+ Now analyze the content above."""
134
+
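The agent result is parsed into UnifiedTaskAnalysis from app.orchestrator.models; that module is not part of this diff, but judging from the fields populated in task_fetcher's _fallback_analysis it presumably looks roughly like this sketch (field names inferred, actual definitions may differ):

# Inferred sketch only; the real models live in app/orchestrator/models.py.
from typing import List, Optional
from pydantic import BaseModel

class InstructionStep(BaseModel):
    step_number: int
    action: str
    description: str
    target: Optional[str] = None
    dependencies: List[int] = []

class UnifiedTaskAnalysis(BaseModel):
    is_redirect: bool
    question_url: Optional[str] = None
    redirect_reasoning: str
    submission_url: Optional[str] = None
    submission_url_is_relative: bool = False
    submission_reasoning: str
    instructions: List[InstructionStep] = []
    overall_goal: str
    complexity: str
    confidence: float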
135
  @staticmethod
136
  def analysis_planning_prompt(
137
  question: str,
 
620
  lines.append(f" ... and {len(values) - 2} more")
621
 
622
  return "\n".join(lines)
623
+
624
+ @staticmethod
625
+ def url_detection_prompt(
626
+ content: str,
627
+ urls: list,
628
+ request_url: str
629
+ ) -> str:
630
+ """
631
+ Prompt for detecting Question URL from content
632
+
633
+ Args:
634
+ content: Fetched content
635
+ urls: List of URLs found in content
636
+ request_url: Original request URL
637
+
638
+ Returns:
639
+ Formatted prompt
640
+ """
641
+ urls_text = "\n".join(f"{i+1}. {url}" for i, url in enumerate(urls)) if urls else "None"
642
+
643
+ return f"""You are analyzing content from a TDS quiz/task system to determine URL relationships.
644
+
645
+ **Context:**
646
+ - We fetched content from: {request_url}
647
+ - This content might either:
648
+ 1. BE the actual task/question (return is_redirect=False)
649
+ 2. REDIRECT/POINT to another URL where the actual task is located (return is_redirect=True)
650
+
651
+ **URLs found in content:**
652
+ {urls_text}
653
+
654
+ **Content (truncated to 1500 chars):**
655
+ {content[:1500]}
656
+
657
+ **Your Task:**
658
+ Analyze if this content IS the actual task, or if it REDIRECTS to another URL to get the task.
659
+
660
+ **URL Type Definitions:**
661
+ - **question_url**: URL to visit to GET the actual task/question
662
+ - **data_url**: URL to GET/SCRAPE data from (as part of task instructions)
663
+ - **submission_url**: URL to POST the final answer to
664
+ - **reference_url**: URL for reference/documentation only
665
+
666
+ **Decision Rules:**
667
+
668
+ 1. **is_redirect=True** when:
669
+ - Content explicitly says "visit <url>", "your task is at <url>", "go to <url>"
670
+ - Content is very short (< 100 chars) with just a URL
671
+ - Content describes WHERE to find the task, not WHAT the task is
672
+ - Primary purpose is to direct you to another location
673
+
674
+ 2. **is_redirect=False** when:
675
+ - Content contains actual task instructions (scrape, analyze, calculate, etc.)
676
+ - URLs are mentioned as DATA SOURCES or SUBMISSION endpoints
677
+ - Content is the task itself, even if it references other URLs
678
+
679
+ **Examples:**
680
+
681
+ Example 1 (REDIRECT):
682
+ Content: "Your quiz is available at https://example.com/quiz-834"
683
+ β†’ is_redirect=True, question_url="https://example.com/quiz-834"
684
+ β†’ Reasoning: Content tells you WHERE to go, not WHAT to do
685
+
686
+ Example 2 (ACTUAL TASK):
687
+ Content: "Scrape https://example.com/data and extract the top 5 prices. Submit to https://example.com/submit"
688
+ β†’ is_redirect=False
689
+ β†’ Reasoning: This IS the task. URLs are for data source and submission, not for getting the task
690
+
691
+ Example 3 (REDIRECT with multiple URLs):
692
+ Content: "Visit https://example.com/quiz-834 to receive your assignment. You'll be asked to scrape another website."
693
+ β†’ is_redirect=True, question_url="https://example.com/quiz-834"
694
+ β†’ Reasoning: Content directs you to quiz-834 to GET the actual task
695
+
696
+ Example 4 (ACTUAL TASK with multiple URLs):
697
+ Content: "Download data from https://api.example.com/data and compare with https://example.com/reference. Calculate the difference."
698
+ β†’ is_redirect=False
699
+ β†’ Reasoning: This IS the task. Both URLs are data sources for completing the task
700
+
701
+ Example 5 (SHORT REDIRECT):
702
+ Content: "https://example.com/quiz-834"
703
+ β†’ is_redirect=True, question_url="https://example.com/quiz-834"
704
+ β†’ Reasoning: Only a URL, no task instructions
705
+
706
+ **Now analyze the content above and provide your analysis.**"""
707
+
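A sketch of how this prompt might be assembled; the content and URLs are invented, the URL regex mirrors the one used in _analyze_content_with_llm, and the assumption that url_detection_prompt ends up on AnalysisPrompts should be checked against the class it was actually added to:

import re

from app.utils.prompts import AnalysisPrompts  # assumed owner of url_detection_prompt

content = "Your quiz is available at https://example.com/quiz-834"   # invented
request_url = "https://example.com/start"                             # invented

# Same URL pattern as _analyze_content_with_llm in task_fetcher.py.
url_pattern = r'https?://[^\s<>"\']+(?:/[^\s<>"\']*)?'
urls = sorted({u.rstrip('.,;:)') for u in re.findall(url_pattern, content)})

prompt = AnalysisPrompts.url_detection_prompt(content=content, urls=urls, request_url=request_url)
print(prompt[:400])  # the LLM call itself is outside the scope of this sketch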