23f3003322 commited on
Commit
dc1c6a7
Β·
1 Parent(s): e071e70

new changes to handle dificulty level 1

Browse files
.gitignore CHANGED
@@ -41,3 +41,13 @@ Thumbs.db
41
 
42
  new.txt
43
  .DS_Store
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  new.txt
43
  .DS_Store
44
+
45
+
46
+ analysis.md
47
+ data_fetching.md
48
+ dynamic_scraper.md
49
+ orchestrator.md
50
+ questions.md
51
+ task_processor.md
52
+ unified.md
53
+
app/__pycache__/main.cpython-313.pyc CHANGED
Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ
 
app/api/routes/__pycache__/task.cpython-313.pyc CHANGED
Binary files a/app/api/routes/__pycache__/task.cpython-313.pyc and b/app/api/routes/__pycache__/task.cpython-313.pyc differ
 
app/api/routes/task.py CHANGED
@@ -5,15 +5,17 @@ Handles task submission and processing
5
 
6
  from fastapi import APIRouter, Request, status, BackgroundTasks, HTTPException
7
  from datetime import datetime
8
- from typing import Dict, Any
9
 
10
- from app.models.request import TaskRequest
11
- from app.models.response import TaskResponse, ImmediateResponse
12
  from app.core.logging import get_logger
13
- from app.core.security import verify_authentication, AuthenticationError
14
  from app.core.exceptions import TaskProcessingError
15
  from app.services.task_processor import TaskProcessor
16
 
 
 
17
  logger = get_logger(__name__)
18
 
19
  router = APIRouter()
@@ -36,18 +38,7 @@ async def handle_task(
36
  request: Request,
37
  background_tasks: BackgroundTasks
38
  ):
39
- """
40
- Main API endpoint for handling task requests
41
-
42
- Flow:
43
- 1. Validate JSON format (HTTP 400 if invalid)
44
- 2. Verify secret (HTTP 403 if invalid)
45
- 3. Respond immediately with HTTP 200
46
- 4. Process task in background
47
-
48
- Returns:
49
- Immediate HTTP 200 response with task accepted message
50
- """
51
  start_time = datetime.now()
52
 
53
  logger.info("πŸ“₯ Task request received")
@@ -58,13 +49,13 @@ async def handle_task(
58
  # ================================================================
59
  try:
60
  body = await request.json()
61
- task_data = TaskRequest(**body)
62
- except ValueError as e:
63
- logger.error(f"❌ Invalid JSON format: {str(e)}")
64
- raise HTTPException(
65
- status_code=status.HTTP_400_BAD_REQUEST,
66
- detail=f"Invalid JSON format: {str(e)}"
67
- )
68
  except Exception as e:
69
  logger.error(f"❌ Request validation failed: {str(e)}")
70
  raise HTTPException(
@@ -72,28 +63,26 @@ async def handle_task(
72
  detail=f"Invalid request data: {str(e)}"
73
  )
74
 
75
- logger.info(f"βœ… Request validated for: {task_data.email}")
76
 
77
- # ================================================================
78
- # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
79
- # ================================================================
80
- logger.info("πŸ” Verifying authentication")
81
- try:
82
- verify_authentication(task_data.secret)
83
- except AuthenticationError as e:
84
- logger.error(f"❌ Authentication failed: {str(e)}")
85
- raise HTTPException(
86
- status_code=status.HTTP_403_FORBIDDEN,
87
- detail="Invalid secret. Authentication failed."
88
- )
89
 
90
- logger.info("βœ… Authentication successful")
91
 
92
  # ================================================================
93
  # STEP 3: RESPOND IMMEDIATELY WITH HTTP 200
94
  # ================================================================
95
  logger.info("βœ… Request accepted - processing in background")
96
-
97
  # Add task processing to background
98
  background_tasks.add_task(
99
  process_task_background,
@@ -105,7 +94,6 @@ async def handle_task(
105
  response = ImmediateResponse(
106
  success=True,
107
  message="Task accepted and processing started",
108
- email=task_data.email,
109
  task_url=str(task_data.url),
110
  status="processing",
111
  timestamp=datetime.now().isoformat()
@@ -127,8 +115,11 @@ async def handle_task(
127
  )
128
 
129
 
 
 
 
130
  async def process_task_background(
131
- task_data: TaskRequest,
132
  start_time: datetime
133
  ):
134
  """
@@ -149,7 +140,7 @@ async def process_task_background(
149
  try:
150
  # Process the task
151
  result_data = await task_processor.process(task_data)
152
-
153
  # Calculate execution time
154
  execution_time = (datetime.now() - start_time).total_seconds()
155
 
 
5
 
6
  from fastapi import APIRouter, Request, status, BackgroundTasks, HTTPException
7
  from datetime import datetime
8
+ # from typing import Dict, Any
9
 
10
+ from app.models.request import ManualTriggeredRequestBody
11
+ from app.models.response import ImmediateResponse
12
  from app.core.logging import get_logger
13
+ # from app.core.security import verify_authentication, AuthenticationError
14
  from app.core.exceptions import TaskProcessingError
15
  from app.services.task_processor import TaskProcessor
16
 
17
+ import requests
18
+
19
  logger = get_logger(__name__)
20
 
21
  router = APIRouter()
 
38
  request: Request,
39
  background_tasks: BackgroundTasks
40
  ):
41
+
 
 
 
 
 
 
 
 
 
 
 
42
  start_time = datetime.now()
43
 
44
  logger.info("πŸ“₯ Task request received")
 
49
  # ================================================================
50
  try:
51
  body = await request.json()
52
+ task_data = ManualTriggeredRequestBody(**body)
53
+ # except ValueError as e:
54
+ # logger.error(f"❌ Invalid JSON format: {str(e)}")
55
+ # raise HTTPException(
56
+ # status_code=status.HTTP_400_BAD_REQUEST,
57
+ # detail=f"Invalid JSON format: {str(e)}"
58
+ # )
59
  except Exception as e:
60
  logger.error(f"❌ Request validation failed: {str(e)}")
61
  raise HTTPException(
 
63
  detail=f"Invalid request data: {str(e)}"
64
  )
65
 
 
66
 
67
+ # # ================================================================
68
+ # # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
69
+ # # ================================================================
70
+ # logger.info("πŸ” Verifying authentication")
71
+ # try:
72
+ # verify_authentication(task_data.secret)
73
+ # except AuthenticationError as e:
74
+ # logger.error(f"❌ Authentication failed: {str(e)}")
75
+ # raise HTTPException(
76
+ # status_code=status.HTTP_403_FORBIDDEN,
77
+ # detail="Invalid secret. Authentication failed."
78
+ # )
79
 
80
+ # logger.info("βœ… Authentication successful")
81
 
82
  # ================================================================
83
  # STEP 3: RESPOND IMMEDIATELY WITH HTTP 200
84
  # ================================================================
85
  logger.info("βœ… Request accepted - processing in background")
 
86
  # Add task processing to background
87
  background_tasks.add_task(
88
  process_task_background,
 
94
  response = ImmediateResponse(
95
  success=True,
96
  message="Task accepted and processing started",
 
97
  task_url=str(task_data.url),
98
  status="processing",
99
  timestamp=datetime.now().isoformat()
 
115
  )
116
 
117
 
118
+
119
+
120
+
121
  async def process_task_background(
122
+ task_data: ManualTriggeredRequestBody,
123
  start_time: datetime
124
  ):
125
  """
 
140
  try:
141
  # Process the task
142
  result_data = await task_processor.process(task_data)
143
+
144
  # Calculate execution time
145
  execution_time = (datetime.now() - start_time).total_seconds()
146
 
app/core/__pycache__/config.cpython-313.pyc CHANGED
Binary files a/app/core/__pycache__/config.cpython-313.pyc and b/app/core/__pycache__/config.cpython-313.pyc differ
 
app/core/__pycache__/exceptions.cpython-313.pyc CHANGED
Binary files a/app/core/__pycache__/exceptions.cpython-313.pyc and b/app/core/__pycache__/exceptions.cpython-313.pyc differ
 
app/core/config.py CHANGED
@@ -26,6 +26,7 @@ class Settings(BaseSettings):
26
 
27
  # Security
28
  API_SECRET: str = Field(default="", env="API_SECRET")
 
29
  ALLOWED_ORIGINS: List[str] = Field(default=["*"], env="ALLOWED_ORIGINS")
30
 
31
  # Logging
 
26
 
27
  # Security
28
  API_SECRET: str = Field(default="", env="API_SECRET")
29
+ USER_EMAIL: str = Field(default="", env="USER_EMAIL")
30
  ALLOWED_ORIGINS: List[str] = Field(default=["*"], env="ALLOWED_ORIGINS")
31
 
32
  # Logging
app/core/exceptions.py CHANGED
@@ -19,6 +19,10 @@ class TaskProcessingError(Exception):
19
  """Raised when task processing fails"""
20
  pass
21
 
 
 
 
 
22
 
23
  class AuthenticationError(Exception):
24
  """Raised when authentication fails"""
 
19
  """Raised when task processing fails"""
20
  pass
21
 
22
+ class AnswerGenerationError(Exception):
23
+ """Raised when answer generation fails"""
24
+ pass
25
+
26
 
27
  class AuthenticationError(Exception):
28
  """Raised when authentication fails"""
app/main.py CHANGED
@@ -29,13 +29,13 @@ async def lifespan(app: FastAPI):
29
  logger.info(f"Environment: {settings.ENVIRONMENT}")
30
  logger.info("=" * 80)
31
  import os
32
- if os.getenv('ENVIRONMENT') == 'production':
33
- from app.modules.scrapers.browser_pool import get_pooled_browser
34
- from app.modules.scrapers.browser_config import PRODUCTION_CONFIG
35
 
36
- logger.info("Pre-warming browser pool...")
37
- await get_pooled_browser(PRODUCTION_CONFIG)
38
- logger.info("βœ“ Browser pool ready")
39
 
40
  yield
41
 
@@ -57,8 +57,8 @@ def create_application() -> FastAPI:
57
  description=settings.APP_DESCRIPTION,
58
  version=settings.APP_VERSION,
59
  lifespan=lifespan,
60
- docs_url="/docs" if settings.ENVIRONMENT == "development" else None,
61
- redoc_url="/redoc" if settings.ENVIRONMENT == "development" else None,
62
  )
63
 
64
  # Configure CORS
@@ -76,7 +76,7 @@ def create_application() -> FastAPI:
76
  # Register exception handlers
77
  register_exception_handlers(app)
78
  registry = register_all_modules()
79
- orchestrator = OrchestratorEngine(registry)
80
 
81
  # Include routers
82
  app.include_router(health.router, tags=["Health"])
 
29
  logger.info(f"Environment: {settings.ENVIRONMENT}")
30
  logger.info("=" * 80)
31
  import os
32
+ # if os.getenv('ENVIRONMENT') == 'production':
33
+ # from app.modules.scrapers.browser_pool import get_pooled_browser
34
+ # from app.modules.scrapers.browser_config import PRODUCTION_CONFIG
35
 
36
+ # logger.info("Pre-warming browser pool...")
37
+ # await get_pooled_browser(PRODUCTION_CONFIG)
38
+ # logger.info("βœ“ Browser pool ready")
39
 
40
  yield
41
 
 
57
  description=settings.APP_DESCRIPTION,
58
  version=settings.APP_VERSION,
59
  lifespan=lifespan,
60
+ # docs_url="/docs" if settings.ENVIRONMENT == "development" else None,
61
+ # redoc_url="/redoc" if settings.ENVIRONMENT == "development" else None,
62
  )
63
 
64
  # Configure CORS
 
76
  # Register exception handlers
77
  register_exception_handlers(app)
78
  registry = register_all_modules()
79
+ # orchestrator = OrchestratorEngine(registry)
80
 
81
  # Include routers
82
  app.include_router(health.router, tags=["Health"])
app/models/__pycache__/request.cpython-313.pyc CHANGED
Binary files a/app/models/__pycache__/request.cpython-313.pyc and b/app/models/__pycache__/request.cpython-313.pyc differ
 
app/models/__pycache__/response.cpython-313.pyc CHANGED
Binary files a/app/models/__pycache__/response.cpython-313.pyc and b/app/models/__pycache__/response.cpython-313.pyc differ
 
app/models/analysis.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import Dict, Any, List, Optional, Literal
3
+
4
+ class QuestionAnalysis(BaseModel):
5
+ """
6
+ Analysis focused on generating the correct answer.
7
+ No redirect/entry page logic needed.
8
+ """
9
+
10
+ # ===== QUESTION CLASSIFICATION =====
11
+ question_type: Literal[
12
+ 'cli_command', # Q2, Q3: shell commands
13
+ 'file_path', # Q4: paths/URLs
14
+ 'data_processing', # Q7, Q9, Q11: CSV/JSON processing
15
+ 'image_analysis', # Q6, Q17: image operations
16
+ 'audio_transcription', # Q5: audio to text
17
+ 'api_interaction', # Q8: external API calls
18
+ 'document_parsing', # Q10: PDF extraction
19
+ 'calculation', # Q20, Q21: mathematical computations
20
+ 'text_generation', # Q12, Q13, Q19: YAML, prompts
21
+ 'optimization', # Q14, Q18: constraint solving
22
+ 'llm_reasoning' # Q16: tool planning/reasoning
23
+ ] = Field(description="Type of task to solve")
24
+
25
+ # ===== ANSWER FORMAT =====
26
+ answer_format: Literal[
27
+ 'plain_string', # Q2, Q3, Q4: raw text
28
+ 'json_object', # Q11, Q14, Q16, Q21: {"key": "value"}
29
+ 'json_array', # Q20: ["a", "b", "c"]
30
+ 'number', # Q8, Q9, Q10, Q17, Q18: integer/float
31
+ 'single_letter' # Q12: A, B, or C
32
+ ] = Field(description="How to format the final answer")
33
+
34
+ # ===== ANSWER COMPONENTS =====
35
+ key_components: Dict[str, Any] = Field(
36
+ default_factory=dict,
37
+ description="Extracted components needed to generate answer"
38
+ )
39
+
40
+ # ===== PERSONALIZATION =====
41
+ requires_personalization: bool = Field(
42
+ default=False,
43
+ description="Does answer depend on user email?"
44
+ )
45
+
46
+ personalization_type: Optional[Literal[
47
+ 'email_in_url', # Q2: ?email=<user_email>
48
+ 'email_length_offset', # Q8, Q9, Q15, Q18: offset = len(email) mod N
49
+ 'email_length_conditional' # Q15: if even/odd
50
+ ]] = None
51
+
52
+ personalization_details: Optional[str] = Field(
53
+ default=None,
54
+ description="Specific personalization logic"
55
+ )
56
+
57
+ # ===== FILE REQUIREMENTS =====
58
+ requires_files: bool = Field(
59
+ default=False,
60
+ description="Does question need file downloads?"
61
+ )
62
+
63
+ required_file_types: List[str] = Field(
64
+ default_factory=list,
65
+ description="File types needed: csv, json, png, pdf, opus, zip"
66
+ )
67
+
68
+ # ===== EXTERNAL RESOURCES =====
69
+ requires_external_fetch: bool = Field(
70
+ default=False,
71
+ description="Need to fetch data from another URL (not just files)?"
72
+ )
73
+
74
+ external_resources: List[str] = Field(
75
+ default_factory=list,
76
+ description="URLs/endpoints to fetch before solving"
77
+ )
78
+
79
+ # ===== CRITICAL CONSTRAINTS =====
80
+ critical_constraints: List[str] = Field(
81
+ default_factory=list,
82
+ description="Must-follow rules for answer format"
83
+ )
84
+
85
+ # ===== SUBMISSION INFO =====
86
+ submission_url_path: str = Field(
87
+ description="URL path for this question (e.g., '/project2-uv')"
88
+ )
89
+
90
+ # ===== CONFIDENCE & REASONING =====
91
+ reasoning: str = Field(
92
+ description="Why this classification and components were chosen"
93
+ )
94
+
95
+ confidence: float = Field(
96
+ ge=0.0,
97
+ le=1.0,
98
+ description="Confidence in analysis (0.0-1.0)"
99
+ )
app/models/answer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import Dict, Any, Optional
3
+
4
+ class AnswerResult(BaseModel):
5
+ """Structured output from answer generation"""
6
+
7
+ answer: str = Field(
8
+ description="The exact answer to submit (final output)"
9
+ )
10
+
11
+ reasoning: str = Field(
12
+ description="Step-by-step explanation of how answer was generated"
13
+ )
14
+
15
+ components_used: Dict[str, Any] = Field(
16
+ default_factory=dict,
17
+ description="Which components from analysis were used"
18
+ )
19
+
20
+ confidence: float = Field(
21
+ ge=0.0,
22
+ le=1.0,
23
+ description="Confidence in answer correctness (0.0-1.0)"
24
+ )
25
+
26
+ personalization_applied: bool = Field(
27
+ default=False,
28
+ description="Whether personalization was applied"
29
+ )
30
+
31
+ validation_notes: str = Field(
32
+ default="",
33
+ description="Notes about format validation"
34
+ )
app/models/request.py CHANGED
@@ -7,6 +7,12 @@ from typing import Optional, Dict, Any
7
  from pydantic import BaseModel, Field, EmailStr, HttpUrl, validator
8
 
9
 
 
 
 
 
 
 
10
  class TaskRequest(BaseModel):
11
  """
12
  Schema for task request validation
 
7
  from pydantic import BaseModel, Field, EmailStr, HttpUrl, validator
8
 
9
 
10
+ class ManualTriggeredRequestBody(BaseModel):
11
+ """Request body format for quiz submission"""
12
+ url: str
13
+
14
+
15
+
16
  class TaskRequest(BaseModel):
17
  """
18
  Schema for task request validation
app/models/response.py CHANGED
@@ -7,6 +7,14 @@ from typing import Optional, Dict, Any
7
  from datetime import datetime
8
  from pydantic import BaseModel, Field
9
 
 
 
 
 
 
 
 
 
10
  class ImmediateResponse(BaseModel):
11
  """
12
  Immediate response sent after validation
@@ -19,10 +27,7 @@ class ImmediateResponse(BaseModel):
19
  message: str = Field(
20
  description="Status message"
21
  )
22
-
23
- email: str = Field(
24
- description="Student email from request"
25
- )
26
 
27
  task_url: str = Field(
28
  description="Task URL from request"
 
7
  from datetime import datetime
8
  from pydantic import BaseModel, Field
9
 
10
+ class SubmissionBody(BaseModel):
11
+ """Request body format for quiz submission"""
12
+ email: str
13
+ secret: str
14
+ url: str
15
+ answer: int
16
+
17
+
18
  class ImmediateResponse(BaseModel):
19
  """
20
  Immediate response sent after validation
 
27
  message: str = Field(
28
  description="Status message"
29
  )
30
+
 
 
 
31
 
32
  task_url: str = Field(
33
  description="Task URL from request"
app/modules/scrapers/base_scraper.py CHANGED
@@ -26,6 +26,7 @@ class ScraperResult(BaseModel):
26
  encoding: str = "utf-8"
27
  response_time: float = 0.0
28
  status_code: int = 200
 
29
 
30
  # Scraping details
31
  selectors_used: List[str] = Field(default_factory=list)
@@ -35,6 +36,14 @@ class ScraperResult(BaseModel):
35
  error: Optional[str] = None
36
  warnings: List[str] = Field(default_factory=list)
37
 
 
 
 
 
 
 
 
 
38
  class Config:
39
  arbitrary_types_allowed = True
40
 
 
26
  encoding: str = "utf-8"
27
  response_time: float = 0.0
28
  status_code: int = 200
29
+ raw_html: Optional[str] = None
30
 
31
  # Scraping details
32
  selectors_used: List[str] = Field(default_factory=list)
 
36
  error: Optional[str] = None
37
  warnings: List[str] = Field(default_factory=list)
38
 
39
+ def __post_init__(self):
40
+ if self.data is None:
41
+ self.data = []
42
+ if self.columns_extracted is None:
43
+ self.columns_extracted = []
44
+ if self.selectors_used is None:
45
+ self.selectors_used = []
46
+
47
  class Config:
48
  arbitrary_types_allowed = True
49
 
app/modules/scrapers/dynamic_scraper.py CHANGED
@@ -154,6 +154,7 @@ class DynamicScraper(BaseScraper):
154
  wait_for: Optional[str] = None,
155
  click_selectors: List[str] = None,
156
  scroll: bool = False,
 
157
  take_screenshot: bool = False,
158
  **kwargs
159
  ) -> ScraperResult:
@@ -209,7 +210,7 @@ class DynamicScraper(BaseScraper):
209
  url=url,
210
  error="Failed to load page"
211
  )
212
-
213
  status_code = response.status
214
  logger.info(f"Page loaded | Status: {status_code}")
215
 
@@ -246,6 +247,15 @@ class DynamicScraper(BaseScraper):
246
  else:
247
  data = await self._extract_auto(page)
248
 
 
 
 
 
 
 
 
 
 
249
  # Build result
250
  columns = list(data[0].keys()) if data else []
251
 
@@ -257,6 +267,8 @@ class DynamicScraper(BaseScraper):
257
  columns_extracted=columns,
258
  status_code=status_code,
259
  selectors_used=list(selectors.keys()) if selectors else []
 
 
260
  )
261
 
262
  logger.info(f"βœ“ Scraped {len(data)} rows with browser")
 
154
  wait_for: Optional[str] = None,
155
  click_selectors: List[str] = None,
156
  scroll: bool = False,
157
+ return_html: bool = True,
158
  take_screenshot: bool = False,
159
  **kwargs
160
  ) -> ScraperResult:
 
210
  url=url,
211
  error="Failed to load page"
212
  )
213
+
214
  status_code = response.status
215
  logger.info(f"Page loaded | Status: {status_code}")
216
 
 
247
  else:
248
  data = await self._extract_auto(page)
249
 
250
+ if selectors:
251
+ data = await self._extract_with_selectors(page, selectors)
252
+ else:
253
+ data = await self._extract_auto(page)
254
+
255
+ rendered_html = None
256
+ if return_html:
257
+ rendered_html = await page.content()
258
+
259
  # Build result
260
  columns = list(data[0].keys()) if data else []
261
 
 
267
  columns_extracted=columns,
268
  status_code=status_code,
269
  selectors_used=list(selectors.keys()) if selectors else []
270
+ ,
271
+ raw_html=rendered_html
272
  )
273
 
274
  logger.info(f"βœ“ Scraped {len(data)} rows with browser")
app/services/__pycache__/task_processor.cpython-313.pyc CHANGED
Binary files a/app/services/__pycache__/task_processor.cpython-313.pyc and b/app/services/__pycache__/task_processor.cpython-313.pyc differ
 
app/services/analyser.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, List
2
+ from app.core.logging import get_logger
3
+ from app.core.exceptions import TaskProcessingError
4
+ from app.models.analysis import QuestionAnalysis
5
+ from app.utils.prompts import AnalysisPrompts
6
+ logger = get_logger(__name__)
7
+
8
+
9
+ class QuestionAnalyzer:
10
+ """
11
+ Analyzes questions to determine how to generate answers.
12
+ No entry page or redirect logic - assumes all content is solvable questions.
13
+ """
14
+
15
+ def __init__(self, llm_client):
16
+ """
17
+ Args:
18
+ llm_client: LLM client with run_agent() method
19
+ """
20
+ self.llm_client = llm_client
21
+ self._analyzer_agent = None
22
+
23
+ async def initialize(self):
24
+ """Initialize LLM agent"""
25
+ self._analyzer_agent = self.llm_client.create_agent(
26
+ output_type=QuestionAnalysis,
27
+ system_prompt=(
28
+ "You are an expert at analyzing technical quiz questions. "
29
+ "Extract precise information needed to generate correct answers. "
30
+ "Be thorough and accurate."
31
+ ),
32
+ retries=2
33
+ )
34
+ logger.info("βœ“ Question analyzer initialized")
35
+
36
+ async def analyze_question(
37
+ self,
38
+ question_metadata: Dict[str, Any],
39
+ base_url: str,
40
+ user_email: str,
41
+ downloaded_files: List[Dict[str, Any]]
42
+ ) -> QuestionAnalysis:
43
+ """
44
+ Analyze question to determine how to generate the answer.
45
+
46
+ Args:
47
+ question_metadata: Parsed metadata from scraping
48
+ - title: Question title
49
+ - heading: Question heading
50
+ - difficulty: 1-5
51
+ - is_personalized: bool
52
+ - instructions: List of instruction strings
53
+ - file_links: List of file references
54
+ base_url: Base URL for the quiz
55
+ user_email: User's email address
56
+ downloaded_files: List of downloaded file info
57
+ - filename, type, path, size
58
+
59
+ Returns:
60
+ QuestionAnalysis: Structured analysis
61
+
62
+ Raises:
63
+ TaskProcessingError: If analysis fails
64
+ """
65
+ logger.info(f"πŸ€– Analyzing question: {question_metadata.get('title', 'unknown')}")
66
+
67
+ # Build prompt
68
+ prompt = AnalysisPrompts.question_analysis_prompt(
69
+ instructions=question_metadata.get('instructions', []),
70
+ difficulty=question_metadata.get('difficulty', 1),
71
+ is_personalized=question_metadata.get('is_personalized', False),
72
+ title=question_metadata.get('title', ''),
73
+ heading=question_metadata.get('heading', ''),
74
+ base_url=base_url,
75
+ user_email=user_email,
76
+ available_files=downloaded_files
77
+ )
78
+
79
+ try:
80
+ # Run LLM analysis
81
+ analysis: QuestionAnalysis = await self.llm_client.run_agent(
82
+ self._analyzer_agent,
83
+ prompt
84
+ )
85
+
86
+ # Log analysis results
87
+ logger.info(f"βœ“ Question type: {analysis.question_type}")
88
+ logger.info(f"βœ“ Answer format: {analysis.answer_format}")
89
+ logger.info(f"βœ“ Personalization: {analysis.requires_personalization}")
90
+ logger.info(f"βœ“ Files needed: {analysis.requires_files}")
91
+ logger.info(f"βœ“ Confidence: {analysis.confidence:.2f}")
92
+
93
+ # Validate analysis
94
+ self._validate_analysis(analysis, question_metadata)
95
+
96
+ return analysis
97
+
98
+ except Exception as e:
99
+ logger.error(f"❌ Question analysis failed: {e}", exc_info=True)
100
+ raise TaskProcessingError(
101
+ f"Cannot analyze question: {str(e)}. "
102
+ "LLM analysis is required for unknown question types."
103
+ )
104
+
105
+ def _validate_analysis(
106
+ self,
107
+ analysis: QuestionAnalysis,
108
+ metadata: Dict[str, Any]
109
+ ):
110
+ """
111
+ Validate analysis results make sense.
112
+
113
+ Args:
114
+ analysis: LLM analysis result
115
+ metadata: Original question metadata
116
+
117
+ Raises:
118
+ TaskProcessingError: If validation fails
119
+ """
120
+ # Check confidence threshold
121
+ if analysis.confidence < 0.5:
122
+ logger.warning(
123
+ f"⚠️ Low confidence analysis: {analysis.confidence:.2f}"
124
+ )
125
+
126
+ # Check personalization consistency
127
+ if metadata.get('is_personalized') and not analysis.requires_personalization:
128
+ logger.warning(
129
+ "⚠️ Metadata says personalized but analysis disagrees"
130
+ )
131
+
132
+ # Check file requirements
133
+ if analysis.requires_files and not analysis.required_file_types:
134
+ logger.warning(
135
+ "⚠️ Requires files but no file types specified"
136
+ )
137
+
138
+ # Check submission URL
139
+ if not analysis.submission_url_path:
140
+ raise TaskProcessingError(
141
+ "Analysis missing submission_url_path"
142
+ )
143
+
144
+ if not analysis.submission_url_path.startswith('/'):
145
+ logger.warning(
146
+ f"⚠️ Submission URL should start with '/': {analysis.submission_url_path}"
147
+ )
148
+
149
+ logger.debug("βœ“ Analysis validation passed")
app/services/answer_generator.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, List, Optional
2
+ import json
3
+ import re
4
+ from app.core.logging import get_logger
5
+ from app.core.exceptions import AnswerGenerationError
6
+ from app.models.answer import AnswerResult
7
+ from app.models.analysis import QuestionAnalysis
8
+ logger = get_logger(__name__)
9
+
10
+
11
+ class AnswerGenerator:
12
+ """
13
+ Generates answers based on question analysis.
14
+ Uses LLM with rich context for flexibility with unknown questions.
15
+ """
16
+
17
+ def __init__(self, llm_client):
18
+ """
19
+ Args:
20
+ llm_client: LLM client with run_agent() method
21
+ """
22
+ self.llm_client = llm_client
23
+ self._generator_agent = None
24
+
25
+ async def initialize(self):
26
+ """Initialize LLM agent for answer generation"""
27
+ self._generator_agent = self.llm_client.create_agent(
28
+ output_type=AnswerResult,
29
+ system_prompt=(
30
+ "You are an expert at solving technical quiz questions. "
31
+ "Generate precise, exact answers based on the provided analysis and context. "
32
+ "Follow all constraints strictly. "
33
+ "Be thorough in your reasoning to ensure correctness."
34
+ ),
35
+ retries=2
36
+ )
37
+ logger.info("βœ“ Answer generator initialized")
38
+
39
+ async def generate(
40
+ self,
41
+ analysis: 'QuestionAnalysis',
42
+ question_metadata: Dict[str, Any],
43
+ base_url: str,
44
+ user_email: str,
45
+ downloaded_files: List[Dict[str, Any]]
46
+ ) -> str:
47
+ """
48
+ Generate answer based on question analysis.
49
+
50
+ Args:
51
+ analysis: Question analysis from analyzer
52
+ question_metadata: Original metadata with instructions
53
+ base_url: Base URL for the quiz
54
+ user_email: User's email address
55
+ downloaded_files: List of downloaded files with local_path
56
+
57
+ Returns:
58
+ str: Final answer ready to submit
59
+
60
+ Raises:
61
+ AnswerGenerationError: If generation fails
62
+ """
63
+ logger.info(f"πŸ’‘ Generating answer for {analysis.question_type}...")
64
+
65
+ try:
66
+ # Step 1: Build comprehensive context for LLM
67
+ context = self._build_generation_context(
68
+ analysis=analysis,
69
+ question_metadata=question_metadata,
70
+ base_url=base_url,
71
+ user_email=user_email,
72
+ downloaded_files=downloaded_files
73
+ )
74
+
75
+ # Step 2: Generate answer with LLM
76
+ result = await self._generate_with_llm(context)
77
+
78
+ logger.info(f"βœ“ Generated answer (confidence: {result.confidence:.2f})")
79
+ logger.debug(f"Reasoning: {result.reasoning}")
80
+
81
+ # Step 3: Apply personalization if needed
82
+ if analysis.requires_personalization and not result.personalization_applied:
83
+ logger.info("Applying personalization...")
84
+ result.answer = self._apply_personalization(
85
+ answer=result.answer,
86
+ analysis=analysis,
87
+ user_email=user_email
88
+ )
89
+ result.personalization_applied = True
90
+
91
+ # Step 4: Validate format
92
+ is_valid, validation_message = self._validate_format(
93
+ result.answer,
94
+ analysis
95
+ )
96
+
97
+ if not is_valid:
98
+ logger.warning(f"Format validation issue: {validation_message}")
99
+ # Try to auto-correct common issues
100
+ result.answer = self._auto_correct_format(
101
+ result.answer,
102
+ analysis,
103
+ validation_message
104
+ )
105
+ logger.info("Applied auto-correction")
106
+
107
+ # Step 5: Check constraints
108
+ constraints_met, violations = self._check_constraints(
109
+ result.answer,
110
+ analysis
111
+ )
112
+
113
+ if not constraints_met:
114
+ logger.warning(f"Constraint violations: {violations}")
115
+ if result.confidence < 0.8:
116
+ raise AnswerGenerationError(
117
+ f"Low confidence ({result.confidence}) with constraint violations: {violations}"
118
+ )
119
+
120
+ logger.info(f"βœ“ Final answer: {result.answer[:100]}...")
121
+
122
+ return result.answer
123
+
124
+ except Exception as e:
125
+ logger.error(f"❌ Answer generation failed: {e}", exc_info=True)
126
+ raise AnswerGenerationError(f"Failed to generate answer: {str(e)}")
127
+
128
+ def _build_generation_context(
129
+ self,
130
+ analysis: 'QuestionAnalysis',
131
+ question_metadata: Dict[str, Any],
132
+ base_url: str,
133
+ user_email: str,
134
+ downloaded_files: List[Dict[str, Any]]
135
+ ) -> str:
136
+ """
137
+ Build comprehensive context prompt for LLM.
138
+
139
+ Returns:
140
+ str: Rich context prompt
141
+ """
142
+ # Format instructions
143
+ instructions_text = "\n".join(
144
+ f"{i+1}. {inst}"
145
+ for i, inst in enumerate(question_metadata.get('instructions', []))
146
+ )
147
+
148
+ # Format key components
149
+ components_text = json.dumps(analysis.key_components, indent=2)
150
+
151
+ # Format constraints
152
+ constraints_text = "\n".join(
153
+ f"- {constraint}"
154
+ for constraint in analysis.critical_constraints
155
+ ) if analysis.critical_constraints else "None specified"
156
+
157
+ # Format files
158
+ files_text = "\n".join(
159
+ f"- {f['filename']} (type: {f['type']}, path: {f['local_path']})"
160
+ for f in downloaded_files
161
+ ) if downloaded_files else "None"
162
+
163
+ # Build personalization info
164
+ personalization_text = "Not required"
165
+ if analysis.requires_personalization:
166
+ personalization_text = f"""
167
+ Required: Yes
168
+ Type: {analysis.personalization_type}
169
+ Details: {analysis.personalization_details}
170
+ User Email: {user_email}
171
+ Email Length: {len(user_email)}
172
+ """
173
+
174
+ # Build complete prompt
175
+ prompt = f"""Generate the exact answer for this technical quiz question.
176
+
177
+ # QUESTION METADATA
178
+ - Title: {question_metadata.get('title', 'Unknown')}
179
+ - Difficulty: {question_metadata.get('difficulty', 'Unknown')}/5
180
+ - Question Type: {analysis.question_type}
181
+ - Answer Format: {analysis.answer_format}
182
+
183
+ # ORIGINAL INSTRUCTIONS
184
+ {instructions_text}
185
+
186
+ # EXTRACTED COMPONENTS
187
+ The following components were extracted from the instructions:
188
+ {components_text}
189
+
190
+ # USER CONTEXT
191
+ - Base URL: {base_url}
192
+ - User Email: {user_email}
193
+
194
+ # PERSONALIZATION
195
+ {personalization_text}
196
+
197
+ # AVAILABLE FILES
198
+ {files_text}
199
+
200
+ # CRITICAL CONSTRAINTS
201
+ {constraints_text}
202
+
203
+ # ANSWER FORMAT REQUIREMENTS
204
+ Format: {analysis.answer_format}
205
+
206
+ """
207
+
208
+ # Add format-specific guidance
209
+ if analysis.answer_format == 'plain_string':
210
+ prompt += """
211
+ Return PLAIN TEXT ONLY:
212
+ - No JSON wrapping
213
+ - No quotes around the answer
214
+ - No extra formatting
215
+ - Just the raw string
216
+ """
217
+ elif analysis.answer_format == 'json_object':
218
+ prompt += """
219
+ Return VALID JSON OBJECT:
220
+ - Must be a dictionary {{"key": "value"}}
221
+ - Properly escaped quotes
222
+ - Valid JSON syntax
223
+ """
224
+ elif analysis.answer_format == 'json_array':
225
+ prompt += """
226
+ Return VALID JSON ARRAY:
227
+ - Must be a list ["item1", "item2"]
228
+ - Properly formatted
229
+ - Valid JSON syntax
230
+ """
231
+ elif analysis.answer_format == 'number':
232
+ prompt += """
233
+ Return NUMBER ONLY:
234
+ - Just the numeric value
235
+ - No units or extra text
236
+ - Integer or float as appropriate
237
+ """
238
+ elif analysis.answer_format == 'single_letter':
239
+ prompt += """
240
+ Return SINGLE LETTER:
241
+ - Just one character (A, B, C, etc.)
242
+ - No explanation or extra text
243
+ """
244
+
245
+ # Add question-type specific guidance
246
+ if analysis.question_type == 'cli_command':
247
+ prompt += """
248
+
249
+ # COMMAND GENERATION GUIDANCE
250
+ - Assemble command from components in correct order
251
+ - Use exact formatting for flags and arguments
252
+ - Pay attention to quote style (single vs double)
253
+ - Include all required parts (tool, subcommand, arguments, flags)
254
+ - Do NOT include shell prompt ($, >, #)
255
+ - Return the COMMAND STRING itself, not its output
256
+ """
257
+ elif analysis.question_type == 'file_path':
258
+ prompt += """
259
+
260
+ # FILE PATH GUIDANCE
261
+ - Return the exact path as specified
262
+ - No markdown formatting []()
263
+ - No HTML tags
264
+ - No quotes unless specifically required
265
+ - Exact string match is critical
266
+ """
267
+
268
+ prompt += """
269
+
270
+ # YOUR TASK
271
+ Generate the EXACT answer that should be submitted based on all the information above.
272
+
273
+ IMPORTANT:
274
+ 1. Use the extracted components to build the answer
275
+ 2. Replace any placeholders with actual values (base_url, user_email)
276
+ 3. Follow ALL critical constraints precisely
277
+ 4. Match the required answer format exactly
278
+ 5. Provide detailed reasoning for your answer
279
+
280
+ Generate the answer now.
281
+ """
282
+
283
+ return prompt
284
+
285
+ async def _generate_with_llm(self, context: str) -> AnswerResult:
286
+ """
287
+ Call LLM to generate answer.
288
+
289
+ Args:
290
+ context: Rich context prompt
291
+
292
+ Returns:
293
+ AnswerResult: Structured answer with reasoning
294
+ """
295
+ try:
296
+ result: AnswerResult = await self.llm_client.run_agent(
297
+ self._generator_agent,
298
+ context
299
+ )
300
+ return result
301
+
302
+ except Exception as e:
303
+ logger.error(f"LLM generation failed: {e}")
304
+ raise AnswerGenerationError(f"LLM generation failed: {str(e)}")
305
+
306
+ def _apply_personalization(
307
+ self,
308
+ answer: str,
309
+ analysis: 'QuestionAnalysis',
310
+ user_email: str
311
+ ) -> str:
312
+ """
313
+ Apply email-based personalization to answer.
314
+
315
+ Args:
316
+ answer: Base answer from LLM
317
+ analysis: Question analysis
318
+ user_email: User's email
319
+
320
+ Returns:
321
+ str: Personalized answer
322
+ """
323
+ if not analysis.requires_personalization:
324
+ return answer
325
+
326
+ email_length = len(user_email)
327
+
328
+ if analysis.personalization_type == 'email_length_offset':
329
+ # Parse offset formula from personalization_details
330
+ # Example: "Add (len(email) mod 5) to base sum"
331
+
332
+ match = re.search(r'mod\s+(\d+)', analysis.personalization_details or '')
333
+ if match:
334
+ mod_value = int(match.group(1))
335
+ offset = email_length % mod_value
336
+
337
+ # Try to parse answer as number and add offset
338
+ try:
339
+ base_value = float(answer)
340
+ final_value = base_value + offset
341
+
342
+ # Return as int if it's a whole number
343
+ if final_value.is_integer():
344
+ return str(int(final_value))
345
+ return str(final_value)
346
+
347
+ except ValueError:
348
+ logger.warning(f"Cannot apply offset to non-numeric answer: {answer}")
349
+ return answer
350
+
351
+ elif analysis.personalization_type == 'email_length_conditional':
352
+ # Example: If even, use option A; if odd, use option B
353
+ # This should already be handled by LLM based on email_length in context
354
+ pass
355
+
356
+ return answer
357
+
358
+ def _validate_format(
359
+ self,
360
+ answer: str,
361
+ analysis: 'QuestionAnalysis'
362
+ ) -> tuple[bool, str]:
363
+ """
364
+ Validate answer matches expected format.
365
+
366
+ Returns:
367
+ tuple: (is_valid, message)
368
+ """
369
+ answer_format = analysis.answer_format
370
+
371
+ if answer_format == 'plain_string':
372
+ # Should not be JSON
373
+ if answer.strip().startswith(('{', '[')):
374
+ return False, "Should be plain string, not JSON"
375
+ return True, "Valid plain string"
376
+
377
+ elif answer_format == 'json_object':
378
+ try:
379
+ parsed = json.loads(answer)
380
+ if not isinstance(parsed, dict):
381
+ return False, "Should be JSON object (dict), not array"
382
+ return True, "Valid JSON object"
383
+ except json.JSONDecodeError as e:
384
+ return False, f"Invalid JSON: {str(e)}"
385
+
386
+ elif answer_format == 'json_array':
387
+ try:
388
+ parsed = json.loads(answer)
389
+ if not isinstance(parsed, list):
390
+ return False, "Should be JSON array (list), not object"
391
+ return True, "Valid JSON array"
392
+ except json.JSONDecodeError as e:
393
+ return False, f"Invalid JSON: {str(e)}"
394
+
395
+ elif answer_format == 'number':
396
+ try:
397
+ float(answer.strip())
398
+ return True, "Valid number"
399
+ except ValueError:
400
+ return False, "Should be a numeric value"
401
+
402
+ elif answer_format == 'single_letter':
403
+ if len(answer.strip()) == 1 and answer.strip().isalpha():
404
+ return True, "Valid single letter"
405
+ return False, "Should be exactly one letter"
406
+
407
+ return True, "Format not strictly validated"
408
+
409
+ def _check_constraints(
410
+ self,
411
+ answer: str,
412
+ analysis: 'QuestionAnalysis'
413
+ ) -> tuple[bool, List[str]]:
414
+ """
415
+ Check answer against critical constraints.
416
+
417
+ Returns:
418
+ tuple: (all_met, violations_list)
419
+ """
420
+ violations = []
421
+
422
+ for constraint in analysis.critical_constraints:
423
+ constraint_lower = constraint.lower()
424
+
425
+ # Check: "command string not output"
426
+ if 'command string' in constraint_lower and 'not output' in constraint_lower:
427
+ if answer.startswith(('$', '>', '#', 'Output:', 'Result:')):
428
+ violations.append("Answer looks like output/prompt, should be command only")
429
+
430
+ # Check: "no markdown formatting"
431
+ if 'no markdown' in constraint_lower or 'no formatting' in constraint_lower:
432
+ if re.search(r'\[.+\]\(.+\)', answer):
433
+ violations.append("Should not have markdown links []() formatting")
434
+
435
+ # Check: "double quotes"
436
+ if 'double quote' in constraint_lower:
437
+ if "'" in answer and '"' not in answer:
438
+ violations.append("Should use double quotes, not single quotes")
439
+
440
+ # Check: "exact string"
441
+ if 'exact string' in constraint_lower:
442
+ # Can't validate without knowing expected value
443
+ pass
444
+
445
+ # Check: "lowercase"
446
+ if 'lowercase' in constraint_lower:
447
+ if answer != answer.lower():
448
+ violations.append("Should be lowercase")
449
+
450
+ # Check: "no quotes" or "plain path"
451
+ if 'no quotes' in constraint_lower or 'plain' in constraint_lower:
452
+ if answer.startswith(('"', "'")) and answer.endswith(('"', "'")):
453
+ violations.append("Should not be wrapped in quotes")
454
+
455
+ return len(violations) == 0, violations
456
+
457
+ def _auto_correct_format(
458
+ self,
459
+ answer: str,
460
+ analysis: 'QuestionAnalysis',
461
+ validation_message: str
462
+ ) -> str:
463
+ """
464
+ Attempt to auto-correct common format issues.
465
+
466
+ Args:
467
+ answer: Original answer
468
+ analysis: Question analysis
469
+ validation_message: What was wrong
470
+
471
+ Returns:
472
+ str: Corrected answer
473
+ """
474
+ corrected = answer
475
+
476
+ # Remove JSON wrapping if should be plain string
477
+ if analysis.answer_format == 'plain_string':
478
+ if corrected.startswith('"') and corrected.endswith('"'):
479
+ corrected = corrected[1:-1]
480
+ if corrected.startswith("'") and corrected.endswith("'"):
481
+ corrected = corrected[1:-1]
482
+
483
+ # Strip whitespace
484
+ corrected = corrected.strip()
485
+
486
+ # Remove shell prompts
487
+ for prefix in ['$ ', '> ', '# ', 'Output: ', 'Result: ']:
488
+ if corrected.startswith(prefix):
489
+ corrected = corrected[len(prefix):]
490
+
491
+ return corrected
492
+
app/services/task_fetcher.py CHANGED
@@ -15,7 +15,7 @@ from app.core.logging import get_logger
15
  from app.core.exceptions import TaskProcessingError
16
  from app.utils.llm_client import get_llm_client
17
  from app.utils.prompts import AnalysisPrompts
18
-
19
  logger = get_logger(__name__)
20
 
21
 
@@ -32,6 +32,7 @@ class TaskFetcher:
32
  self.timeout = timeout
33
  self.client: Optional[httpx.AsyncClient] = None
34
  self.llm_client = get_llm_client()
 
35
 
36
  # Import here to avoid circular imports
37
  from app.orchestrator.models import UnifiedTaskAnalysis
@@ -79,52 +80,85 @@ class TaskFetcher:
79
 
80
  # Step 1: Fetch visible content (with fallback)
81
  content = await self._fetch_content(url)
82
-
83
  logger.debug(f"Task description length after fetch: {len(content['task_description'])}")
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  # Step 2: Unified LLM analysis
86
- analysis = await self._analyze_content_with_llm(
87
- task_description=content['task_description'],
88
- raw_content=content['raw_content'],
89
- url=url,
90
- base_url=base_url
 
 
 
 
91
  )
92
-
93
- # Merge content + analysis
94
  result = {
95
- **content,
96
- 'is_redirect': analysis.is_redirect,
97
- 'question_url': analysis.question_url,
98
- 'submission_url': analysis.submission_url,
99
- 'instructions': self._format_instructions(analysis.instructions),
100
- 'overall_goal': analysis.overall_goal,
101
- 'complexity': analysis.complexity,
102
- 'llm_analysis': {
103
- 'redirect_reasoning': analysis.redirect_reasoning,
104
- 'submission_reasoning': analysis.submission_reasoning,
105
- 'confidence': analysis.confidence,
106
- }
107
  }
 
 
 
 
 
 
 
 
108
 
109
- # Resolve relative submission URL if needed
110
- if analysis.submission_url and analysis.submission_url_is_relative:
111
- absolute = str(httpx.URL(base_url).join(analysis.submission_url))
112
- logger.info(f"βœ“ Resolved relative submission URL: {analysis.submission_url} β†’ {absolute}")
113
- result['submission_url'] = absolute
 
 
 
 
 
 
 
 
 
 
114
 
115
- # Resolve relative question URL if needed
116
- if analysis.question_url and analysis.question_url.startswith('/'):
117
- absolute_q = str(httpx.URL(base_url).join(analysis.question_url))
118
- logger.info(f"βœ“ Resolved relative question URL: {analysis.question_url} β†’ {absolute_q}")
119
- result['question_url'] = absolute_q
120
 
121
- logger.info("βœ… Analysis complete:")
122
- logger.info(f" Is Redirect: {result['is_redirect']}")
123
- logger.info(f" Submission URL: {result['submission_url']}")
124
- logger.info(f" Instructions: {len(result['instructions'])} steps")
125
- logger.info(f" Complexity: {result['complexity']}")
126
 
127
- return result
 
 
 
 
 
 
128
 
129
  # ======================================================================
130
  # FETCHING WITH FALLBACK TO DYNAMIC SCRAPER
@@ -139,30 +173,43 @@ class TaskFetcher:
139
  if not self._is_valid_url(url):
140
  raise TaskProcessingError(f"Invalid URL format: {url}")
141
 
 
 
 
 
 
142
  try:
143
  response = await self._fetch_url(url)
144
  content_type = self._detect_content_type(response)
145
-
 
 
 
 
146
  # Basic extraction
147
- task_description = await self._extract_basic_content(response, content_type)
148
  raw_content = response.text[:5000]
149
 
 
150
  # Heuristic: if nothing useful, try dynamic scraper
151
  if self._looks_js_only(task_description, raw_content):
152
  logger.warning("⚠️ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
153
  dyn = await self._fetch_with_dynamic_scraper(url)
154
  task_description = dyn['task_description']
155
  raw_content = dyn['raw_content']
156
-
 
157
  return {
158
- 'task_description': task_description,
159
- 'raw_content': raw_content,
160
- 'content_type': content_type,
161
- 'url': url,
162
- 'metadata': {
163
- 'content_length': len(response.content),
164
- 'status_code': response.status_code,
165
- }
 
 
166
  }
167
 
168
  except Exception as e:
@@ -212,15 +259,43 @@ class TaskFetcher:
212
  for instruction pages.
213
  """
214
  from app.modules.scrapers.dynamic_scraper import DynamicScraper
 
 
 
 
 
215
 
216
  scraper = DynamicScraper(use_pool=True)
217
  await scraper.initialize()
218
  try:
219
  # Auto-extract text blocks
220
  result = await scraper.scrape_url(url)
 
221
  if not result.success:
222
  raise RuntimeError(result.error or "Dynamic scraping failed")
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  # DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
225
  texts: List[str] = []
226
  if isinstance(result.data, list):
@@ -234,8 +309,10 @@ class TaskFetcher:
234
 
235
  # Best-effort raw_content: you could extend DynamicScraper to return page.content()
236
  return {
237
- 'task_description': task_text,
238
- 'raw_content': task_text[:5000], # at least something readable
 
 
239
  }
240
  finally:
241
  await scraper.cleanup()
@@ -244,31 +321,169 @@ class TaskFetcher:
244
  # BASIC EXTRACTION (NO LLM)
245
  # ======================================================================
246
 
247
- async def _extract_basic_content(self, response: httpx.Response, content_type: str) -> str:
248
- """Fast, no-JS extraction for instruction pages."""
 
 
 
 
 
 
249
  if content_type == 'json':
250
  try:
251
- data = response.json()
252
  for field in ['task', 'description', 'question', 'content', 'text']:
253
  if isinstance(data, dict) and field in data:
254
  return str(data[field])
255
  return json.dumps(data)
256
  except Exception:
257
- return response.text
258
 
259
  if content_type == 'html':
260
  try:
261
- html = response.text
262
- soup = BeautifulSoup(html, 'html.parser')
 
 
263
  for script in soup(['script', 'style', 'nav', 'header', 'footer']):
264
  script.decompose()
 
265
  text = soup.get_text(strip=True, separator=' ')
266
  return text
267
  except Exception as e:
268
  logger.error(f"HTML basic extraction failed: {e}")
269
- return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
- return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
  def _detect_content_type(self, response: httpx.Response) -> str:
274
  ct = response.headers.get('content-type', '').lower()
@@ -320,57 +535,7 @@ class TaskFetcher:
320
  return analysis
321
  except Exception as e:
322
  logger.error(f"❌ LLM analysis failed: {e}", exc_info=True)
323
- return self._fallback_analysis(task_description, all_urls, url, base_url)
324
-
325
- def _fallback_analysis(
326
- self,
327
- task_description: str,
328
- all_urls: List[str],
329
- url: str,
330
- base_url: str
331
- ):
332
- """Very simple fallback if LLM fails."""
333
- from app.orchestrator.models import UnifiedTaskAnalysis, InstructionStep
334
-
335
- logger.warning("⚠️ Using fallback pattern-based analysis")
336
-
337
- is_redirect = False
338
- submission_url = None
339
-
340
- for pattern in [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']:
341
- m = re.search(pattern, task_description, re.IGNORECASE)
342
- if m:
343
- submission_url = m.group(1).rstrip('.,;:)')
344
- break
345
-
346
- sentences = re.split(r'[.;\n]', task_description)
347
- instructions = []
348
- step = 1
349
- for s in sentences:
350
- s = s.strip()
351
- if len(s) > 5:
352
- instructions.append(InstructionStep(
353
- step_number=step,
354
- action='unknown',
355
- description=s,
356
- target=None,
357
- dependencies=[]
358
- ))
359
- step += 1
360
-
361
- return UnifiedTaskAnalysis(
362
- is_redirect=is_redirect,
363
- question_url=None,
364
- redirect_reasoning="Fallback: no redirect detection",
365
- submission_url=submission_url,
366
- submission_url_is_relative=submission_url.startswith('/') if submission_url else False,
367
- submission_reasoning="Fallback: simple regex match",
368
- instructions=instructions,
369
- overall_goal="Unknown (fallback)",
370
- complexity="unknown",
371
- confidence=0.3
372
- )
373
-
374
  def _format_instructions(self, steps) -> List[Dict[str, Any]]:
375
  return [
376
  {
 
15
  from app.core.exceptions import TaskProcessingError
16
  from app.utils.llm_client import get_llm_client
17
  from app.utils.prompts import AnalysisPrompts
18
+ from app.services.analyser import QuestionAnalyzer
19
  logger = get_logger(__name__)
20
 
21
 
 
32
  self.timeout = timeout
33
  self.client: Optional[httpx.AsyncClient] = None
34
  self.llm_client = get_llm_client()
35
+ self.question_analyzer = QuestionAnalyzer(self.llm_client)
36
 
37
  # Import here to avoid circular imports
38
  from app.orchestrator.models import UnifiedTaskAnalysis
 
80
 
81
  # Step 1: Fetch visible content (with fallback)
82
  content = await self._fetch_content(url)
83
+ print(content)
84
  logger.debug(f"Task description length after fetch: {len(content['task_description'])}")
85
 
86
+ file_links = content['question_metadata'].get('file_links', [])
87
+
88
+ if file_links:
89
+ # Download files to disk
90
+ downloaded_files = await self._download_files(
91
+ file_links,
92
+ content['base_url'],
93
94
+ )
95
+ content['downloaded_files'] = downloaded_files
96
+ else:
97
+ content['downloaded_files'] = []
98
+
99
  # Step 2: Unified LLM analysis
100
+ logger.info("πŸ” Analyzing question...")
101
+ if not getattr(self.question_analyzer, "_analyzer_agent", None):
102
+ await self.question_analyzer.initialize()
103
+
104
+ analysis = await self.question_analyzer.analyze_question(
105
+ question_metadata=content["question_metadata"],
106
+ base_url=base_url,
107
+ user_email="[email protected]",
108
+ downloaded_files=content["downloaded_files"]
109
  )
 
 
110
  result = {
111
+ 'analysis': analysis,
112
+ 'question_metadata': content['question_metadata'],
113
+ 'base_url':base_url,
114
+ 'user_email':"23f3003322@ds.study.iitm.ac.in",
115
+ 'downloaded_files':content["downloaded_files"]
116
+
 
 
 
 
 
 
117
  }
118
+
119
+ return result
120
+ # analysis = await self._analyze_content_with_llm(
121
+ # task_description=content['task_description'],
122
+ # raw_content=content['raw_content'],
123
+ # url=url,
124
+ # base_url=base_url
125
+ # )
126
 
127
+ # # Merge content + analysis
128
+ # result = {
129
+ # **content,
130
+ # 'is_redirect': analysis.is_redirect,
131
+ # 'question_url': analysis.question_url,
132
+ # 'submission_url': analysis.submission_url,
133
+ # 'instructions': self._format_instructions(analysis.instructions),
134
+ # 'overall_goal': analysis.overall_goal,
135
+ # 'complexity': analysis.complexity,
136
+ # 'llm_analysis': {
137
+ # 'redirect_reasoning': analysis.redirect_reasoning,
138
+ # 'submission_reasoning': analysis.submission_reasoning,
139
+ # 'confidence': analysis.confidence,
140
+ # }
141
+ # }
142
 
143
+ # # Resolve relative submission URL if needed
144
+ # if analysis.submission_url and analysis.submission_url_is_relative:
145
+ # absolute = str(httpx.URL(base_url).join(analysis.submission_url))
146
+ # logger.info(f"βœ“ Resolved relative submission URL: {analysis.submission_url} β†’ {absolute}")
147
+ # result['submission_url'] = absolute
148
 
149
+ # # Resolve relative question URL if needed
150
+ # if analysis.question_url and analysis.question_url.startswith('/'):
151
+ # absolute_q = str(httpx.URL(base_url).join(analysis.question_url))
152
+ # logger.info(f"βœ“ Resolved relative question URL: {analysis.question_url} β†’ {absolute_q}")
153
+ # result['question_url'] = absolute_q
154
 
155
+ # logger.info("βœ… Analysis complete:")
156
+ # logger.info(f" Is Redirect: {result['is_redirect']}")
157
+ # logger.info(f" Submission URL: {result['submission_url']}")
158
+ # logger.info(f" Instructions: {len(result['instructions'])} steps")
159
+ # logger.info(f" Complexity: {result['complexity']}")
160
+
161
+ # return result
162
 
163
  # ======================================================================
164
  # FETCHING WITH FALLBACK TO DYNAMIC SCRAPER
 
173
  if not self._is_valid_url(url):
174
  raise TaskProcessingError(f"Invalid URL format: {url}")
175
 
176
+ from urllib.parse import urlparse
177
+
178
+ parsed = urlparse(url)
179
+ base_url = f"{parsed.scheme}://{parsed.netloc}"
180
+
181
  try:
182
  response = await self._fetch_url(url)
183
  content_type = self._detect_content_type(response)
184
+ html_content = response.text # ← This is html_content
185
+ html_content = html_content.replace(
186
+ '<span class="origin"></span>',
187
+ base_url
188
+ )
189
  # Basic extraction
190
+ task_description = await self._extract_basic_content_from_html(html_content, content_type)
191
  raw_content = response.text[:5000]
192
 
193
+ metadata = self._parse_question_metadata(html_content)
194
  # Heuristic: if nothing useful, try dynamic scraper
195
  if self._looks_js_only(task_description, raw_content):
196
  logger.warning("⚠️ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
197
  dyn = await self._fetch_with_dynamic_scraper(url)
198
  task_description = dyn['task_description']
199
  raw_content = dyn['raw_content']
200
+ metadata = dyn['question_metadata']
201
+
202
  return {
203
+ 'task_description': task_description,
204
+ 'raw_content': raw_content,
205
+ 'content_type': content_type,
206
+ 'url': url,
207
+ 'base_url': base_url,
208
+ 'question_metadata': metadata, # βœ“ ADDED
209
+ 'metadata': {
210
+ 'content_length': len(response.content),
211
+ 'status_code': response.status_code,
212
+ }
213
  }
214
 
215
  except Exception as e:
 
259
  for instruction pages.
260
  """
261
  from app.modules.scrapers.dynamic_scraper import DynamicScraper
262
+ from urllib.parse import urlparse
263
+
264
+ # Extract base URL
265
+ parsed = urlparse(url)
266
+ base_url = f"{parsed.scheme}://{parsed.netloc}"
267
 
268
  scraper = DynamicScraper(use_pool=True)
269
  await scraper.initialize()
270
  try:
271
  # Auto-extract text blocks
272
  result = await scraper.scrape_url(url)
273
+
274
  if not result.success:
275
  raise RuntimeError(result.error or "Dynamic scraping failed")
276
 
277
+ rendered_html = result.raw_html if hasattr(result, 'raw_html') else None
278
+ if rendered_html:
279
+ rendered_html = rendered_html.replace(
280
+ '<span class="origin"></span>',
281
+ base_url
282
+ )
283
+
284
+ question_metadata = None
285
+ if rendered_html:
286
+ soup = BeautifulSoup(rendered_html, 'html.parser')
287
+ question_metadata = self._parse_question_metadata_from_soup(soup)
288
+ file_links = []
289
+ if rendered_html:
290
+ soup = BeautifulSoup(rendered_html, 'html.parser')
291
+ for a in soup.find_all('a', href=True):
292
+ href = a['href']
293
+ if href.startswith('/project2/'):
294
+ file_links.append({
295
+ 'href': href,
296
+ 'text': a.get_text(strip=True)
297
+ })
298
+
299
  # DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
300
  texts: List[str] = []
301
  if isinstance(result.data, list):
 
309
 
310
  # Best-effort raw_content: you could extend DynamicScraper to return page.content()
311
  return {
312
+ 'task_description': task_text,
313
+ 'raw_content': rendered_html if rendered_html else task_text[:5000],
314
+ 'base_url': base_url,
315
+ 'question_metadata': question_metadata, # NEW
316
  }
317
  finally:
318
  await scraper.cleanup()
 
321
  # BASIC EXTRACTION (NO LLM)
322
  # ======================================================================
323
 
324
+ async def _extract_basic_content_from_html(
325
+ self,
326
+ html_content: str, # ← Changed from response
327
+ content_type: str
328
+ ) -> str:
329
+ """
330
+ Fast extraction from HTML string (no JS execution).
331
+ """
332
  if content_type == 'json':
333
  try:
334
+ data = json.loads(html_content)
335
  for field in ['task', 'description', 'question', 'content', 'text']:
336
  if isinstance(data, dict) and field in data:
337
  return str(data[field])
338
  return json.dumps(data)
339
  except Exception:
340
+ return html_content
341
 
342
  if content_type == 'html':
343
  try:
344
+ from bs4 import BeautifulSoup
345
+ soup = BeautifulSoup(html_content, 'html.parser')
346
+
347
+ # Remove scripts (but origin already replaced before this)
348
  for script in soup(['script', 'style', 'nav', 'header', 'footer']):
349
  script.decompose()
350
+
351
  text = soup.get_text(strip=True, separator=' ')
352
  return text
353
  except Exception as e:
354
  logger.error(f"HTML basic extraction failed: {e}")
355
+ return html_content
356
+
357
+ return html_content
358
+
359
+ def _parse_question_metadata(self, html: str) -> Dict[str, Any]:
360
+ """
361
+ Extract structured metadata from question HTML.
362
+ """
363
+ from bs4 import BeautifulSoup
364
+ soup = BeautifulSoup(html, 'html.parser')
365
+
366
+ metadata = {
367
+ 'title': None,
368
+ 'heading': None,
369
+ 'difficulty': None,
370
+ 'is_personalized': False,
371
+ 'instructions': [],
372
+ 'file_links': []
373
+ }
374
+
375
+ # Extract title
376
+ title_tag = soup.find('title')
377
+ if title_tag:
378
+ metadata['title'] = title_tag.text.strip()
379
+
380
+ # Extract heading
381
+ h1_tag = soup.find('h1')
382
+ if h1_tag:
383
+ metadata['heading'] = h1_tag.text.strip()
384
+
385
+ # Extract difficulty and personalization
386
+ for p in soup.find_all('p'):
387
+ text = p.get_text()
388
+
389
+ # Difficulty: "Difficulty: 1 (next URL revealed even if wrong)"
390
+ if 'Difficulty:' in text:
391
+ import re
392
+ match = re.search(r'Difficulty:\s*(\d+)', text)
393
+ if match:
394
+ metadata['difficulty'] = int(match.group(1))
395
+
396
+ # Personalization: "Personalized: Yes" or "Personalized: No"
397
+ if 'Personalized:' in text:
398
+ metadata['is_personalized'] = 'Yes' in text
399
+
400
+ # Extract ordered instructions
401
+ ol_tag = soup.find('ol')
402
+ if ol_tag:
403
+ for li in ol_tag.find_all('li', recursive=False):
404
+ metadata['instructions'].append(li.get_text(strip=True))
405
+
406
+ # Extract file links
407
+ for a in soup.find_all('a', href=True):
408
+ href = a['href']
409
+ if href.startswith('/project2/'):
410
+ metadata['file_links'].append({
411
+ 'href': href,
412
+ 'text': a.get_text(strip=True)
413
+ })
414
+
415
+ return metadata
416
+
417
+ def _parse_question_metadata_from_soup(self, soup) -> Dict[str, Any]:
418
+ """
419
+ Extract structured metadata from BeautifulSoup object.
420
+ Helper method for both httpx and dynamic scraper paths.
421
+
422
+ Args:
423
+ soup: BeautifulSoup parsed HTML
424
+
425
+ Returns:
426
+ Dict with title, difficulty, personalization, instructions, file_links
427
+ """
428
+ metadata = {
429
+ 'title': None,
430
+ 'heading': None,
431
+ 'difficulty': None,
432
+ 'is_personalized': False,
433
+ 'instructions': [],
434
+ 'file_links': []
435
+ }
436
+
437
+ # Extract title
438
+ title_tag = soup.find('title')
439
+ if title_tag:
440
+ metadata['title'] = title_tag.text.strip()
441
 
442
+ # Extract heading
443
+ h1_tag = soup.find('h1')
444
+ if h1_tag:
445
+ metadata['heading'] = h1_tag.text.strip()
446
+
447
+ # Extract difficulty and personalization from paragraphs
448
+ for p in soup.find_all('p'):
449
+ text = p.get_text()
450
+
451
+ # Parse difficulty: "Difficulty: 1 (next URL revealed even if wrong)"
452
+ if 'Difficulty:' in text or 'difficulty:' in text.lower():
453
+ import re
454
+ match = re.search(r'[Dd]ifficulty:\s*(\d+)', text)
455
+ if match:
456
+ metadata['difficulty'] = int(match.group(1))
457
+ logger.debug(f"Parsed difficulty: {metadata['difficulty']}")
458
+
459
+ # Parse personalization: "Personalized: Yes" or "Personalized: No"
460
+ if 'Personalized:' in text or 'personalized:' in text.lower():
461
+ metadata['is_personalized'] = 'yes' in text.lower()
462
+ logger.debug(f"Parsed personalization: {metadata['is_personalized']}")
463
+
464
+ # Extract ordered instructions from <ol> tag
465
+ ol_tag = soup.find('ol')
466
+ if ol_tag:
467
+ for li in ol_tag.find_all('li', recursive=False):
468
+ instruction_text = li.get_text(separator=' ', strip=True)
469
+ metadata['instructions'].append(instruction_text)
470
+ logger.debug(f"Parsed {len(metadata['instructions'])} instructions")
471
+
472
+ # Extract file links from <a> tags
473
+ for a in soup.find_all('a', href=True):
474
+ href = a['href']
475
+ # Look for project files
476
+ if href.startswith('/project2/') or '/project2/' in href:
477
+ metadata['file_links'].append({
478
+ 'href': href,
479
+ 'text': a.get_text(strip=True)
480
+ })
481
+
482
+ if metadata['file_links']:
483
+ logger.debug(f"Found {len(metadata['file_links'])} file links")
484
+
485
+ return metadata
486
+
487
 
488
  def _detect_content_type(self, response: httpx.Response) -> str:
489
  ct = response.headers.get('content-type', '').lower()
 
535
  return analysis
536
  except Exception as e:
537
  logger.error(f"❌ LLM analysis failed: {e}", exc_info=True)
538
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  def _format_instructions(self, steps) -> List[Dict[str, Any]]:
540
  return [
541
  {
app/services/task_processor.py CHANGED
@@ -5,14 +5,16 @@ Simplified with unified LLM analysis in task_fetcher + AnswerSubmitter integrati
5
 
6
  from typing import Dict, Any, Optional
7
  import asyncio
8
- from app.models.request import TaskRequest
9
  from app.core.logging import get_logger
10
  from app.core.exceptions import TaskProcessingError
11
  from app.orchestrator.orchestrator_engine import OrchestratorEngine
12
  from app.modules import get_fully_loaded_registry # βœ… AUTO-REGISTRATION
13
  from app.services.task_fetcher import TaskFetcher
14
  from app.modules.submitters.answer_submitter import AnswerSubmitter # βœ… NEW
15
-
 
 
16
  logger = get_logger(__name__)
17
 
18
  class TaskProcessor:
@@ -28,13 +30,15 @@ class TaskProcessor:
28
  # βœ… AUTO-REGISTER ALL MODULES
29
  self.registry = get_fully_loaded_registry()
30
  self.answer_submitter = AnswerSubmitter()
 
 
31
 
32
  # Initialize orchestrator engine
33
  self.orchestrator = OrchestratorEngine(self.registry)
34
 
35
  logger.info(f"βœ… TaskProcessor initialized with {len(self.registry.modules)} modules")
36
 
37
- async def process(self, task_data: TaskRequest) -> Dict[str, Any]:
38
  """
39
  Process TDS quiz task - COMPLETE END-TO-END FLOW
40
 
@@ -47,7 +51,7 @@ class TaskProcessor:
47
  6. Build response
48
  """
49
  logger.info("=" * 80)
50
- logger.info(f"πŸ”„ Processing task for: {task_data.email}")
51
  logger.info(f"πŸ“‹ Request URL: {task_data.url}")
52
  logger.info("=" * 80)
53
 
@@ -65,101 +69,31 @@ class TaskProcessor:
65
 
66
  # βœ… FIXED: Use proper async context manager pattern
67
  async with TaskFetcher() as fetcher:
68
- analysis = await fetcher.fetch_and_analyze(url=request_url)
69
-
70
- logger.info(f"βœ“ Request URL analyzed")
71
- logger.info(f" Submission URL: {analysis.get('submission_url')}")
72
-
73
- # Extract key information
74
- task_description = analysis['task_description']
75
- submission_url = analysis.get('submission_url')
76
- instructions = analysis.get('instructions', [])
77
- question_url = request_url # Default to request URL
78
-
79
- logger.info(f"πŸ“ Submission URL: {submission_url}")
80
- logger.info(f"πŸ“‹ Instructions: {len(instructions)} steps")
81
-
82
- # ===================================================================
83
- # STEP 2: EXECUTE ORCHESTRATION (Scrape β†’ Extract β†’ Answer)
84
- # ===================================================================
85
- logger.info("\n" + "=" * 80)
86
- logger.info("STEP 2: EXECUTING ORCHESTRATION")
87
- logger.info("=" * 80)
88
 
89
- orchestration_result = await self.orchestrator.execute_task(
90
- task_input=task_description,
91
- task_url=question_url,
92
- context={
93
- 'email': task_data.email,
94
- 'request_url': request_url,
95
- 'question_url': question_url,
96
- 'submission_url': submission_url,
97
- 'instructions': instructions
98
- }
99
  )
100
-
101
- logger.info(f"βœ“ Orchestration completed")
102
- logger.info(f" Success: {orchestration_result['success']}")
103
-
104
- # ===================================================================
105
- # STEP 3: EXTRACT ANSWER
106
- # ===================================================================
107
- answer = self._extract_answer(orchestration_result)
108
- logger.info(f"βœ“ Answer extracted: {str(answer)[:100]}")
109
-
110
- if not answer or answer == "No answer found":
111
- logger.warning("⚠️ No valid answer extracted")
112
- return self._build_response(
113
- task_data, request_url, question_url, submission_url,
114
- analysis, orchestration_result, None, answer
115
- )
116
 
117
- # ===================================================================
118
- # STEP 4: SUBMIT ANSWER & HANDLE CHAINING
119
- # ===================================================================
120
- logger.info("\n" + "=" * 80)
121
- logger.info("STEP 4: SUBMITTING & CHAINING")
122
- logger.info("=" * 80)
123
-
124
- submission_result = await self.answer_submitter.execute({
125
- 'submission_url': submission_url,
126
- 'email': task_data.email,
127
- 'secret': str(answer),
128
- 'quiz_url': question_url,
129
- 'answer': answer
130
- })
131
-
132
- logger.info(f"βœ“ Submission completed: {getattr(submission_result, 'success', False)}")
133
-
134
- # βœ… ALWAYS check for new URL first
135
- if (hasattr(submission_result, 'data') and
136
- submission_result.data and
137
- (next_url := submission_result.data.get('next_quiz_url'))):
138
-
139
- logger.info(f"πŸ”„ NEW QUIZ DETECTED: {next_url}")
140
-
141
- # βœ… FIXED: Proper background task handling with reference tracking
142
- background_tasks = set()
143
- task = asyncio.create_task(self._process_chained_quiz(task_data.email, next_url, submission_url))
144
- background_tasks.add(task)
145
- task.add_done_callback(background_tasks.discard)
146
-
147
- return {
148
- 'success': True,
149
- 'status': 'chained',
150
- 'message': f'Submitted & chained to next quiz: {next_url}',
151
- 'next_url': next_url,
152
- 'correct': submission_result.data.get('correct', False)
153
- }
154
-
155
- # βœ… No new URL = SUCCESS (whether correct or not)
156
- logger.info("βœ… No new quiz - Task completed successfully")
157
- return {
158
- 'success': True,
159
- 'status': 'completed',
160
- 'message': 'Answer submitted successfully to TDS',
161
- 'correct': getattr(submission_result, 'data', {}).get('correct', False)
162
- }
163
 
164
  except Exception as e:
165
  logger.error(f"❌ Task processing failed: {str(e)}", exc_info=True)
@@ -226,7 +160,7 @@ class TaskProcessor:
226
 
227
  def _build_response(
228
  self,
229
- task_data: TaskRequest,
230
  request_url: str,
231
  question_url: str,
232
  submission_url: str,
 
5
 
6
  from typing import Dict, Any, Optional
7
  import asyncio
8
+ from app.models.request import ManualTriggeredRequestBody
9
  from app.core.logging import get_logger
10
  from app.core.exceptions import TaskProcessingError
11
  from app.orchestrator.orchestrator_engine import OrchestratorEngine
12
  from app.modules import get_fully_loaded_registry # βœ… AUTO-REGISTRATION
13
  from app.services.task_fetcher import TaskFetcher
14
  from app.modules.submitters.answer_submitter import AnswerSubmitter # βœ… NEW
15
+ from app.services.answer_generator import AnswerGenerator
16
+ from app.utils.llm_client import get_llm_client
17
+ from app.utils.submit_answer import submit_answer
18
  logger = get_logger(__name__)
19
 
20
  class TaskProcessor:
 
30
  # βœ… AUTO-REGISTER ALL MODULES
31
  self.registry = get_fully_loaded_registry()
32
  self.answer_submitter = AnswerSubmitter()
33
+ self.llm_client = get_llm_client()
34
+ self.answer_generator = AnswerGenerator(self.llm_client)
35
 
36
  # Initialize orchestrator engine
37
  self.orchestrator = OrchestratorEngine(self.registry)
38
 
39
  logger.info(f"βœ… TaskProcessor initialized with {len(self.registry.modules)} modules")
40
 
41
+ async def process(self, task_data: ManualTriggeredRequestBody) -> Dict[str, Any]:
42
  """
43
  Process TDS quiz task - COMPLETE END-TO-END FLOW
44
 
 
51
  6. Build response
52
  """
53
  logger.info("=" * 80)
54
+ # logger.info(f"πŸ”„ Processing task for: {task_data.email}")
55
  logger.info(f"πŸ“‹ Request URL: {task_data.url}")
56
  logger.info("=" * 80)
57
 
 
69
 
70
  # βœ… FIXED: Use proper async context manager pattern
71
  async with TaskFetcher() as fetcher:
72
+ result = await fetcher.fetch_and_analyze(url=request_url)
73
+ print("========")
74
+ print("analysis")
75
+ print(result)
76
+ # Initialize answer generator if needed
77
+ if not getattr(self.answer_generator, "_generator_agent", None):
78
+ await self.answer_generator.initialize()
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
+ answer = await self.answer_generator.generate(
81
+ analysis=result["analysis"],
82
+ question_metadata=result["question_metadata"],
83
+ base_url=result["base_url"],
84
+ user_email=result["user_email"],
85
+ downloaded_files=result["downloaded_files"]
 
 
 
 
86
  )
87
+ print("================================= answer")
88
+ print(answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ return submit_answer(
91
+ submit_url="https://tds-llm-analysis.s-anand.net/submit",
92
+ answer=answer,
93
+ req_url=request_url,
94
+ background_tasks=None
95
+ )
96
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  except Exception as e:
99
  logger.error(f"❌ Task processing failed: {str(e)}", exc_info=True)
 
160
 
161
  def _build_response(
162
  self,
163
+ task_data: ManualTriggeredRequestBody,
164
  request_url: str,
165
  question_url: str,
166
  submission_url: str,
app/utils/prompts.py CHANGED
@@ -132,6 +132,259 @@ class AnalysisPrompts:
132
 
133
  Now analyze the content above."""
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  @staticmethod
136
  def analysis_planning_prompt(
137
  question: str,
 
132
 
133
  Now analyze the content above."""
134
 
135
+ @staticmethod
136
+ def question_analysis_prompt(
137
+ instructions: List[str],
138
+ difficulty: int,
139
+ is_personalized: bool,
140
+ title: str,
141
+ heading: str,
142
+ base_url: str,
143
+ user_email: str,
144
+ available_files: List[Dict[str, Any]]
145
+ ) -> str:
146
+ """
147
+ Generate prompt for analyzing question.
148
+ Focused on extracting what's needed to generate the answer.
149
+ """
150
+
151
+ files_text = "\n".join(
152
+ f"- {f.get('filename', 'unknown')} ({f.get('type', 'unknown')})"
153
+ for f in available_files
154
+ ) if available_files else "None"
155
+
156
+ instructions_text = "\n".join(
157
+ f"{i+1}. {inst}"
158
+ for i, inst in enumerate(instructions)
159
+ )
160
+
161
+ return f"""Analyze this technical quiz question to determine how to generate the correct answer.
162
+
163
+ # QUESTION METADATA
164
+ - **Title**: {title}
165
+ - **Heading**: {heading}
166
+ - **Difficulty**: {difficulty}/5 (1=easiest, 5=hardest)
167
+ - **Personalized**: {is_personalized}
168
+ - **Base URL**: {base_url}
169
+ - **User Email**: {user_email}
170
+
171
+ # INSTRUCTIONS
172
+ {instructions_text}
173
+
174
+ # AVAILABLE FILES
175
+ {files_text}
176
+
177
+ ---
178
+
179
+ # YOUR ANALYSIS TASK
180
+
181
+ Extract the following information to enable answer generation:
182
+
183
+ ## 1. QUESTION TYPE
184
+ Categorize the task:
185
+ - **cli_command**: Generate command strings (uv, git, curl, docker)
186
+ - **file_path**: Return file paths or URLs
187
+ - **data_processing**: Process CSV/JSON/ZIP files
188
+ - **image_analysis**: Analyze images (colors, pixels, differences)
189
+ - **audio_transcription**: Transcribe audio to text
190
+ - **api_interaction**: Make API calls (GitHub, REST APIs)
191
+ - **document_parsing**: Extract data from PDFs
192
+ - **calculation**: Mathematical computations (sums, F1 scores)
193
+ - **text_generation**: Generate YAML, prompts, configuration
194
+ - **optimization**: Solve constraint/optimization problems
195
+ - **llm_reasoning**: Multi-step reasoning or tool planning
196
+
197
+ ## 2. ANSWER FORMAT
198
+ How should the final answer be formatted?
199
+ - **plain_string**: Raw text, no quotes, no JSON (e.g., "uv http get ...")
200
+ - **json_object**: JSON dictionary (e.g., {{"key": "value"}})
201
+ - **json_array**: JSON list (e.g., ["a", "b", "c"])
202
+ - **number**: Integer or float (e.g., 42 or 3.14)
203
+ - **single_letter**: One character (e.g., A, B, or C)
204
+
205
+ ## 3. KEY COMPONENTS
206
+ Extract specific data needed to generate the answer:
207
+
208
+ **For cli_command:**
209
+ - tool: "uv", "git", "curl"
210
+ - subcommand: "http get", "add", "commit"
211
+ - url_template: Pattern with placeholders
212
+ - flags: ["-H", "-m", "-p"]
213
+ - arguments: Headers, messages, parameters
214
+
215
+ **For file_path:**
216
+ - path: Exact path or pattern
217
+
218
+ **For data_processing:**
219
+ - operations: ["normalize", "filter", "aggregate"]
220
+ - output_format: "json", "csv"
221
+ - sorting: Field and direction
222
+
223
+ **For calculations:**
224
+ - formula: Mathematical expression
225
+ - input_sources: Where data comes from
226
+ - precision: Decimal places
227
+
228
+ **For any type:**
229
+ - Any other relevant details from instructions
230
+
231
+ ## 4. PERSONALIZATION
232
+ Determine if answer depends on user's email:
233
+
234
+ **Types:**
235
+ - **email_in_url**: Email appears in URL (e.g., ?email={{user_email}})
236
+ - **email_length_offset**: offset = len(email) mod N, add to result
237
+ - **email_length_conditional**: Different answer based on email length (even/odd)
238
+
239
+ **Details:**
240
+ - Which mod value? (mod 2, mod 3, mod 5)
241
+ - How to apply? (add to result, choose option)
242
+
243
+ ## 5. FILE REQUIREMENTS
244
+ Does the question need files from available_files list?
245
+ - Which file types? (csv, json, png, pdf, opus, zip)
246
+ - What to do with them? (process, analyze, extract)
247
+
248
+ ## 6. EXTERNAL RESOURCES
249
+ Does the question require fetching from another URL/endpoint?
250
+ - API endpoints mentioned in instructions
251
+ - Data sources not in available_files
252
+ - Example: "Use GitHub API with params in /project2/gh-tree.json"
253
+
254
+ ## 7. CRITICAL CONSTRAINTS
255
+ Extract must-follow rules:
256
+ - "command string" not "command output"
257
+ - Exact decimal places (2, 4)
258
+ - Sorting order (ascending, descending)
259
+ - Case sensitivity (lowercase, uppercase)
260
+ - Separators (comma, space, newline)
261
+ - Quote style ("double", 'single', none)
262
+ - No markdown formatting
263
+ - Specific value ranges
264
+
265
+ ## 8. SUBMISSION URL PATH
266
+ The URL path for THIS specific question (from title/heading).
267
+ Pattern: /project2-{{question-name}}
268
+ Example: /project2-uv, /project2-git, /project2-md
269
+
270
+ ---
271
+
272
+ # EXAMPLES
273
+
274
+ ## Example 1: CLI Command (Q2-like)
275
+
276
+ **Instructions:**
277
+ 1. Craft the command string using uv http get on {{{{base_url}}}}/project2/uv.json?email=<your email>
278
+ 2. Include header Accept: application/json
279
+ 3. POST that exact command string as answer
280
+
281
+ **Analysis:**
282
+ {{
283
+ "question_type": "cli_command",
284
+ "answer_format": "plain_string",
285
+ "key_components": {{
286
+ "tool": "uv",
287
+ "subcommand": "http get",
288
+ "url_template": "{{{{base_url}}}}/project2/uv.json?email={{{{user_email}}}}",
289
+ "headers": [{{"name": "Accept", "value": "application/json"}}],
290
+ "header_flag": "-H"
291
+ }},
292
+ "requires_personalization": true,
293
+ "personalization_type": "email_in_url",
294
+ "personalization_details": "User email in URL query parameter",
295
+ "requires_files": false,
296
+ "required_file_types": [],
297
+ "requires_external_fetch": false,
298
+ "external_resources": [],
299
+ "critical_constraints": [
300
+ "Return command string only, not output",
301
+ "Use double quotes for header value",
302
+ "Format: tool subcommand url -H \"header: value\""
303
+ ],
304
+ "submission_url_path": "/project2-uv",
305
+ "reasoning": "Instructions explicitly ask for 'command string' using specific tool and parameters",
306
+ "confidence": 0.98
307
+ }}
308
+
309
+ text
310
+
311
+ ## Example 2: File Path (Q4-like)
312
+
313
+ **Instructions:**
314
+ 1. The correct relative link target is exactly /project2/data-preparation.md
315
+ 2. Submit that exact string. Do not wrap in Markdown/HTML
316
+
317
+ **Analysis:**
318
+ {{
319
+ "question_type": "file_path",
320
+ "answer_format": "plain_string",
321
+ "key_components": {{
322
+ "path": "/project2/data-preparation.md"
323
+ }},
324
+ "requires_personalization": false,
325
+ "requires_files": false,
326
+ "requires_external_fetch": false,
327
+ "critical_constraints": [
328
+ "Exact string: /project2/data-preparation.md",
329
+ "No markdown formatting",
330
+ "No HTML tags",
331
+ "No quotes"
332
+ ],
333
+ "submission_url_path": "/project2-md",
334
+ "reasoning": "Instructions provide exact path to return",
335
+ "confidence": 1.0
336
+ }}
337
+
338
+ text
339
+
340
+ ## Example 3: Data Processing with Personalization (Q9-like)
341
+
342
+ **Instructions:**
343
+ 1. Download logs.zip and sum bytes where event=="download"
344
+ 2. Compute offset = (length of your email) mod 5
345
+ 3. Final answer = base sum + offset
346
+
347
+ **Available Files:**
348
+ - logs.zip (zip)
349
+
350
+ **Analysis:**
351
+ {{
352
+ "question_type": "data_processing",
353
+ "answer_format": "number",
354
+ "key_components": {{
355
+ "file": "logs.zip",
356
+ "operation": "sum",
357
+ "field": "bytes",
358
+ "filter": {{"event": "download"}},
359
+ "offset_formula": "len(user_email) mod 5"
360
+ }},
361
+ "requires_personalization": true,
362
+ "personalization_type": "email_length_offset",
363
+ "personalization_details": "Add (len(email) mod 5) to base sum",
364
+ "requires_files": true,
365
+ "required_file_types": ["zip"],
366
+ "requires_external_fetch": false,
367
+ "critical_constraints": [
368
+ "Filter: event == 'download'",
369
+ "Sum the bytes field",
370
+ "Add email length offset",
371
+ "Return integer only"
372
+ ],
373
+ "submission_url_path": "/project2-logs",
374
+ "reasoning": "File processing with email-based offset calculation",
375
+ "confidence": 0.92
376
+ }}
377
+
378
+ text
379
+
380
+ ---
381
+
382
+ # NOW ANALYZE
383
+
384
+ Analyze the question above and return a complete QuestionAnalysis object.
385
+ Be precise and extract ALL relevant details from the instructions.
386
+ """
387
+
388
  @staticmethod
389
  def analysis_planning_prompt(
390
  question: str,
app/utils/submit_answer.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Answer Submission Utility
3
+ Handles answer submission and chained quiz processing
4
+ """
5
+
6
+ from datetime import datetime
7
+ from fastapi import HTTPException, BackgroundTasks
8
+ from app.core.logging import get_logger
9
+ from app.models.request import ManualTriggeredRequestBody
10
+ logger = get_logger(__name__)
11
+ import requests
12
+
13
+
14
+ def submit_answer(submit_url: str, req_url: str ,answer:str, background_tasks: BackgroundTasks = None) -> dict:
15
+ """
16
+ Submit an answer to submit_url and trigger the next quiz in the chain if the response contains a follow-up URL.
17
+
18
+ Args:
19
+ submit_url: The URL endpoint to submit the answer to
20
+ req_url: The quiz URL this answer corresponds to
+ answer: The answer string to submit
21
+ background_tasks: FastAPI BackgroundTasks for chained processing
22
+
23
+ Returns:
24
+ The response from the server containing correct status, reason, url, and delay
25
+
26
+ Raises:
27
+ HTTPException on request failure
28
+ """
29
+ try:
30
+ logger.info(f"Submitting answer to {submit_url}")
31
+
32
+ # Get email and secret from environment
33
+ from app.core.config import settings
34
+
35
+ answer_body = {
36
+ "email": settings.USER_EMAIL,
37
+ "secret": settings.API_SECRET,
38
+ "url": req_url,
39
+ "answer": answer
40
+ }
41
+ response = requests.post(submit_url, json=answer_body, timeout=15)
42
+ response.raise_for_status()
43
+
44
+ result = response.json()
45
+ logger.info(f"Submission response: {result}")
46
+
47
+ print(f"[submit_answer] Response from {submit_url}:")
48
+
49
+ print ("="* 8)
50
+ print("answer")
51
+ print(result.get("correct"))
52
+ print(result)
53
+ print ("="* 8)
54
+
55
+ # If response contains a url, process it as the next quiz in background
56
+ if result.get("url"):
57
+ next_url = result["url"]
58
+ logger.info(f"πŸ”— Chained quiz detected: {next_url}")
59
+ print(f"\n[submit_answer] Adding next quiz to background tasks: {next_url}")
60
+
61
+ # If background_tasks available (from FastAPI), use it
62
+ if background_tasks:
63
+ background_tasks.add_task(
64
+ process_next_quiz,
65
+ next_url=next_url,
66
+ email=answer_body.get("email"),
67
+ start_time=datetime.now()
68
+ )
69
+ else:
70
+ # Fallback: run in background thread
71
+ import threading
72
+ thread = threading.Thread(
73
+ target=process_next_quiz,
74
+ args=(next_url, answer_body.get("email"), datetime.now()),
75
+ daemon=True
76
+ )
77
+ thread.start()
78
+ logger.info(f"βœ“ Started background thread for chained quiz")
79
+
80
+ return result
81
+
82
+ except requests.exceptions.RequestException as exc:
83
+ logger.error(f"Failed to submit answer to {submit_url}: {exc}")
84
+ raise HTTPException(status_code=400, detail=f"Submission failed: {exc}")
85
+
86
+
87
+ def process_next_quiz(next_url: str, email: str, start_time: datetime):
88
+ """
89
+ Process the next quiz in the chain as a background task.
90
+
91
+ Args:
92
+ next_url: URL of the next quiz to process
93
+ email: User's email address
94
+ start_time: Start time for tracking
95
+ """
96
+ try:
97
+ logger.info(f"πŸ”„ Processing chained quiz: {next_url}")
98
+
99
+ # Import here to avoid circular dependency
100
+ from app.services.task_processor import TaskProcessor
101
+
102
+ # Create task data for next quiz
103
+ task_data = ManualTriggeredRequestBody(url=next_url)
104
+
105
+ # Process the next quiz
106
+ processor = TaskProcessor()
107
+ import asyncio
108
+ result = asyncio.run(processor.process(task_data))
109
+
110
+ elapsed = (datetime.now() - start_time).total_seconds()
111
+ logger.info(f"βœ… Chained quiz completed in {elapsed:.2f}s")
112
+
113
+ except Exception as e:
114
+ logger.error(f"❌ Failed to process chained quiz: {e}", exc_info=True)