Commit dc1c6a7
1 Parent(s): e071e70
new changes to handle difficulty level 1
- .gitignore +10 -0
- app/__pycache__/main.cpython-313.pyc +0 -0
- app/api/routes/__pycache__/task.cpython-313.pyc +0 -0
- app/api/routes/task.py +32 -41
- app/core/__pycache__/config.cpython-313.pyc +0 -0
- app/core/__pycache__/exceptions.cpython-313.pyc +0 -0
- app/core/config.py +1 -0
- app/core/exceptions.py +4 -0
- app/main.py +9 -9
- app/models/__pycache__/request.cpython-313.pyc +0 -0
- app/models/__pycache__/response.cpython-313.pyc +0 -0
- app/models/analysis.py +99 -0
- app/models/answer.py +34 -0
- app/models/request.py +6 -0
- app/models/response.py +9 -4
- app/modules/scrapers/base_scraper.py +9 -0
- app/modules/scrapers/dynamic_scraper.py +13 -1
- app/services/__pycache__/task_processor.cpython-313.pyc +0 -0
- app/services/analyser.py +149 -0
- app/services/answer_generator.py +492 -0
- app/services/task_fetcher.py +274 -109
- app/services/task_processor.py +31 -97
- app/utils/prompts.py +253 -0
- app/utils/submit_answer.py +114 -0
.gitignore
CHANGED
@@ -41,3 +41,13 @@ Thumbs.db
 
 new.txt
 .DS_Store
+
+
+analysis.md
+data_fetching.md
+dynamic_scraper.md
+orchestrator.md
+questions.md
+task_processor.md
+unified.md
+
app/__pycache__/main.cpython-313.pyc
CHANGED
Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ

app/api/routes/__pycache__/task.cpython-313.pyc
CHANGED
Binary files a/app/api/routes/__pycache__/task.cpython-313.pyc and b/app/api/routes/__pycache__/task.cpython-313.pyc differ
app/api/routes/task.py
CHANGED
@@ -5,15 +5,17 @@ Handles task submission and processing
 
 from fastapi import APIRouter, Request, status, BackgroundTasks, HTTPException
 from datetime import datetime
-from typing import Dict, Any
+# from typing import Dict, Any
 
-from app.models.request import
-from app.models.response import
+from app.models.request import ManualTriggeredRequestBody
+from app.models.response import ImmediateResponse
 from app.core.logging import get_logger
-from app.core.security import verify_authentication, AuthenticationError
+# from app.core.security import verify_authentication, AuthenticationError
 from app.core.exceptions import TaskProcessingError
 from app.services.task_processor import TaskProcessor
 
+import requests
+
 logger = get_logger(__name__)
 
 router = APIRouter()
@@ -36,18 +38,7 @@ async def handle_task(
     request: Request,
     background_tasks: BackgroundTasks
 ):
-    """
-    Main API endpoint for handling task requests
-
-    Flow:
-    1. Validate JSON format (HTTP 400 if invalid)
-    2. Verify secret (HTTP 403 if invalid)
-    3. Respond immediately with HTTP 200
-    4. Process task in background
-
-    Returns:
-        Immediate HTTP 200 response with task accepted message
-    """
+
     start_time = datetime.now()
 
     logger.info("📥 Task request received")
@@ -58,13 +49,13 @@ async def handle_task(
     # ================================================================
     try:
         body = await request.json()
-        task_data =
-    except ValueError as e:
-        logger.error(f"❌ Invalid JSON format: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid JSON format: {str(e)}"
-        )
+        task_data = ManualTriggeredRequestBody(**body)
+    # except ValueError as e:
+    #     logger.error(f"❌ Invalid JSON format: {str(e)}")
+    #     raise HTTPException(
+    #         status_code=status.HTTP_400_BAD_REQUEST,
+    #         detail=f"Invalid JSON format: {str(e)}"
+    #     )
     except Exception as e:
         logger.error(f"❌ Request validation failed: {str(e)}")
         raise HTTPException(
@@ -72,28 +63,26 @@ async def handle_task(
             detail=f"Invalid request data: {str(e)}"
         )
 
-    logger.info(f"✅ Request validated for: {task_data.email}")
 
-    # ================================================================
-    # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
-    # ================================================================
-    logger.info("🔐 Verifying authentication")
-    try:
-        verify_authentication(task_data.secret)
-    except AuthenticationError as e:
-        logger.error(f"❌ Authentication failed: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
-            detail="Invalid secret. Authentication failed."
-        )
+    # # ================================================================
+    # # STEP 2: VERIFY AUTHENTICATION (HTTP 403 if invalid)
+    # # ================================================================
+    # logger.info("🔐 Verifying authentication")
+    # try:
+    #     verify_authentication(task_data.secret)
+    # except AuthenticationError as e:
+    #     logger.error(f"❌ Authentication failed: {str(e)}")
+    #     raise HTTPException(
+    #         status_code=status.HTTP_403_FORBIDDEN,
+    #         detail="Invalid secret. Authentication failed."
+    #     )
 
-    logger.info("✅ Authentication successful")
+    # logger.info("✅ Authentication successful")
 
     # ================================================================
     # STEP 3: RESPOND IMMEDIATELY WITH HTTP 200
     # ================================================================
     logger.info("✅ Request accepted - processing in background")
-
     # Add task processing to background
     background_tasks.add_task(
         process_task_background,
@@ -105,7 +94,6 @@ async def handle_task(
     response = ImmediateResponse(
         success=True,
         message="Task accepted and processing started",
-        email=task_data.email,
         task_url=str(task_data.url),
         status="processing",
         timestamp=datetime.now().isoformat()
@@ -127,8 +115,11 @@ async def handle_task(
     )
 
 
+
+
+
 async def process_task_background(
-    task_data:
+    task_data: ManualTriggeredRequestBody,
     start_time: datetime
 ):
     """
@@ -149,7 +140,7 @@ async def process_task_background(
     try:
         # Process the task
         result_data = await task_processor.process(task_data)
-
+
         # Calculate execution time
         execution_time = (datetime.now() - start_time).total_seconds()
 
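With the authentication block commented out, the endpoint only checks that the body parses into ManualTriggeredRequestBody. A minimal trigger sketch using the requests import added above; the host, port, and route path are assumptions, not visible in this diff:

import requests

# Hypothetical local invocation; only `url` is required by ManualTriggeredRequestBody
resp = requests.post(
    "http://localhost:8000/task",                 # route path assumed
    json={"url": "https://example.com/quiz"},
    timeout=10,
)
print(resp.status_code, resp.json())              # expect an ImmediateResponse payload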
app/core/__pycache__/config.cpython-313.pyc
CHANGED
Binary files a/app/core/__pycache__/config.cpython-313.pyc and b/app/core/__pycache__/config.cpython-313.pyc differ

app/core/__pycache__/exceptions.cpython-313.pyc
CHANGED
Binary files a/app/core/__pycache__/exceptions.cpython-313.pyc and b/app/core/__pycache__/exceptions.cpython-313.pyc differ
app/core/config.py
CHANGED
@@ -26,6 +26,7 @@ class Settings(BaseSettings):
 
     # Security
     API_SECRET: str = Field(default="", env="API_SECRET")
+    USER_EMAIL: str = Field(default="", env="USER_EMAIL")
     ALLOWED_ORIGINS: List[str] = Field(default=["*"], env="ALLOWED_ORIGINS")
 
     # Logging
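USER_EMAIL follows the same Field/env pattern as API_SECRET. A minimal read sketch; the module-level settings instance is assumed from how app/main.py already uses `settings`:

from app.core.config import settings   # import path assumed

print(settings.USER_EMAIL)              # populated from the USER_EMAIL environment variable
print(len(settings.USER_EMAIL))         # the email length later drives the personalization offsets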
app/core/exceptions.py
CHANGED
@@ -19,6 +19,10 @@ class TaskProcessingError(Exception):
     """Raised when task processing fails"""
     pass
 
+class AnswerGenerationError(Exception):
+    """Raised when answer generation fails"""
+    pass
+
 
 class AuthenticationError(Exception):
     """Raised when authentication fails"""
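AnswerGenerationError mirrors the existing exception classes; a minimal sketch of catching it (the message is illustrative):

from app.core.exceptions import AnswerGenerationError

try:
    raise AnswerGenerationError("answer failed format validation")   # illustrative trigger
except AnswerGenerationError as exc:
    print(f"Answer generation failed: {exc}")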
app/main.py
CHANGED
@@ -29,13 +29,13 @@ async def lifespan(app: FastAPI):
     logger.info(f"Environment: {settings.ENVIRONMENT}")
     logger.info("=" * 80)
     import os
-    if os.getenv('ENVIRONMENT') == 'production':
-        from app.modules.scrapers.browser_pool import get_pooled_browser
-        from app.modules.scrapers.browser_config import PRODUCTION_CONFIG
+    # if os.getenv('ENVIRONMENT') == 'production':
+    #     from app.modules.scrapers.browser_pool import get_pooled_browser
+    #     from app.modules.scrapers.browser_config import PRODUCTION_CONFIG
 
-        logger.info("Pre-warming browser pool...")
-        await get_pooled_browser(PRODUCTION_CONFIG)
-        logger.info("✅ Browser pool ready")
+    #     logger.info("Pre-warming browser pool...")
+    #     await get_pooled_browser(PRODUCTION_CONFIG)
+    #     logger.info("✅ Browser pool ready")
 
     yield
 
@@ -57,8 +57,8 @@ def create_application() -> FastAPI:
         description=settings.APP_DESCRIPTION,
         version=settings.APP_VERSION,
         lifespan=lifespan,
-        docs_url="/docs" if settings.ENVIRONMENT == "development" else None,
-        redoc_url="/redoc" if settings.ENVIRONMENT == "development" else None,
+        # docs_url="/docs" if settings.ENVIRONMENT == "development" else None,
+        # redoc_url="/redoc" if settings.ENVIRONMENT == "development" else None,
     )
 
     # Configure CORS
@@ -76,7 +76,7 @@ def create_application() -> FastAPI:
     # Register exception handlers
     register_exception_handlers(app)
     registry = register_all_modules()
-    orchestrator = OrchestratorEngine(registry)
+    # orchestrator = OrchestratorEngine(registry)
 
     # Include routers
     app.include_router(health.router, tags=["Health"])
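The pre-warm block is commented out rather than deleted; if re-enabled it would run inside the async lifespan handler exactly as the commented lines show (the sketch below restates them, nothing new added):

import os

if os.getenv('ENVIRONMENT') == 'production':
    from app.modules.scrapers.browser_pool import get_pooled_browser
    from app.modules.scrapers.browser_config import PRODUCTION_CONFIG

    logger.info("Pre-warming browser pool...")
    await get_pooled_browser(PRODUCTION_CONFIG)    # only valid inside the async lifespan
    logger.info("✅ Browser pool ready")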
app/models/__pycache__/request.cpython-313.pyc
CHANGED
Binary files a/app/models/__pycache__/request.cpython-313.pyc and b/app/models/__pycache__/request.cpython-313.pyc differ

app/models/__pycache__/response.cpython-313.pyc
CHANGED
Binary files a/app/models/__pycache__/response.cpython-313.pyc and b/app/models/__pycache__/response.cpython-313.pyc differ
app/models/analysis.py
ADDED
@@ -0,0 +1,99 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Any, List, Optional, Literal
+
+class QuestionAnalysis(BaseModel):
+    """
+    Analysis focused on generating the correct answer.
+    No redirect/entry page logic needed.
+    """
+
+    # ===== QUESTION CLASSIFICATION =====
+    question_type: Literal[
+        'cli_command',          # Q2, Q3: shell commands
+        'file_path',            # Q4: paths/URLs
+        'data_processing',      # Q7, Q9, Q11: CSV/JSON processing
+        'image_analysis',       # Q6, Q17: image operations
+        'audio_transcription',  # Q5: audio to text
+        'api_interaction',      # Q8: external API calls
+        'document_parsing',     # Q10: PDF extraction
+        'calculation',          # Q20, Q21: mathematical computations
+        'text_generation',      # Q12, Q13, Q19: YAML, prompts
+        'optimization',         # Q14, Q18: constraint solving
+        'llm_reasoning'         # Q16: tool planning/reasoning
+    ] = Field(description="Type of task to solve")
+
+    # ===== ANSWER FORMAT =====
+    answer_format: Literal[
+        'plain_string',   # Q2, Q3, Q4: raw text
+        'json_object',    # Q11, Q14, Q16, Q21: {"key": "value"}
+        'json_array',     # Q20: ["a", "b", "c"]
+        'number',         # Q8, Q9, Q10, Q17, Q18: integer/float
+        'single_letter'   # Q12: A, B, or C
+    ] = Field(description="How to format the final answer")
+
+    # ===== ANSWER COMPONENTS =====
+    key_components: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Extracted components needed to generate answer"
+    )
+
+    # ===== PERSONALIZATION =====
+    requires_personalization: bool = Field(
+        default=False,
+        description="Does answer depend on user email?"
+    )
+
+    personalization_type: Optional[Literal[
+        'email_in_url',              # Q2: ?email=<user_email>
+        'email_length_offset',       # Q8, Q9, Q15, Q18: offset = len(email) mod N
+        'email_length_conditional'   # Q15: if even/odd
+    ]] = None
+
+    personalization_details: Optional[str] = Field(
+        default=None,
+        description="Specific personalization logic"
+    )
+
+    # ===== FILE REQUIREMENTS =====
+    requires_files: bool = Field(
+        default=False,
+        description="Does question need file downloads?"
+    )
+
+    required_file_types: List[str] = Field(
+        default_factory=list,
+        description="File types needed: csv, json, png, pdf, opus, zip"
+    )
+
+    # ===== EXTERNAL RESOURCES =====
+    requires_external_fetch: bool = Field(
+        default=False,
+        description="Need to fetch data from another URL (not just files)?"
+    )
+
+    external_resources: List[str] = Field(
+        default_factory=list,
+        description="URLs/endpoints to fetch before solving"
+    )
+
+    # ===== CRITICAL CONSTRAINTS =====
+    critical_constraints: List[str] = Field(
+        default_factory=list,
+        description="Must-follow rules for answer format"
+    )
+
+    # ===== SUBMISSION INFO =====
+    submission_url_path: str = Field(
+        description="URL path for this question (e.g., '/project2-uv')"
+    )
+
+    # ===== CONFIDENCE & REASONING =====
+    reasoning: str = Field(
+        description="Why this classification and components were chosen"
+    )
+
+    confidence: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Confidence in analysis (0.0-1.0)"
+    )
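A minimal construction sketch for QuestionAnalysis; all values below are illustrative, not taken from a real quiz:

from app.models.analysis import QuestionAnalysis

analysis = QuestionAnalysis(
    question_type="cli_command",
    answer_format="plain_string",
    key_components={"tool": "uv", "flags": ["--version"]},      # illustrative components
    requires_personalization=True,
    personalization_type="email_in_url",
    submission_url_path="/project2-uv",
    reasoning="Instructions ask for a shell command with the email as a query parameter.",
    confidence=0.9,
)
print(analysis.model_dump_json(indent=2))   # pydantic v2 call; use .json() on v1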
app/models/answer.py
ADDED
@@ -0,0 +1,34 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional
+
+class AnswerResult(BaseModel):
+    """Structured output from answer generation"""
+
+    answer: str = Field(
+        description="The exact answer to submit (final output)"
+    )
+
+    reasoning: str = Field(
+        description="Step-by-step explanation of how answer was generated"
+    )
+
+    components_used: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Which components from analysis were used"
+    )
+
+    confidence: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Confidence in answer correctness (0.0-1.0)"
+    )
+
+    personalization_applied: bool = Field(
+        default=False,
+        description="Whether personalization was applied"
+    )
+
+    validation_notes: str = Field(
+        default="",
+        description="Notes about format validation"
+    )
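A matching sketch for AnswerResult; values are illustrative:

from app.models.answer import AnswerResult

result = AnswerResult(
    answer="42",
    reasoning="Summed the CSV column, then added the email-length offset.",   # illustrative
    confidence=0.85,
    personalization_applied=True,
)
assert 0.0 <= result.confidence <= 1.0    # enforced by the Field(ge=..., le=...) bounds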
app/models/request.py
CHANGED
@@ -7,6 +7,12 @@ from typing import Optional, Dict, Any
 from pydantic import BaseModel, Field, EmailStr, HttpUrl, validator
 
 
+class ManualTriggeredRequestBody(BaseModel):
+    """Request body format for quiz submission"""
+    url: str
+
+
+
 class TaskRequest(BaseModel):
     """
     Schema for task request validation
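ManualTriggeredRequestBody deliberately carries only the quiz URL; a one-line validation sketch:

from app.models.request import ManualTriggeredRequestBody

body = ManualTriggeredRequestBody(**{"url": "https://example.com/quiz"})
print(body.url)    # the only field; TaskRequest keeps the richer validation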
app/models/response.py
CHANGED
@@ -7,6 +7,14 @@ from typing import Optional, Dict, Any
 from datetime import datetime
 from pydantic import BaseModel, Field
 
+class SubmissionBody(BaseModel):
+    """Request body format for quiz submission"""
+    email: str
+    secret: str
+    url: str
+    answer: int
+
+
 class ImmediateResponse(BaseModel):
     """
     Immediate response sent after validation
@@ -19,10 +27,7 @@ class ImmediateResponse(BaseModel):
     message: str = Field(
         description="Status message"
     )
-
-    email: str = Field(
-        description="Student email from request"
-    )
+
 
     task_url: str = Field(
         description="Task URL from request"
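A submission payload sketch for SubmissionBody; note that `answer` is typed as int, so non-numeric answer formats would need a looser type or a separate body. Values are illustrative:

from app.models.response import SubmissionBody

payload = SubmissionBody(
    email="student@example.com",
    secret="***",
    url="https://example.com/quiz/q8",
    answer=17,
)
print(payload.model_dump())    # dict ready to POST as JSON (pydantic v2; use .dict() on v1)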
app/modules/scrapers/base_scraper.py
CHANGED
@@ -26,6 +26,7 @@ class ScraperResult(BaseModel):
     encoding: str = "utf-8"
     response_time: float = 0.0
     status_code: int = 200
+    raw_html: Optional[str] = None
 
     # Scraping details
     selectors_used: List[str] = Field(default_factory=list)
@@ -35,6 +36,14 @@ class ScraperResult(BaseModel):
     error: Optional[str] = None
     warnings: List[str] = Field(default_factory=list)
 
+    def __post_init__(self):
+        if self.data is None:
+            self.data = []
+        if self.columns_extracted is None:
+            self.columns_extracted = []
+        if self.selectors_used is None:
+            self.selectors_used = []
+
     class Config:
         arbitrary_types_allowed = True
 
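One caveat worth flagging: ScraperResult is a pydantic BaseModel, and pydantic does not call __post_init__ automatically (that hook belongs to dataclasses; pydantic v2 uses model_post_init), so the None-to-list normalization above may never run. A sketch of the same intent using default_factory, the pattern the class already uses for other fields; the class name here is a stand-in, not the real one:

from typing import List, Optional
from pydantic import BaseModel, Field

class ScraperResultSketch(BaseModel):
    raw_html: Optional[str] = None
    data: List[dict] = Field(default_factory=list)               # never None
    columns_extracted: List[str] = Field(default_factory=list)
    selectors_used: List[str] = Field(default_factory=list)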
app/modules/scrapers/dynamic_scraper.py
CHANGED
@@ -154,6 +154,7 @@ class DynamicScraper(BaseScraper):
         wait_for: Optional[str] = None,
         click_selectors: List[str] = None,
         scroll: bool = False,
+        return_html: bool = True,
         take_screenshot: bool = False,
         **kwargs
     ) -> ScraperResult:
@@ -209,7 +210,7 @@ class DynamicScraper(BaseScraper):
                     url=url,
                     error="Failed to load page"
                 )
-
+
             status_code = response.status
             logger.info(f"Page loaded | Status: {status_code}")
 
@@ -246,6 +247,15 @@ class DynamicScraper(BaseScraper):
             else:
                 data = await self._extract_auto(page)
 
+            if selectors:
+                data = await self._extract_with_selectors(page, selectors)
+            else:
+                data = await self._extract_auto(page)
+
+            rendered_html = None
+            if return_html:
+                rendered_html = await page.content()
+
             # Build result
             columns = list(data[0].keys()) if data else []
 
@@ -257,6 +267,8 @@ class DynamicScraper(BaseScraper):
                 columns_extracted=columns,
                 status_code=status_code,
                 selectors_used=list(selectors.keys()) if selectors else []
+                ,
+                raw_html=rendered_html
             )
 
             logger.info(f"✅ Scraped {len(data)} rows with browser")
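Two observations on this hunk: the added `if selectors: ... else: ...` block duplicates the extraction that already ran just above it, so each page is extracted twice, and `return_html=True` exposes the rendered DOM via `raw_html`. A caller sketch (must run inside an async function; the URL is illustrative):

from app.modules.scrapers.dynamic_scraper import DynamicScraper

scraper = DynamicScraper(use_pool=True)
await scraper.initialize()
try:
    result = await scraper.scrape_url("https://example.com/quiz", return_html=True)
    if result.success and result.raw_html:
        print(f"{len(result.raw_html)} characters of rendered HTML")
finally:
    await scraper.cleanup()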
app/services/__pycache__/task_processor.cpython-313.pyc
CHANGED
Binary files a/app/services/__pycache__/task_processor.cpython-313.pyc and b/app/services/__pycache__/task_processor.cpython-313.pyc differ
app/services/analyser.py
ADDED
@@ -0,0 +1,149 @@
+from typing import Dict, Any, List
+from app.core.logging import get_logger
+from app.core.exceptions import TaskProcessingError
+from app.models.analysis import QuestionAnalysis
+from app.utils.prompts import AnalysisPrompts
+logger = get_logger(__name__)
+
+
+class QuestionAnalyzer:
+    """
+    Analyzes questions to determine how to generate answers.
+    No entry page or redirect logic - assumes all content is solvable questions.
+    """
+
+    def __init__(self, llm_client):
+        """
+        Args:
+            llm_client: LLM client with run_agent() method
+        """
+        self.llm_client = llm_client
+        self._analyzer_agent = None
+
+    async def initialize(self):
+        """Initialize LLM agent"""
+        self._analyzer_agent = self.llm_client.create_agent(
+            output_type=QuestionAnalysis,
+            system_prompt=(
+                "You are an expert at analyzing technical quiz questions. "
+                "Extract precise information needed to generate correct answers. "
+                "Be thorough and accurate."
+            ),
+            retries=2
+        )
+        logger.info("✅ Question analyzer initialized")
+
+    async def analyze_question(
+        self,
+        question_metadata: Dict[str, Any],
+        base_url: str,
+        user_email: str,
+        downloaded_files: List[Dict[str, Any]]
+    ) -> QuestionAnalysis:
+        """
+        Analyze question to determine how to generate the answer.
+
+        Args:
+            question_metadata: Parsed metadata from scraping
+                - title: Question title
+                - heading: Question heading
+                - difficulty: 1-5
+                - is_personalized: bool
+                - instructions: List of instruction strings
+                - file_links: List of file references
+            base_url: Base URL for the quiz
+            user_email: User's email address
+            downloaded_files: List of downloaded file info
+                - filename, type, path, size
+
+        Returns:
+            QuestionAnalysis: Structured analysis
+
+        Raises:
+            TaskProcessingError: If analysis fails
+        """
+        logger.info(f"🤔 Analyzing question: {question_metadata.get('title', 'unknown')}")
+
+        # Build prompt
+        prompt = AnalysisPrompts.question_analysis_prompt(
+            instructions=question_metadata.get('instructions', []),
+            difficulty=question_metadata.get('difficulty', 1),
+            is_personalized=question_metadata.get('is_personalized', False),
+            title=question_metadata.get('title', ''),
+            heading=question_metadata.get('heading', ''),
+            base_url=base_url,
+            user_email=user_email,
+            available_files=downloaded_files
+        )
+
+        try:
+            # Run LLM analysis
+            analysis: QuestionAnalysis = await self.llm_client.run_agent(
+                self._analyzer_agent,
+                prompt
+            )
+
+            # Log analysis results
+            logger.info(f"✅ Question type: {analysis.question_type}")
+            logger.info(f"✅ Answer format: {analysis.answer_format}")
+            logger.info(f"✅ Personalization: {analysis.requires_personalization}")
+            logger.info(f"✅ Files needed: {analysis.requires_files}")
+            logger.info(f"✅ Confidence: {analysis.confidence:.2f}")
+
+            # Validate analysis
+            self._validate_analysis(analysis, question_metadata)
+
+            return analysis
+
+        except Exception as e:
+            logger.error(f"❌ Question analysis failed: {e}", exc_info=True)
+            raise TaskProcessingError(
+                f"Cannot analyze question: {str(e)}. "
+                "LLM analysis is required for unknown question types."
+            )
+
+    def _validate_analysis(
+        self,
+        analysis: QuestionAnalysis,
+        metadata: Dict[str, Any]
+    ):
+        """
+        Validate analysis results make sense.
+
+        Args:
+            analysis: LLM analysis result
+            metadata: Original question metadata
+
+        Raises:
+            TaskProcessingError: If validation fails
+        """
+        # Check confidence threshold
+        if analysis.confidence < 0.5:
+            logger.warning(
+                f"⚠️ Low confidence analysis: {analysis.confidence:.2f}"
+            )
+
+        # Check personalization consistency
+        if metadata.get('is_personalized') and not analysis.requires_personalization:
+            logger.warning(
+                "⚠️ Metadata says personalized but analysis disagrees"
+            )
+
+        # Check file requirements
+        if analysis.requires_files and not analysis.required_file_types:
+            logger.warning(
+                "⚠️ Requires files but no file types specified"
+            )
+
+        # Check submission URL
+        if not analysis.submission_url_path:
+            raise TaskProcessingError(
+                "Analysis missing submission_url_path"
+            )
+
+        if not analysis.submission_url_path.startswith('/'):
+            logger.warning(
+                f"⚠️ Submission URL should start with '/': {analysis.submission_url_path}"
+            )
+
+        logger.debug("✅ Analysis validation passed")
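A minimal driver sketch for QuestionAnalyzer; get_llm_client is the factory used elsewhere in this commit (task_fetcher.py), the metadata values are illustrative, and the calls must run inside an async function:

from app.utils.llm_client import get_llm_client
from app.services.analyser import QuestionAnalyzer

analyzer = QuestionAnalyzer(get_llm_client())
await analyzer.initialize()
analysis = await analyzer.analyze_question(
    question_metadata={
        "title": "Q8",
        "difficulty": 2,
        "is_personalized": True,
        "instructions": ["Call the API and report the item count."],   # illustrative
    },
    base_url="https://example.com/quiz",
    user_email="student@example.com",
    downloaded_files=[],
)
print(analysis.question_type, analysis.answer_format)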
app/services/answer_generator.py
ADDED
@@ -0,0 +1,492 @@
+from typing import Dict, Any, List, Optional
+import json
+import re
+from app.core.logging import get_logger
+from app.core.exceptions import AnswerGenerationError
+from app.models.answer import AnswerResult
+from app.models.analysis import QuestionAnalysis
+logger = get_logger(__name__)
+
+
+class AnswerGenerator:
+    """
+    Generates answers based on question analysis.
+    Uses LLM with rich context for flexibility with unknown questions.
+    """
+
+    def __init__(self, llm_client):
+        """
+        Args:
+            llm_client: LLM client with run_agent() method
+        """
+        self.llm_client = llm_client
+        self._generator_agent = None
+
+    async def initialize(self):
+        """Initialize LLM agent for answer generation"""
+        self._generator_agent = self.llm_client.create_agent(
+            output_type=AnswerResult,
+            system_prompt=(
+                "You are an expert at solving technical quiz questions. "
+                "Generate precise, exact answers based on the provided analysis and context. "
+                "Follow all constraints strictly. "
+                "Be thorough in your reasoning to ensure correctness."
+            ),
+            retries=2
+        )
+        logger.info("✅ Answer generator initialized")
+
+    async def generate(
+        self,
+        analysis: 'QuestionAnalysis',
+        question_metadata: Dict[str, Any],
+        base_url: str,
+        user_email: str,
+        downloaded_files: List[Dict[str, Any]]
+    ) -> str:
+        """
+        Generate answer based on question analysis.
+
+        Args:
+            analysis: Question analysis from analyzer
+            question_metadata: Original metadata with instructions
+            base_url: Base URL for the quiz
+            user_email: User's email address
+            downloaded_files: List of downloaded files with local_path
+
+        Returns:
+            str: Final answer ready to submit
+
+        Raises:
+            AnswerGenerationError: If generation fails
+        """
+        logger.info(f"💡 Generating answer for {analysis.question_type}...")
+
+        try:
+            # Step 1: Build comprehensive context for LLM
+            context = self._build_generation_context(
+                analysis=analysis,
+                question_metadata=question_metadata,
+                base_url=base_url,
+                user_email=user_email,
+                downloaded_files=downloaded_files
+            )
+
+            # Step 2: Generate answer with LLM
+            result = await self._generate_with_llm(context)
+
+            logger.info(f"✅ Generated answer (confidence: {result.confidence:.2f})")
+            logger.debug(f"Reasoning: {result.reasoning}")
+
+            # Step 3: Apply personalization if needed
+            if analysis.requires_personalization and not result.personalization_applied:
+                logger.info("Applying personalization...")
+                result.answer = self._apply_personalization(
+                    answer=result.answer,
+                    analysis=analysis,
+                    user_email=user_email
+                )
+                result.personalization_applied = True
+
+            # Step 4: Validate format
+            is_valid, validation_message = self._validate_format(
+                result.answer,
+                analysis
+            )
+
+            if not is_valid:
+                logger.warning(f"Format validation issue: {validation_message}")
+                # Try to auto-correct common issues
+                result.answer = self._auto_correct_format(
+                    result.answer,
+                    analysis,
+                    validation_message
+                )
+                logger.info("Applied auto-correction")
+
+            # Step 5: Check constraints
+            constraints_met, violations = self._check_constraints(
+                result.answer,
+                analysis
+            )
+
+            if not constraints_met:
+                logger.warning(f"Constraint violations: {violations}")
+                if result.confidence < 0.8:
+                    raise AnswerGenerationError(
+                        f"Low confidence ({result.confidence}) with constraint violations: {violations}"
+                    )
+
+            logger.info(f"✅ Final answer: {result.answer[:100]}...")
+
+            return result.answer
+
+        except Exception as e:
+            logger.error(f"❌ Answer generation failed: {e}", exc_info=True)
+            raise AnswerGenerationError(f"Failed to generate answer: {str(e)}")
+
+    def _build_generation_context(
+        self,
+        analysis: 'QuestionAnalysis',
+        question_metadata: Dict[str, Any],
+        base_url: str,
+        user_email: str,
+        downloaded_files: List[Dict[str, Any]]
+    ) -> str:
+        """
+        Build comprehensive context prompt for LLM.
+
+        Returns:
+            str: Rich context prompt
+        """
+        # Format instructions
+        instructions_text = "\n".join(
+            f"{i+1}. {inst}"
+            for i, inst in enumerate(question_metadata.get('instructions', []))
+        )
+
+        # Format key components
+        components_text = json.dumps(analysis.key_components, indent=2)
+
+        # Format constraints
+        constraints_text = "\n".join(
+            f"- {constraint}"
+            for constraint in analysis.critical_constraints
+        ) if analysis.critical_constraints else "None specified"
+
+        # Format files
+        files_text = "\n".join(
+            f"- {f['filename']} (type: {f['type']}, path: {f['local_path']})"
+            for f in downloaded_files
+        ) if downloaded_files else "None"
+
+        # Build personalization info
+        personalization_text = "Not required"
+        if analysis.requires_personalization:
+            personalization_text = f"""
+Required: Yes
+Type: {analysis.personalization_type}
+Details: {analysis.personalization_details}
+User Email: {user_email}
+Email Length: {len(user_email)}
+"""
+
+        # Build complete prompt
+        prompt = f"""Generate the exact answer for this technical quiz question.
+
+# QUESTION METADATA
+- Title: {question_metadata.get('title', 'Unknown')}
+- Difficulty: {question_metadata.get('difficulty', 'Unknown')}/5
+- Question Type: {analysis.question_type}
+- Answer Format: {analysis.answer_format}
+
+# ORIGINAL INSTRUCTIONS
+{instructions_text}
+
+# EXTRACTED COMPONENTS
+The following components were extracted from the instructions:
+{components_text}
+
+# USER CONTEXT
+- Base URL: {base_url}
+- User Email: {user_email}
+
+# PERSONALIZATION
+{personalization_text}
+
+# AVAILABLE FILES
+{files_text}
+
+# CRITICAL CONSTRAINTS
+{constraints_text}
+
+# ANSWER FORMAT REQUIREMENTS
+Format: {analysis.answer_format}
+
+"""
+
+        # Add format-specific guidance
+        if analysis.answer_format == 'plain_string':
+            prompt += """
+Return PLAIN TEXT ONLY:
+- No JSON wrapping
+- No quotes around the answer
+- No extra formatting
+- Just the raw string
+"""
+        elif analysis.answer_format == 'json_object':
+            prompt += """
+Return VALID JSON OBJECT:
+- Must be a dictionary {{"key": "value"}}
+- Properly escaped quotes
+- Valid JSON syntax
+"""
+        elif analysis.answer_format == 'json_array':
+            prompt += """
+Return VALID JSON ARRAY:
+- Must be a list ["item1", "item2"]
+- Properly formatted
+- Valid JSON syntax
+"""
+        elif analysis.answer_format == 'number':
+            prompt += """
+Return NUMBER ONLY:
+- Just the numeric value
+- No units or extra text
+- Integer or float as appropriate
+"""
+        elif analysis.answer_format == 'single_letter':
+            prompt += """
+Return SINGLE LETTER:
+- Just one character (A, B, C, etc.)
+- No explanation or extra text
+"""
+
+        # Add question-type specific guidance
+        if analysis.question_type == 'cli_command':
+            prompt += """
+
+# COMMAND GENERATION GUIDANCE
+- Assemble command from components in correct order
+- Use exact formatting for flags and arguments
+- Pay attention to quote style (single vs double)
+- Include all required parts (tool, subcommand, arguments, flags)
+- Do NOT include shell prompt ($, >, #)
+- Return the COMMAND STRING itself, not its output
+"""
+        elif analysis.question_type == 'file_path':
+            prompt += """
+
+# FILE PATH GUIDANCE
+- Return the exact path as specified
+- No markdown formatting []()
+- No HTML tags
+- No quotes unless specifically required
+- Exact string match is critical
+"""
+
+        prompt += """
+
+# YOUR TASK
+Generate the EXACT answer that should be submitted based on all the information above.
+
+IMPORTANT:
+1. Use the extracted components to build the answer
+2. Replace any placeholders with actual values (base_url, user_email)
+3. Follow ALL critical constraints precisely
+4. Match the required answer format exactly
+5. Provide detailed reasoning for your answer
+
+Generate the answer now.
+"""
+
+        return prompt
+
+    async def _generate_with_llm(self, context: str) -> AnswerResult:
+        """
+        Call LLM to generate answer.
+
+        Args:
+            context: Rich context prompt
+
+        Returns:
+            AnswerResult: Structured answer with reasoning
+        """
+        try:
+            result: AnswerResult = await self.llm_client.run_agent(
+                self._generator_agent,
+                context
+            )
+            return result
+
+        except Exception as e:
+            logger.error(f"LLM generation failed: {e}")
+            raise AnswerGenerationError(f"LLM generation failed: {str(e)}")
+
+    def _apply_personalization(
+        self,
+        answer: str,
+        analysis: 'QuestionAnalysis',
+        user_email: str
+    ) -> str:
+        """
+        Apply email-based personalization to answer.
+
+        Args:
+            answer: Base answer from LLM
+            analysis: Question analysis
+            user_email: User's email
+
+        Returns:
+            str: Personalized answer
+        """
+        if not analysis.requires_personalization:
+            return answer
+
+        email_length = len(user_email)
+
+        if analysis.personalization_type == 'email_length_offset':
+            # Parse offset formula from personalization_details
+            # Example: "Add (len(email) mod 5) to base sum"
+
+            match = re.search(r'mod\s+(\d+)', analysis.personalization_details or '')
+            if match:
+                mod_value = int(match.group(1))
+                offset = email_length % mod_value
+
+                # Try to parse answer as number and add offset
+                try:
+                    base_value = float(answer)
+                    final_value = base_value + offset
+
+                    # Return as int if it's a whole number
+                    if final_value.is_integer():
+                        return str(int(final_value))
+                    return str(final_value)
+
+                except ValueError:
+                    logger.warning(f"Cannot apply offset to non-numeric answer: {answer}")
+                    return answer
+
+        elif analysis.personalization_type == 'email_length_conditional':
+            # Example: If even, use option A; if odd, use option B
+            # This should already be handled by LLM based on email_length in context
+            pass
+
+        return answer
+
+    def _validate_format(
+        self,
+        answer: str,
+        analysis: 'QuestionAnalysis'
+    ) -> tuple[bool, str]:
+        """
+        Validate answer matches expected format.
+
+        Returns:
+            tuple: (is_valid, message)
+        """
+        answer_format = analysis.answer_format
+
+        if answer_format == 'plain_string':
+            # Should not be JSON
+            if answer.strip().startswith(('{', '[')):
+                return False, "Should be plain string, not JSON"
+            return True, "Valid plain string"
+
+        elif answer_format == 'json_object':
+            try:
+                parsed = json.loads(answer)
+                if not isinstance(parsed, dict):
+                    return False, "Should be JSON object (dict), not array"
+                return True, "Valid JSON object"
+            except json.JSONDecodeError as e:
+                return False, f"Invalid JSON: {str(e)}"
+
+        elif answer_format == 'json_array':
+            try:
+                parsed = json.loads(answer)
+                if not isinstance(parsed, list):
+                    return False, "Should be JSON array (list), not object"
+                return True, "Valid JSON array"
+            except json.JSONDecodeError as e:
+                return False, f"Invalid JSON: {str(e)}"
+
+        elif answer_format == 'number':
+            try:
+                float(answer.strip())
+                return True, "Valid number"
+            except ValueError:
+                return False, "Should be a numeric value"
+
+        elif answer_format == 'single_letter':
+            if len(answer.strip()) == 1 and answer.strip().isalpha():
+                return True, "Valid single letter"
+            return False, "Should be exactly one letter"
+
+        return True, "Format not strictly validated"
+
+    def _check_constraints(
+        self,
+        answer: str,
+        analysis: 'QuestionAnalysis'
+    ) -> tuple[bool, List[str]]:
+        """
+        Check answer against critical constraints.
+
+        Returns:
+            tuple: (all_met, violations_list)
+        """
+        violations = []
+
+        for constraint in analysis.critical_constraints:
+            constraint_lower = constraint.lower()
+
+            # Check: "command string not output"
+            if 'command string' in constraint_lower and 'not output' in constraint_lower:
+                if answer.startswith(('$', '>', '#', 'Output:', 'Result:')):
+                    violations.append("Answer looks like output/prompt, should be command only")
+
+            # Check: "no markdown formatting"
+            if 'no markdown' in constraint_lower or 'no formatting' in constraint_lower:
+                if re.search(r'\[.+\]\(.+\)', answer):
+                    violations.append("Should not have markdown links []() formatting")
+
+            # Check: "double quotes"
+            if 'double quote' in constraint_lower:
+                if "'" in answer and '"' not in answer:
+                    violations.append("Should use double quotes, not single quotes")
+
+            # Check: "exact string"
+            if 'exact string' in constraint_lower:
+                # Can't validate without knowing expected value
+                pass
+
+            # Check: "lowercase"
+            if 'lowercase' in constraint_lower:
+                if answer != answer.lower():
+                    violations.append("Should be lowercase")
+
+            # Check: "no quotes" or "plain path"
+            if 'no quotes' in constraint_lower or 'plain' in constraint_lower:
+                if answer.startswith(('"', "'")) and answer.endswith(('"', "'")):
+                    violations.append("Should not be wrapped in quotes")
+
+        return len(violations) == 0, violations
+
+    def _auto_correct_format(
+        self,
+        answer: str,
+        analysis: 'QuestionAnalysis',
+        validation_message: str
+    ) -> str:
+        """
+        Attempt to auto-correct common format issues.
+
+        Args:
+            answer: Original answer
+            analysis: Question analysis
+            validation_message: What was wrong
+
+        Returns:
+            str: Corrected answer
+        """
+        corrected = answer
+
+        # Remove JSON wrapping if should be plain string
+        if analysis.answer_format == 'plain_string':
+            if corrected.startswith('"') and corrected.endswith('"'):
+                corrected = corrected[1:-1]
+            if corrected.startswith("'") and corrected.endswith("'"):
+                corrected = corrected[1:-1]
+
+        # Strip whitespace
+        corrected = corrected.strip()
+
+        # Remove shell prompts
+        for prefix in ['$ ', '> ', '# ', 'Output: ', 'Result: ']:
+            if corrected.startswith(prefix):
+                corrected = corrected[len(prefix):]
+
+        return corrected
+
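The email-length personalization in _apply_personalization is plain modular arithmetic; a worked sketch with illustrative values:

import re

user_email = "student@example.com"                               # 19 characters
details = "Add (len(email) mod 5) to the base sum"
mod_value = int(re.search(r'mod\s+(\d+)', details).group(1))     # -> 5
offset = len(user_email) % mod_value                             # 19 % 5 = 4
base_value = 38.0
print(int(base_value + offset))                                  # 42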
app/services/task_fetcher.py
CHANGED
|
@@ -15,7 +15,7 @@ from app.core.logging import get_logger
|
|
| 15 |
from app.core.exceptions import TaskProcessingError
|
| 16 |
from app.utils.llm_client import get_llm_client
|
| 17 |
from app.utils.prompts import AnalysisPrompts
|
| 18 |
-
|
| 19 |
logger = get_logger(__name__)
|
| 20 |
|
| 21 |
|
|
@@ -32,6 +32,7 @@ class TaskFetcher:
|
|
| 32 |
self.timeout = timeout
|
| 33 |
self.client: Optional[httpx.AsyncClient] = None
|
| 34 |
self.llm_client = get_llm_client()
|
|
|
|
| 35 |
|
| 36 |
# Import here to avoid circular imports
|
| 37 |
from app.orchestrator.models import UnifiedTaskAnalysis
|
|
@@ -79,52 +80,85 @@ class TaskFetcher:
|
|
| 79 |
|
| 80 |
# Step 1: Fetch visible content (with fallback)
|
| 81 |
content = await self._fetch_content(url)
|
| 82 |
-
|
| 83 |
logger.debug(f"Task description length after fetch: {len(content['task_description'])}")
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
# Step 2: Unified LLM analysis
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
)
|
| 92 |
-
|
| 93 |
-
# Merge content + analysis
|
| 94 |
result = {
|
| 95 |
-
|
| 96 |
-
'
|
| 97 |
-
'
|
| 98 |
-
'
|
| 99 |
-
'
|
| 100 |
-
|
| 101 |
-
'complexity': analysis.complexity,
|
| 102 |
-
'llm_analysis': {
|
| 103 |
-
'redirect_reasoning': analysis.redirect_reasoning,
|
| 104 |
-
'submission_reasoning': analysis.submission_reasoning,
|
| 105 |
-
'confidence': analysis.confidence,
|
| 106 |
-
}
|
| 107 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
#
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
# Resolve relative
|
| 116 |
-
if analysis.
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
logger.info(f"
|
| 125 |
-
|
| 126 |
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
# ======================================================================
|
| 130 |
# FETCHING WITH FALLBACK TO DYNAMIC SCRAPER
|
|
@@ -139,30 +173,43 @@ class TaskFetcher:
|
|
| 139 |
if not self._is_valid_url(url):
|
| 140 |
raise TaskProcessingError(f"Invalid URL format: {url}")
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
try:
|
| 143 |
response = await self._fetch_url(url)
|
| 144 |
content_type = self._detect_content_type(response)
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
# Basic extraction
|
| 147 |
-
task_description = await self.
|
| 148 |
raw_content = response.text[:5000]
|
| 149 |
|
|
|
|
| 150 |
# Heuristic: if nothing useful, try dynamic scraper
|
| 151 |
if self._looks_js_only(task_description, raw_content):
|
| 152 |
logger.warning("β οΈ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
|
| 153 |
dyn = await self._fetch_with_dynamic_scraper(url)
|
| 154 |
task_description = dyn['task_description']
|
| 155 |
raw_content = dyn['raw_content']
|
| 156 |
-
|
|
|
|
| 157 |
return {
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
| 166 |
}
|
| 167 |
|
| 168 |
except Exception as e:
|
|
@@ -212,15 +259,43 @@ class TaskFetcher:
|
|
| 212 |
for instruction pages.
|
| 213 |
"""
|
| 214 |
from app.modules.scrapers.dynamic_scraper import DynamicScraper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
scraper = DynamicScraper(use_pool=True)
|
| 217 |
await scraper.initialize()
|
| 218 |
try:
|
| 219 |
# Auto-extract text blocks
|
| 220 |
result = await scraper.scrape_url(url)
|
|
|
|
| 221 |
if not result.success:
|
| 222 |
raise RuntimeError(result.error or "Dynamic scraping failed")
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
# DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
|
| 225 |
texts: List[str] = []
|
| 226 |
if isinstance(result.data, list):
|
|
@@ -234,8 +309,10 @@ class TaskFetcher:
|
|
| 234 |
|
| 235 |
# Best-effort raw_content: you could extend DynamicScraper to return page.content()
|
| 236 |
return {
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
| 239 |
}
|
| 240 |
finally:
|
| 241 |
await scraper.cleanup()
|
|
@@ -244,31 +321,169 @@ class TaskFetcher:
        # BASIC EXTRACTION (NO LLM)
        # ======================================================================

-   async def
        if content_type == 'json':
            try:
-               data =
                for field in ['task', 'description', 'question', 'content', 'text']:
                    if isinstance(data, dict) and field in data:
                        return str(data[field])
                return json.dumps(data)
            except Exception:
-               return

        if content_type == 'html':
            try:
-               soup = BeautifulSoup(
                for script in soup(['script', 'style', 'nav', 'header', 'footer']):
                    script.decompose()
                text = soup.get_text(strip=True, separator=' ')
                return text
            except Exception as e:
                logger.error(f"HTML basic extraction failed: {e}")
-               return

    def _detect_content_type(self, response: httpx.Response) -> str:
        ct = response.headers.get('content-type', '').lower()
@@ -320,57 +535,7 @@ class TaskFetcher:
            return analysis
        except Exception as e:
            logger.error(f"β LLM analysis failed: {e}", exc_info=True)
-           return
-
-   def _fallback_analysis(
-       self,
-       task_description: str,
-       all_urls: List[str],
-       url: str,
-       base_url: str
-   ):
-       """Very simple fallback if LLM fails."""
-       from app.orchestrator.models import UnifiedTaskAnalysis, InstructionStep
-
-       logger.warning("β οΈ Using fallback pattern-based analysis")
-
-       is_redirect = False
-       submission_url = None
-
-       for pattern in [r'POST\s+(?:to\s+)?([^\s<>"\']+)', r'submit\s+(?:to\s+)?([^\s<>"\']+)']:
-           m = re.search(pattern, task_description, re.IGNORECASE)
-           if m:
-               submission_url = m.group(1).rstrip('.,;:)')
-               break
-
-       sentences = re.split(r'[.;\n]', task_description)
-       instructions = []
-       step = 1
-       for s in sentences:
-           s = s.strip()
-           if len(s) > 5:
-               instructions.append(InstructionStep(
-                   step_number=step,
-                   action='unknown',
-                   description=s,
-                   target=None,
-                   dependencies=[]
-               ))
-               step += 1
-
-       return UnifiedTaskAnalysis(
-           is_redirect=is_redirect,
-           question_url=None,
-           redirect_reasoning="Fallback: no redirect detection",
-           submission_url=submission_url,
-           submission_url_is_relative=submission_url.startswith('/') if submission_url else False,
-           submission_reasoning="Fallback: simple regex match",
-           instructions=instructions,
-           overall_goal="Unknown (fallback)",
-           complexity="unknown",
-           confidence=0.3
-       )
-
    def _format_instructions(self, steps) -> List[Dict[str, Any]]:
        return [
            {
from app.core.exceptions import TaskProcessingError
from app.utils.llm_client import get_llm_client
from app.utils.prompts import AnalysisPrompts
+from app.services.analyser import QuestionAnalyzer
logger = get_logger(__name__)

        self.timeout = timeout
        self.client: Optional[httpx.AsyncClient] = None
        self.llm_client = get_llm_client()
+       self.question_analyzer = QuestionAnalyzer(self.llm_client)

        # Import here to avoid circular imports
        from app.orchestrator.models import UnifiedTaskAnalysis

        # Step 1: Fetch visible content (with fallback)
        content = await self._fetch_content(url)
+       print(content)
        logger.debug(f"Task description length after fetch: {len(content['task_description'])}")

+       file_links = content['question_metadata'].get('file_links', [])
+
+       if file_links:
+           # Download files to disk
+           downloaded_files = await self._download_files(
+               file_links,
+               content['base_url'],
+           )
+           content['downloaded_files'] = downloaded_files
+       else:
+           content['downloaded_files'] = []
+
        # Step 2: Unified LLM analysis
+       logger.info("π Analyzing question...")
+       if not getattr(self.question_analyzer, "_analyzer_agent", None):
+           await self.question_analyzer.initialize()
+
+       analysis = await self.question_analyzer.analyze_question(
+           question_metadata=content["question_metadata"],
+           base_url=base_url,
+           user_email="[email protected]",
+           downloaded_files=content["downloaded_files"]
        )
        result = {
+           'analysis': analysis,
+           'question_metadata': content['question_metadata'],
+           'base_url': base_url,
+           'user_email': "23f3003322@ds.study.iitm.ac.in",
+           'downloaded_files': content["downloaded_files"]
        }

+       return result
+       # analysis = await self._analyze_content_with_llm(
+       #     task_description=content['task_description'],
+       #     raw_content=content['raw_content'],
+       #     url=url,
+       #     base_url=base_url
+       # )

+       # # Merge content + analysis
+       # result = {
+       #     **content,
+       #     'is_redirect': analysis.is_redirect,
+       #     'question_url': analysis.question_url,
+       #     'submission_url': analysis.submission_url,
+       #     'instructions': self._format_instructions(analysis.instructions),
+       #     'overall_goal': analysis.overall_goal,
+       #     'complexity': analysis.complexity,
+       #     'llm_analysis': {
+       #         'redirect_reasoning': analysis.redirect_reasoning,
+       #         'submission_reasoning': analysis.submission_reasoning,
+       #         'confidence': analysis.confidence,
+       #     }
+       # }

+       # # Resolve relative submission URL if needed
+       # if analysis.submission_url and analysis.submission_url_is_relative:
+       #     absolute = str(httpx.URL(base_url).join(analysis.submission_url))
+       #     logger.info(f"β Resolved relative submission URL: {analysis.submission_url} β {absolute}")
+       #     result['submission_url'] = absolute

+       # # Resolve relative question URL if needed
+       # if analysis.question_url and analysis.question_url.startswith('/'):
+       #     absolute_q = str(httpx.URL(base_url).join(analysis.question_url))
+       #     logger.info(f"β Resolved relative question URL: {analysis.question_url} β {absolute_q}")
+       #     result['question_url'] = absolute_q

+       # logger.info("β Analysis complete:")
+       # logger.info(f"   Is Redirect: {result['is_redirect']}")
+       # logger.info(f"   Submission URL: {result['submission_url']}")
+       # logger.info(f"   Instructions: {len(result['instructions'])} steps")
+       # logger.info(f"   Complexity: {result['complexity']}")
+
+       # return result
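The new flow above calls self._download_files(file_links, content['base_url']), which is not part of this commit's hunks. A minimal sketch of what such a helper could look like, assuming httpx and a flat downloads/ directory; the helper name comes from the call site, everything else is an assumption rather than the repository's actual implementation:

# Illustrative sketch only -- assumes httpx and a local downloads/ directory.
import os
import httpx

async def download_files(file_links, base_url, dest_dir="downloads"):
    """Download each linked file and return local paths plus basic metadata."""
    os.makedirs(dest_dir, exist_ok=True)
    downloaded = []
    async with httpx.AsyncClient(timeout=30) as client:
        for link in file_links:
            url = base_url + link['href']                # hrefs look like /project2/logs.zip
            filename = link['href'].rsplit('/', 1)[-1]
            path = os.path.join(dest_dir, filename)
            resp = await client.get(url)
            resp.raise_for_status()
            with open(path, 'wb') as fh:
                fh.write(resp.content)
            downloaded.append({
                'filename': filename,
                'path': path,
                'type': filename.rsplit('.', 1)[-1] if '.' in filename else 'unknown',
            })
    return downloaded

The 'filename' and 'type' keys mirror what the analysis prompt later reads via f.get('filename') and f.get('type'), so a helper shaped like this would slot into the rest of the flow.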

        # ======================================================================
        # FETCHING WITH FALLBACK TO DYNAMIC SCRAPER

        if not self._is_valid_url(url):
            raise TaskProcessingError(f"Invalid URL format: {url}")

+       from urllib.parse import urlparse
+
+       parsed = urlparse(url)
+       base_url = f"{parsed.scheme}://{parsed.netloc}"
+
        try:
            response = await self._fetch_url(url)
            content_type = self._detect_content_type(response)
+           html_content = response.text  # β This is html_content
+           html_content = html_content.replace(
+               '<span class="origin"></span>',
+               base_url
+           )
            # Basic extraction
+           task_description = await self._extract_basic_content_from_html(html_content, content_type)
            raw_content = response.text[:5000]

+           metadata = self._parse_question_metadata(html_content)
            # Heuristic: if nothing useful, try dynamic scraper
            if self._looks_js_only(task_description, raw_content):
                logger.warning("β οΈ Content looks JS-only/empty. Falling back to DynamicScraper for instructions.")
                dyn = await self._fetch_with_dynamic_scraper(url)
                task_description = dyn['task_description']
                raw_content = dyn['raw_content']
+               metadata = dyn['question_metadata']
+
            return {
+               'task_description': task_description,
+               'raw_content': raw_content,
+               'content_type': content_type,
+               'url': url,
+               'base_url': base_url,
+               'question_metadata': metadata,  # β ADDED
+               'metadata': {
+                   'content_length': len(response.content),
+                   'status_code': response.status_code,
+               }
            }

        except Exception as e:
        for instruction pages.
        """
        from app.modules.scrapers.dynamic_scraper import DynamicScraper
+       from urllib.parse import urlparse
+
+       # Extract base URL
+       parsed = urlparse(url)
+       base_url = f"{parsed.scheme}://{parsed.netloc}"

        scraper = DynamicScraper(use_pool=True)
        await scraper.initialize()
        try:
            # Auto-extract text blocks
            result = await scraper.scrape_url(url)
+
            if not result.success:
                raise RuntimeError(result.error or "Dynamic scraping failed")

+           rendered_html = result.raw_html if hasattr(result, 'raw_html') else None
+           if rendered_html:
+               rendered_html = rendered_html.replace(
+                   '<span class="origin"></span>',
+                   base_url
+               )
+
+           question_metadata = None
+           if rendered_html:
+               soup = BeautifulSoup(rendered_html, 'html.parser')
+               question_metadata = self._parse_question_metadata_from_soup(soup)
+           file_links = []
+           if rendered_html:
+               soup = BeautifulSoup(rendered_html, 'html.parser')
+               for a in soup.find_all('a', href=True):
+                   href = a['href']
+                   if href.startswith('/project2/'):
+                       file_links.append({
+                           'href': href,
+                           'text': a.get_text(strip=True)
+                       })
+
            # DynamicScraper._extract_auto returns list of dicts with 'text' for paragraphs
            texts: List[str] = []
            if isinstance(result.data, list):

            # Best-effort raw_content: you could extend DynamicScraper to return page.content()
            return {
+               'task_description': task_text,
+               'raw_content': rendered_html if rendered_html else task_text[:5000],
+               'base_url': base_url,
+               'question_metadata': question_metadata,  # NEW
            }
        finally:
            await scraper.cleanup()
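Both fetch paths above substitute the page's <span class="origin"></span> placeholder with the request's scheme and host before any parsing. A small, self-contained illustration of that substitution; the sample HTML fragment is hypothetical, only the host is taken from the submit URL used elsewhere in this commit:

from urllib.parse import urlparse

url = "https://tds-llm-analysis.s-anand.net/project2-uv"   # example request URL
parsed = urlparse(url)
base_url = f"{parsed.scheme}://{parsed.netloc}"            # -> https://tds-llm-analysis.s-anand.net

html = 'Craft the command using uv http get on <span class="origin"></span>/project2/uv.json'
resolved = html.replace('<span class="origin"></span>', base_url)
# resolved now embeds the absolute origin, so the later text extraction sees a complete URL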
        # BASIC EXTRACTION (NO LLM)
        # ======================================================================

+   async def _extract_basic_content_from_html(
+       self,
+       html_content: str,  # β Changed from response
+       content_type: str
+   ) -> str:
+       """
+       Fast extraction from HTML string (no JS execution).
+       """
        if content_type == 'json':
            try:
+               data = json.loads(html_content)
                for field in ['task', 'description', 'question', 'content', 'text']:
                    if isinstance(data, dict) and field in data:
                        return str(data[field])
                return json.dumps(data)
            except Exception:
+               return html_content

        if content_type == 'html':
            try:
+               from bs4 import BeautifulSoup
+               soup = BeautifulSoup(html_content, 'html.parser')
+
+               # Remove scripts (but origin already replaced before this)
                for script in soup(['script', 'style', 'nav', 'header', 'footer']):
                    script.decompose()
+
                text = soup.get_text(strip=True, separator=' ')
                return text
            except Exception as e:
                logger.error(f"HTML basic extraction failed: {e}")
+               return html_content
+
+       return html_content
+
+   def _parse_question_metadata(self, html: str) -> Dict[str, Any]:
+       """
+       Extract structured metadata from question HTML.
+       """
+       from bs4 import BeautifulSoup
+       soup = BeautifulSoup(html, 'html.parser')
+
+       metadata = {
+           'title': None,
+           'heading': None,
+           'difficulty': None,
+           'is_personalized': False,
+           'instructions': [],
+           'file_links': []
+       }
+
+       # Extract title
+       title_tag = soup.find('title')
+       if title_tag:
+           metadata['title'] = title_tag.text.strip()
+
+       # Extract heading
+       h1_tag = soup.find('h1')
+       if h1_tag:
+           metadata['heading'] = h1_tag.text.strip()
+
+       # Extract difficulty and personalization
+       for p in soup.find_all('p'):
+           text = p.get_text()
+
+           # Difficulty: "Difficulty: 1 (next URL revealed even if wrong)"
+           if 'Difficulty:' in text:
+               import re
+               match = re.search(r'Difficulty:\s*(\d+)', text)
+               if match:
+                   metadata['difficulty'] = int(match.group(1))
+
+           # Personalization: "Personalized: Yes" or "Personalized: No"
+           if 'Personalized:' in text:
+               metadata['is_personalized'] = 'Yes' in text
+
+       # Extract ordered instructions
+       ol_tag = soup.find('ol')
+       if ol_tag:
+           for li in ol_tag.find_all('li', recursive=False):
+               metadata['instructions'].append(li.get_text(strip=True))
+
+       # Extract file links
+       for a in soup.find_all('a', href=True):
+           href = a['href']
+           if href.startswith('/project2/'):
+               metadata['file_links'].append({
+                   'href': href,
+                   'text': a.get_text(strip=True)
+               })
+
+       return metadata
+
+   def _parse_question_metadata_from_soup(self, soup) -> Dict[str, Any]:
+       """
+       Extract structured metadata from BeautifulSoup object.
+       Helper method for both httpx and dynamic scraper paths.
+
+       Args:
+           soup: BeautifulSoup parsed HTML
+
+       Returns:
+           Dict with title, difficulty, personalization, instructions, file_links
+       """
+       metadata = {
+           'title': None,
+           'heading': None,
+           'difficulty': None,
+           'is_personalized': False,
+           'instructions': [],
+           'file_links': []
+       }
+
+       # Extract title
+       title_tag = soup.find('title')
+       if title_tag:
+           metadata['title'] = title_tag.text.strip()

+       # Extract heading
+       h1_tag = soup.find('h1')
+       if h1_tag:
+           metadata['heading'] = h1_tag.text.strip()
+
+       # Extract difficulty and personalization from paragraphs
+       for p in soup.find_all('p'):
+           text = p.get_text()
+
+           # Parse difficulty: "Difficulty: 1 (next URL revealed even if wrong)"
+           if 'Difficulty:' in text or 'difficulty:' in text.lower():
+               import re
+               match = re.search(r'[Dd]ifficulty:\s*(\d+)', text)
+               if match:
+                   metadata['difficulty'] = int(match.group(1))
+                   logger.debug(f"Parsed difficulty: {metadata['difficulty']}")
+
+           # Parse personalization: "Personalized: Yes" or "Personalized: No"
+           if 'Personalized:' in text or 'personalized:' in text.lower():
+               metadata['is_personalized'] = 'yes' in text.lower()
+               logger.debug(f"Parsed personalization: {metadata['is_personalized']}")
+
+       # Extract ordered instructions from <ol> tag
+       ol_tag = soup.find('ol')
+       if ol_tag:
+           for li in ol_tag.find_all('li', recursive=False):
+               instruction_text = li.get_text(separator=' ', strip=True)
+               metadata['instructions'].append(instruction_text)
+           logger.debug(f"Parsed {len(metadata['instructions'])} instructions")
+
+       # Extract file links from <a> tags
+       for a in soup.find_all('a', href=True):
+           href = a['href']
+           # Look for project files
+           if href.startswith('/project2/') or '/project2/' in href:
+               metadata['file_links'].append({
+                   'href': href,
+                   'text': a.get_text(strip=True)
+               })
+
+       if metadata['file_links']:
+           logger.debug(f"Found {len(metadata['file_links'])} file links")
+
+       return metadata
+
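As a concrete illustration of what _parse_question_metadata extracts, here is a hypothetical question page run through the same parsing steps; the HTML below is invented for the example and assumes a TaskFetcher instance can be constructed:

sample_html = """
<html><head><title>Project 2 - UV</title></head>
<body>
  <h1>Craft a uv command</h1>
  <p>Difficulty: 1 (next URL revealed even if wrong)</p>
  <p>Personalized: Yes</p>
  <ol>
    <li>Craft the command string using uv http get ...</li>
    <li>POST that exact command string as answer</li>
  </ol>
  <a href="/project2/uv.json">uv.json</a>
</body></html>
"""

fetcher = TaskFetcher()
meta = fetcher._parse_question_metadata(sample_html)
# Expected shape, following the code above:
# meta['title']           -> 'Project 2 - UV'
# meta['heading']         -> 'Craft a uv command'
# meta['difficulty']      -> 1
# meta['is_personalized'] -> True
# meta['instructions']    -> the two <li> texts
# meta['file_links']      -> [{'href': '/project2/uv.json', 'text': 'uv.json'}]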
    def _detect_content_type(self, response: httpx.Response) -> str:
        ct = response.headers.get('content-type', '').lower()

            return analysis
        except Exception as e:
            logger.error(f"β LLM analysis failed: {e}", exc_info=True)
+           return

    def _format_instructions(self, steps) -> List[Dict[str, Any]]:
        return [
            {
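_fetch_content gates the DynamicScraper fallback on self._looks_js_only(...), a helper that predates this commit and is not shown here. A rough sketch of the kind of heuristic such a check typically implements, offered purely as an assumption about its behavior rather than the project's actual code:

def looks_js_only(task_description: str, raw_content: str) -> bool:
    """Heuristic: treat the page as JS-rendered if almost no visible text survived."""
    text = (task_description or "").strip()
    if len(text) < 40:                       # nearly empty after stripping markup
        return True
    markers = ('enable javascript', 'loading...', '<noscript')
    lowered = (raw_content or "").lower()
    return any(m in lowered for m in markers)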
app/services/task_processor.py
CHANGED
@@ -5,14 +5,16 @@ Simplified with unified LLM analysis in task_fetcher + AnswerSubmitter integrati

from typing import Dict, Any, Optional
import asyncio
-from app.models.request import
from app.core.logging import get_logger
from app.core.exceptions import TaskProcessingError
from app.orchestrator.orchestrator_engine import OrchestratorEngine
from app.modules import get_fully_loaded_registry  # β AUTO-REGISTRATION
from app.services.task_fetcher import TaskFetcher
from app.modules.submitters.answer_submitter import AnswerSubmitter  # β NEW
-
logger = get_logger(__name__)

class TaskProcessor:

@@ -28,13 +30,15 @@ class TaskProcessor:
        # β AUTO-REGISTER ALL MODULES
        self.registry = get_fully_loaded_registry()
        self.answer_submitter = AnswerSubmitter()

        # Initialize orchestrator engine
        self.orchestrator = OrchestratorEngine(self.registry)

        logger.info(f"β TaskProcessor initialized with {len(self.registry.modules)} modules")

-   async def process(self, task_data:
        """
        Process TDS quiz task - COMPLETE END-TO-END FLOW

@@ -47,7 +51,7 @@ class TaskProcessor:
        6. Build response
        """
        logger.info("=" * 80)
-       logger.info(f"π Processing task for: {task_data.email}")
        logger.info(f"π Request URL: {task_data.url}")
        logger.info("=" * 80)

@@ -65,101 +69,31 @@ class TaskProcessor:

            # β FIXED: Use proper async context manager pattern
            async with TaskFetcher() as fetcher:
-               submission_url = analysis.get('submission_url')
-               instructions = analysis.get('instructions', [])
-               question_url = request_url  # Default to request URL
-
-               logger.info(f"π Submission URL: {submission_url}")
-               logger.info(f"π Instructions: {len(instructions)} steps")
-
-               # ===================================================================
-               # STEP 2: EXECUTE ORCHESTRATION (Scrape β Extract β Answer)
-               # ===================================================================
-               logger.info("\n" + "=" * 80)
-               logger.info("STEP 2: EXECUTING ORCHESTRATION")
-               logger.info("=" * 80)

-                       'question_url': question_url,
-                       'submission_url': submission_url,
-                       'instructions': instructions
-                   }
            )
-
-               logger.info(f"   Success: {orchestration_result['success']}")
-
-               # ===================================================================
-               # STEP 3: EXTRACT ANSWER
-               # ===================================================================
-               answer = self._extract_answer(orchestration_result)
-               logger.info(f"β Answer extracted: {str(answer)[:100]}")
-
-               if not answer or answer == "No answer found":
-                   logger.warning("β οΈ No valid answer extracted")
-                   return self._build_response(
-                       task_data, request_url, question_url, submission_url,
-                       analysis, orchestration_result, None, answer
-                   )

-               submission_result = await self.answer_submitter.execute({
-                   'submission_url': submission_url,
-                   'email': task_data.email,
-                   'secret': str(answer),
-                   'quiz_url': question_url,
-                   'answer': answer
-               })
-
-               logger.info(f"β Submission completed: {getattr(submission_result, 'success', False)}")
-
-               # β ALWAYS check for new URL first
-               if (hasattr(submission_result, 'data') and
-                   submission_result.data and
-                   (next_url := submission_result.data.get('next_quiz_url'))):
-
-                   logger.info(f"π NEW QUIZ DETECTED: {next_url}")
-
-                   # β FIXED: Proper background task handling with reference tracking
-                   background_tasks = set()
-                   task = asyncio.create_task(self._process_chained_quiz(task_data.email, next_url, submission_url))
-                   background_tasks.add(task)
-                   task.add_done_callback(background_tasks.discard)
-
-                   return {
-                       'success': True,
-                       'status': 'chained',
-                       'message': f'Submitted & chained to next quiz: {next_url}',
-                       'next_url': next_url,
-                       'correct': submission_result.data.get('correct', False)
-                   }
-
-               # β No new URL = SUCCESS (whether correct or not)
-               logger.info("β No new quiz - Task completed successfully")
-               return {
-                   'success': True,
-                   'status': 'completed',
-                   'message': 'Answer submitted successfully to TDS',
-                   'correct': getattr(submission_result, 'data', {}).get('correct', False)
-               }

        except Exception as e:
            logger.error(f"β Task processing failed: {str(e)}", exc_info=True)
@@ -226,7 +160,7 @@ class TaskProcessor:

    def _build_response(
        self,
-       task_data:
        request_url: str,
        question_url: str,
        submission_url: str,

from typing import Dict, Any, Optional
import asyncio
+from app.models.request import ManualTriggeredRequestBody
from app.core.logging import get_logger
from app.core.exceptions import TaskProcessingError
from app.orchestrator.orchestrator_engine import OrchestratorEngine
from app.modules import get_fully_loaded_registry  # β AUTO-REGISTRATION
from app.services.task_fetcher import TaskFetcher
from app.modules.submitters.answer_submitter import AnswerSubmitter  # β NEW
+from app.services.answer_generator import AnswerGenerator
+from app.utils.llm_client import get_llm_client
+from app.utils.submit_answer import submit_answer
logger = get_logger(__name__)

class TaskProcessor:

        # β AUTO-REGISTER ALL MODULES
        self.registry = get_fully_loaded_registry()
        self.answer_submitter = AnswerSubmitter()
+       self.llm_client = get_llm_client()
+       self.answer_generator = AnswerGenerator(self.llm_client)

        # Initialize orchestrator engine
        self.orchestrator = OrchestratorEngine(self.registry)

        logger.info(f"β TaskProcessor initialized with {len(self.registry.modules)} modules")

+   async def process(self, task_data: ManualTriggeredRequestBody) -> Dict[str, Any]:
        """
        Process TDS quiz task - COMPLETE END-TO-END FLOW

        6. Build response
        """
        logger.info("=" * 80)
+       # logger.info(f"π Processing task for: {task_data.email}")
        logger.info(f"π Request URL: {task_data.url}")
        logger.info("=" * 80)

            # β FIXED: Use proper async context manager pattern
            async with TaskFetcher() as fetcher:
+               result = await fetcher.fetch_and_analyze(url=request_url)
+               print("========")
+               print("analysis")
+               print(result)
+               # Initialize answer generator if needed
+               if not getattr(self.answer_generator, "_generator_agent", None):
+                   await self.answer_generator.initialize()

+               answer = await self.answer_generator.generate(
+                   analysis=result["analysis"],
+                   question_metadata=result["question_metadata"],
+                   base_url=result["base_url"],
+                   user_email=result["user_email"],
+                   downloaded_files=result["downloaded_files"]
                )
+               print("================================= answer")
+               print(answer)

+               return submit_answer(
+                   submit_url="https://tds-llm-analysis.s-anand.net/submit",
+                   answer=answer,
+                   req_url=request_url,
+                   background_tasks=None
+               )
+

        except Exception as e:
            logger.error(f"β Task processing failed: {str(e)}", exc_info=True)

    def _build_response(
        self,
+       task_data: ManualTriggeredRequestBody,
        request_url: str,
        question_url: str,
        submission_url: str,
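Taken together, the rewritten process() path is: fetch and parse the question, run the LLM analysis, generate an answer, then hand it to submit_answer. A condensed sketch of driving that flow for a single quiz URL, assuming the app's settings and LLM client are configured; the URL is only an example:

import asyncio
from app.models.request import ManualTriggeredRequestBody
from app.services.task_processor import TaskProcessor

async def run_one(url: str):
    processor = TaskProcessor()
    task_data = ManualTriggeredRequestBody(url=url)
    # process() fetches the question, analyzes it, generates the answer,
    # and POSTs it via submit_answer(); chained quizzes continue in the background.
    return await processor.process(task_data)

if __name__ == "__main__":
    result = asyncio.run(run_one("https://tds-llm-analysis.s-anand.net/project2-uv"))
    print(result)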
app/utils/prompts.py
CHANGED
@@ -132,6 +132,259 @@ class AnalysisPrompts:

    Now analyze the content above."""

+    @staticmethod
+    def question_analysis_prompt(
+        instructions: List[str],
+        difficulty: int,
+        is_personalized: bool,
+        title: str,
+        heading: str,
+        base_url: str,
+        user_email: str,
+        available_files: List[Dict[str, Any]]
+    ) -> str:
+        """
+        Generate prompt for analyzing question.
+        Focused on extracting what's needed to generate the answer.
+        """
+
+        files_text = "\n".join(
+            f"- {f.get('filename', 'unknown')} ({f.get('type', 'unknown')})"
+            for f in available_files
+        ) if available_files else "None"
+
+        instructions_text = "\n".join(
+            f"{i+1}. {inst}"
+            for i, inst in enumerate(instructions)
+        )
+
+        return f"""Analyze this technical quiz question to determine how to generate the correct answer.
+
+# QUESTION METADATA
+- **Title**: {title}
+- **Heading**: {heading}
+- **Difficulty**: {difficulty}/5 (1=easiest, 5=hardest)
+- **Personalized**: {is_personalized}
+- **Base URL**: {base_url}
+- **User Email**: {user_email}
+
+# INSTRUCTIONS
+{instructions_text}
+
+# AVAILABLE FILES
+{files_text}
+
+---
+
+# YOUR ANALYSIS TASK
+
+Extract the following information to enable answer generation:
+
+## 1. QUESTION TYPE
+Categorize the task:
+- **cli_command**: Generate command strings (uv, git, curl, docker)
+- **file_path**: Return file paths or URLs
+- **data_processing**: Process CSV/JSON/ZIP files
+- **image_analysis**: Analyze images (colors, pixels, differences)
+- **audio_transcription**: Transcribe audio to text
+- **api_interaction**: Make API calls (GitHub, REST APIs)
+- **document_parsing**: Extract data from PDFs
+- **calculation**: Mathematical computations (sums, F1 scores)
+- **text_generation**: Generate YAML, prompts, configuration
+- **optimization**: Solve constraint/optimization problems
+- **llm_reasoning**: Multi-step reasoning or tool planning
+
+## 2. ANSWER FORMAT
+How should the final answer be formatted?
+- **plain_string**: Raw text, no quotes, no JSON (e.g., "uv http get ...")
+- **json_object**: JSON dictionary (e.g., {{"key": "value"}})
+- **json_array**: JSON list (e.g., ["a", "b", "c"])
+- **number**: Integer or float (e.g., 42 or 3.14)
+- **single_letter**: One character (e.g., A, B, or C)
+
+## 3. KEY COMPONENTS
+Extract specific data needed to generate the answer:
+
+**For cli_command:**
+- tool: "uv", "git", "curl"
+- subcommand: "http get", "add", "commit"
+- url_template: Pattern with placeholders
+- flags: ["-H", "-m", "-p"]
+- arguments: Headers, messages, parameters
+
+**For file_path:**
+- path: Exact path or pattern
+
+**For data_processing:**
+- operations: ["normalize", "filter", "aggregate"]
+- output_format: "json", "csv"
+- sorting: Field and direction
+
+**For calculations:**
+- formula: Mathematical expression
+- input_sources: Where data comes from
+- precision: Decimal places
+
+**For any type:**
+- Any other relevant details from instructions
+
+## 4. PERSONALIZATION
+Determine if answer depends on user's email:
+
+**Types:**
+- **email_in_url**: Email appears in URL (e.g., ?email={{user_email}})
+- **email_length_offset**: offset = len(email) mod N, add to result
+- **email_length_conditional**: Different answer based on email length (even/odd)
+
+**Details:**
+- Which mod value? (mod 2, mod 3, mod 5)
+- How to apply? (add to result, choose option)
+
+## 5. FILE REQUIREMENTS
+Does the question need files from available_files list?
+- Which file types? (csv, json, png, pdf, opus, zip)
+- What to do with them? (process, analyze, extract)
+
+## 6. EXTERNAL RESOURCES
+Does the question require fetching from another URL/endpoint?
+- API endpoints mentioned in instructions
+- Data sources not in available_files
+- Example: "Use GitHub API with params in /project2/gh-tree.json"
+
+## 7. CRITICAL CONSTRAINTS
+Extract must-follow rules:
+- "command string" not "command output"
+- Exact decimal places (2, 4)
+- Sorting order (ascending, descending)
+- Case sensitivity (lowercase, uppercase)
+- Separators (comma, space, newline)
+- Quote style ("double", 'single', none)
+- No markdown formatting
+- Specific value ranges
+
+## 8. SUBMISSION URL PATH
+The URL path for THIS specific question (from title/heading).
+Pattern: /project2-{{question-name}}
+Example: /project2-uv, /project2-git, /project2-md
+
+---
+
+# EXAMPLES
+
+## Example 1: CLI Command (Q2-like)
+
+**Instructions:**
+1. Craft the command string using uv http get on {{{{base_url}}}}/project2/uv.json?email=<your email>
+2. Include header Accept: application/json
+3. POST that exact command string as answer
+
+**Analysis:**
+{{
+  "question_type": "cli_command",
+  "answer_format": "plain_string",
+  "key_components": {{
+    "tool": "uv",
+    "subcommand": "http get",
+    "url_template": "{{{{base_url}}}}/project2/uv.json?email={{{{user_email}}}}",
+    "headers": [{{"name": "Accept", "value": "application/json"}}],
+    "header_flag": "-H"
+  }},
+  "requires_personalization": true,
+  "personalization_type": "email_in_url",
+  "personalization_details": "User email in URL query parameter",
+  "requires_files": false,
+  "required_file_types": [],
+  "requires_external_fetch": false,
+  "external_resources": [],
+  "critical_constraints": [
+    "Return command string only, not output",
+    "Use double quotes for header value",
+    "Format: tool subcommand url -H \"header: value\""
+  ],
+  "submission_url_path": "/project2-uv",
+  "reasoning": "Instructions explicitly ask for 'command string' using specific tool and parameters",
+  "confidence": 0.98
+}}
+
+text
+
+## Example 2: File Path (Q4-like)
+
+**Instructions:**
+1. The correct relative link target is exactly /project2/data-preparation.md
+2. Submit that exact string. Do not wrap in Markdown/HTML
+
+**Analysis:**
+{{
+  "question_type": "file_path",
+  "answer_format": "plain_string",
+  "key_components": {{
+    "path": "/project2/data-preparation.md"
+  }},
+  "requires_personalization": false,
+  "requires_files": false,
+  "requires_external_fetch": false,
+  "critical_constraints": [
+    "Exact string: /project2/data-preparation.md",
+    "No markdown formatting",
+    "No HTML tags",
+    "No quotes"
+  ],
+  "submission_url_path": "/project2-md",
+  "reasoning": "Instructions provide exact path to return",
+  "confidence": 1.0
+}}
+
+text
+
+## Example 3: Data Processing with Personalization (Q9-like)
+
+**Instructions:**
+1. Download logs.zip and sum bytes where event=="download"
+2. Compute offset = (length of your email) mod 5
+3. Final answer = base sum + offset
+
+**Available Files:**
+- logs.zip (zip)
+
+**Analysis:**
+{{
+  "question_type": "data_processing",
+  "answer_format": "number",
+  "key_components": {{
+    "file": "logs.zip",
+    "operation": "sum",
+    "field": "bytes",
+    "filter": {{"event": "download"}},
+    "offset_formula": "len(user_email) mod 5"
+  }},
+  "requires_personalization": true,
+  "personalization_type": "email_length_offset",
+  "personalization_details": "Add (len(email) mod 5) to base sum",
+  "requires_files": true,
+  "required_file_types": ["zip"],
+  "requires_external_fetch": false,
+  "critical_constraints": [
+    "Filter: event == 'download'",
+    "Sum the bytes field",
+    "Add email length offset",
+    "Return integer only"
+  ],
+  "submission_url_path": "/project2-logs",
+  "reasoning": "File processing with email-based offset calculation",
+  "confidence": 0.92
+}}
+
+text
+
+---
+
+# NOW ANALYZE
+
+Analyze the question above and return a complete QuestionAnalysis object.
+Be precise and extract ALL relevant details from the instructions.
+"""
+
    @staticmethod
    def analysis_planning_prompt(
        question: str,
app/utils/submit_answer.py
ADDED
@@ -0,0 +1,114 @@
"""
Answer Submission Utility
Handles answer submission and chained quiz processing
"""

from datetime import datetime
from fastapi import HTTPException, BackgroundTasks
from app.core.logging import get_logger
from app.models.request import ManualTriggeredRequestBody
logger = get_logger(__name__)
import requests


def submit_answer(submit_url: str, req_url: str, answer: str, background_tasks: BackgroundTasks = None) -> dict:
    """
    Submits an answer to the provided submit_url and triggers next quiz if URL is returned.

    Args:
        submit_url: The URL endpoint to submit the answer to
        req_url: The quiz URL this answer corresponds to
        answer: The answer payload to submit
        background_tasks: FastAPI BackgroundTasks for chained processing

    Returns:
        The response from the server containing correct status, reason, url, and delay

    Raises:
        HTTPException on request failure
    """
    try:
        logger.info(f"Submitting answer to {submit_url}")

        # Get email and secret from environment
        from app.core.config import settings

        answer_body = {
            "email": settings.USER_EMAIL,
            "secret": settings.API_SECRET,
            "url": req_url,
            "answer": answer
        }
        response = requests.post(submit_url, json=answer_body, timeout=15)
        response.raise_for_status()

        result = response.json()
        logger.info(f"Submission response: {result}")

        print(f"[submit_answer] Response from {submit_url}:")

        print("=" * 8)
        print("answer")
        print(result.get("correct"))
        print(result)
        print("=" * 8)

        # If response contains a url, process it as the next quiz in background
        if result.get("url"):
            next_url = result["url"]
            logger.info(f"π Chained quiz detected: {next_url}")
            print(f"\n[submit_answer] Adding next quiz to background tasks: {next_url}")

            # If background_tasks available (from FastAPI), use it
            if background_tasks:
                background_tasks.add_task(
                    process_next_quiz,
                    next_url=next_url,
                    email=answer_body.get("email"),
                    start_time=datetime.now()
                )
            else:
                # Fallback: run in background thread
                import threading
                thread = threading.Thread(
                    target=process_next_quiz,
                    args=(next_url, answer_body.get("email"), datetime.now()),
                    daemon=True
                )
                thread.start()
                logger.info(f"β Started background thread for chained quiz")

        return result

    except requests.exceptions.RequestException as exc:
        logger.error(f"Failed to submit answer to {submit_url}: {exc}")
        raise HTTPException(status_code=400, detail=f"Submission failed: {exc}")


def process_next_quiz(next_url: str, email: str, start_time: datetime):
    """
    Process the next quiz in the chain as a background task.

    Args:
        next_url: URL of the next quiz to process
        email: User's email address
        start_time: Start time for tracking
    """
    try:
        logger.info(f"π Processing chained quiz: {next_url}")

        # Import here to avoid circular dependency
        from app.services.task_processor import TaskProcessor

        # Create task data for next quiz
        task_data = ManualTriggeredRequestBody(url=next_url)

        # Process the next quiz
        processor = TaskProcessor()
        import asyncio
        result = asyncio.run(processor.process(task_data))

        elapsed = (datetime.now() - start_time).total_seconds()
        logger.info(f"β Chained quiz completed in {elapsed:.2f}s")

    except Exception as e:
        logger.error(f"β Failed to process chained quiz: {e}", exc_info=True)