Spaces:
Sleeping
Sleeping
Commit
Β·
3e70073
1
Parent(s):
c752e2d
module registry
Browse files- app/modules/__init__.py +25 -0
- app/modules/base.py +174 -0
- app/modules/capabilities.py +128 -0
- app/modules/mock_modules.py +273 -0
- app/modules/registry.py +387 -0
- app/orchestrator/models.py +1 -0
- test/test_module_registry.py +398 -0
app/modules/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Processing Modules Package
|
| 3 |
+
Module registry and base interfaces
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from app.modules.base import BaseModule, ModuleCapability, ModuleResult
|
| 7 |
+
from app.modules.registry import ModuleRegistry, ModuleSelector
|
| 8 |
+
from app.modules.capabilities import (
|
| 9 |
+
ScrapingCapability,
|
| 10 |
+
ProcessingCapability,
|
| 11 |
+
VisualizationCapability,
|
| 12 |
+
OutputCapability
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"BaseModule",
|
| 17 |
+
"ModuleCapability",
|
| 18 |
+
"ModuleResult",
|
| 19 |
+
"ModuleRegistry",
|
| 20 |
+
"ModuleSelector",
|
| 21 |
+
"ScrapingCapability",
|
| 22 |
+
"ProcessingCapability",
|
| 23 |
+
"VisualizationCapability",
|
| 24 |
+
"OutputCapability"
|
| 25 |
+
]
|
app/modules/base.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base Module Interface
|
| 3 |
+
All processing modules inherit from this
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any, List, Optional, Set
|
| 7 |
+
from abc import ABC, abstractmethod
|
| 8 |
+
from enum import Enum
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
from app.core.logging import get_logger
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ModuleType(str, Enum):
|
| 17 |
+
"""Types of processing modules"""
|
| 18 |
+
SCRAPER = "scraper"
|
| 19 |
+
PROCESSOR = "processor"
|
| 20 |
+
ANALYZER = "analyzer"
|
| 21 |
+
VISUALIZER = "visualizer"
|
| 22 |
+
EXPORTER = "exporter"
|
| 23 |
+
API_CLIENT = "api_client"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ModuleCapability(BaseModel):
|
| 27 |
+
"""Capability definition for a module"""
|
| 28 |
+
|
| 29 |
+
# What can this module do?
|
| 30 |
+
can_scrape_static: bool = False
|
| 31 |
+
can_scrape_dynamic: bool = False
|
| 32 |
+
can_handle_javascript: bool = False
|
| 33 |
+
can_authenticate: bool = False
|
| 34 |
+
can_handle_api: bool = False
|
| 35 |
+
can_process_data: bool = False
|
| 36 |
+
can_clean_data: bool = False
|
| 37 |
+
can_transform_data: bool = False
|
| 38 |
+
can_aggregate: bool = False
|
| 39 |
+
can_filter: bool = False
|
| 40 |
+
can_sort: bool = False
|
| 41 |
+
can_visualize: bool = False
|
| 42 |
+
can_create_charts: bool = False
|
| 43 |
+
can_create_maps: bool = False
|
| 44 |
+
can_export_csv: bool = False
|
| 45 |
+
can_export_json: bool = False
|
| 46 |
+
can_export_excel: bool = False
|
| 47 |
+
can_export_pdf: bool = False
|
| 48 |
+
|
| 49 |
+
# What data formats can it handle?
|
| 50 |
+
supported_input_formats: Set[str] = Field(default_factory=set)
|
| 51 |
+
supported_output_formats: Set[str] = Field(default_factory=set)
|
| 52 |
+
|
| 53 |
+
# Performance characteristics
|
| 54 |
+
max_concurrent_requests: int = 1
|
| 55 |
+
estimated_speed: str = "medium" # fast, medium, slow
|
| 56 |
+
memory_usage: str = "medium" # low, medium, high
|
| 57 |
+
|
| 58 |
+
# Requirements
|
| 59 |
+
requires_browser: bool = False
|
| 60 |
+
requires_api_key: bool = False
|
| 61 |
+
requires_database: bool = False
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class ModuleResult(BaseModel):
|
| 65 |
+
"""Result from module execution"""
|
| 66 |
+
|
| 67 |
+
success: bool
|
| 68 |
+
data: Any = None
|
| 69 |
+
error: Optional[str] = None
|
| 70 |
+
warnings: List[str] = Field(default_factory=list)
|
| 71 |
+
metadata: Dict[str, Any] = Field(default_factory=dict)
|
| 72 |
+
execution_time: float = 0.0
|
| 73 |
+
|
| 74 |
+
class Config:
|
| 75 |
+
arbitrary_types_allowed = True
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class BaseModule(ABC):
|
| 79 |
+
"""
|
| 80 |
+
Base class for all processing modules
|
| 81 |
+
All modules must inherit from this and implement required methods
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
def __init__(self, name: str, module_type: ModuleType):
|
| 85 |
+
"""
|
| 86 |
+
Initialize base module
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
name: Module name
|
| 90 |
+
module_type: Type of module
|
| 91 |
+
"""
|
| 92 |
+
self.name = name
|
| 93 |
+
self.module_type = module_type
|
| 94 |
+
self.logger = get_logger(f"module.{name}")
|
| 95 |
+
self._is_initialized = False
|
| 96 |
+
|
| 97 |
+
@abstractmethod
|
| 98 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 99 |
+
"""
|
| 100 |
+
Return module capabilities
|
| 101 |
+
|
| 102 |
+
Returns:
|
| 103 |
+
ModuleCapability: What this module can do
|
| 104 |
+
"""
|
| 105 |
+
pass
|
| 106 |
+
|
| 107 |
+
@abstractmethod
|
| 108 |
+
async def execute(
|
| 109 |
+
self,
|
| 110 |
+
parameters: Dict[str, Any],
|
| 111 |
+
context: Optional[Dict[str, Any]] = None
|
| 112 |
+
) -> ModuleResult:
|
| 113 |
+
"""
|
| 114 |
+
Execute module with given parameters
|
| 115 |
+
|
| 116 |
+
Args:
|
| 117 |
+
parameters: Execution parameters
|
| 118 |
+
context: Optional execution context
|
| 119 |
+
|
| 120 |
+
Returns:
|
| 121 |
+
ModuleResult: Execution result
|
| 122 |
+
"""
|
| 123 |
+
pass
|
| 124 |
+
|
| 125 |
+
async def initialize(self) -> bool:
|
| 126 |
+
"""
|
| 127 |
+
Initialize module (load models, connect to services, etc.)
|
| 128 |
+
Override this if your module needs initialization
|
| 129 |
+
|
| 130 |
+
Returns:
|
| 131 |
+
bool: True if initialization successful
|
| 132 |
+
"""
|
| 133 |
+
self._is_initialized = True
|
| 134 |
+
return True
|
| 135 |
+
|
| 136 |
+
async def cleanup(self):
|
| 137 |
+
"""
|
| 138 |
+
Clean up module resources
|
| 139 |
+
Override this if your module needs cleanup
|
| 140 |
+
"""
|
| 141 |
+
self._is_initialized = False
|
| 142 |
+
|
| 143 |
+
def is_initialized(self) -> bool:
|
| 144 |
+
"""Check if module is initialized"""
|
| 145 |
+
return self._is_initialized
|
| 146 |
+
|
| 147 |
+
def can_handle(self, parameters: Dict[str, Any]) -> bool:
|
| 148 |
+
"""
|
| 149 |
+
Check if this module can handle given parameters
|
| 150 |
+
Override this for custom logic
|
| 151 |
+
|
| 152 |
+
Args:
|
| 153 |
+
parameters: Parameters to check
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
bool: True if module can handle these parameters
|
| 157 |
+
"""
|
| 158 |
+
return True
|
| 159 |
+
|
| 160 |
+
def estimate_cost(self, parameters: Dict[str, Any]) -> float:
|
| 161 |
+
"""
|
| 162 |
+
Estimate execution cost/time for given parameters
|
| 163 |
+
Used for module selection
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
parameters: Parameters to estimate for
|
| 167 |
+
|
| 168 |
+
Returns:
|
| 169 |
+
float: Estimated cost (lower is better)
|
| 170 |
+
"""
|
| 171 |
+
return 1.0
|
| 172 |
+
|
| 173 |
+
def __repr__(self) -> str:
|
| 174 |
+
return f"<{self.__class__.__name__}(name='{self.name}', type='{self.module_type}')>"
|
app/modules/capabilities.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Module Capability Definitions
|
| 3 |
+
Pre-defined capability sets for different module types
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from app.modules.base import ModuleCapability
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ScrapingCapability:
|
| 10 |
+
"""Capability definitions for scraping modules"""
|
| 11 |
+
|
| 12 |
+
STATIC = ModuleCapability(
|
| 13 |
+
can_scrape_static=True,
|
| 14 |
+
can_scrape_dynamic=False,
|
| 15 |
+
can_handle_javascript=False,
|
| 16 |
+
can_authenticate=False,
|
| 17 |
+
supported_input_formats={'html', 'xml'},
|
| 18 |
+
supported_output_formats={'json', 'csv', 'dict'},
|
| 19 |
+
max_concurrent_requests=5,
|
| 20 |
+
estimated_speed="fast",
|
| 21 |
+
memory_usage="low",
|
| 22 |
+
requires_browser=False
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
DYNAMIC = ModuleCapability(
|
| 26 |
+
can_scrape_static=True,
|
| 27 |
+
can_scrape_dynamic=True,
|
| 28 |
+
can_handle_javascript=True,
|
| 29 |
+
can_authenticate=True,
|
| 30 |
+
supported_input_formats={'html', 'javascript'},
|
| 31 |
+
supported_output_formats={'json', 'csv', 'dict'},
|
| 32 |
+
max_concurrent_requests=2,
|
| 33 |
+
estimated_speed="medium",
|
| 34 |
+
memory_usage="high",
|
| 35 |
+
requires_browser=True
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
API_CLIENT = ModuleCapability(
|
| 39 |
+
can_handle_api=True,
|
| 40 |
+
supported_input_formats={'api', 'rest', 'graphql'},
|
| 41 |
+
supported_output_formats={'json', 'dict'},
|
| 42 |
+
max_concurrent_requests=10,
|
| 43 |
+
estimated_speed="fast",
|
| 44 |
+
memory_usage="low",
|
| 45 |
+
requires_api_key=True
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class ProcessingCapability:
|
| 50 |
+
"""Capability definitions for data processing modules"""
|
| 51 |
+
|
| 52 |
+
DATA_CLEANER = ModuleCapability(
|
| 53 |
+
can_process_data=True,
|
| 54 |
+
can_clean_data=True,
|
| 55 |
+
supported_input_formats={'csv', 'json', 'dict', 'dataframe'},
|
| 56 |
+
supported_output_formats={'csv', 'json', 'dict', 'dataframe'},
|
| 57 |
+
max_concurrent_requests=1,
|
| 58 |
+
estimated_speed="fast",
|
| 59 |
+
memory_usage="medium"
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
DATA_TRANSFORMER = ModuleCapability(
|
| 63 |
+
can_process_data=True,
|
| 64 |
+
can_transform_data=True,
|
| 65 |
+
can_aggregate=True,
|
| 66 |
+
can_filter=True,
|
| 67 |
+
can_sort=True,
|
| 68 |
+
supported_input_formats={'csv', 'json', 'dict', 'dataframe'},
|
| 69 |
+
supported_output_formats={'csv', 'json', 'dict', 'dataframe'},
|
| 70 |
+
max_concurrent_requests=1,
|
| 71 |
+
estimated_speed="medium",
|
| 72 |
+
memory_usage="medium"
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class VisualizationCapability:
|
| 77 |
+
"""Capability definitions for visualization modules"""
|
| 78 |
+
|
| 79 |
+
CHART_CREATOR = ModuleCapability(
|
| 80 |
+
can_visualize=True,
|
| 81 |
+
can_create_charts=True,
|
| 82 |
+
supported_input_formats={'csv', 'json', 'dict', 'dataframe'},
|
| 83 |
+
supported_output_formats={'png', 'jpg', 'svg', 'html'},
|
| 84 |
+
max_concurrent_requests=1,
|
| 85 |
+
estimated_speed="medium",
|
| 86 |
+
memory_usage="medium"
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
MAP_CREATOR = ModuleCapability(
|
| 90 |
+
can_visualize=True,
|
| 91 |
+
can_create_maps=True,
|
| 92 |
+
supported_input_formats={'csv', 'json', 'dict', 'geojson'},
|
| 93 |
+
supported_output_formats={'html', 'png'},
|
| 94 |
+
max_concurrent_requests=1,
|
| 95 |
+
estimated_speed="slow",
|
| 96 |
+
memory_usage="high"
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class OutputCapability:
|
| 101 |
+
"""Capability definitions for output/export modules"""
|
| 102 |
+
|
| 103 |
+
CSV_EXPORTER = ModuleCapability(
|
| 104 |
+
can_export_csv=True,
|
| 105 |
+
supported_input_formats={'dict', 'list', 'dataframe'},
|
| 106 |
+
supported_output_formats={'csv'},
|
| 107 |
+
max_concurrent_requests=1,
|
| 108 |
+
estimated_speed="fast",
|
| 109 |
+
memory_usage="low"
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
EXCEL_EXPORTER = ModuleCapability(
|
| 113 |
+
can_export_excel=True,
|
| 114 |
+
supported_input_formats={'dict', 'list', 'dataframe'},
|
| 115 |
+
supported_output_formats={'xlsx', 'xls'},
|
| 116 |
+
max_concurrent_requests=1,
|
| 117 |
+
estimated_speed="medium",
|
| 118 |
+
memory_usage="medium"
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
JSON_EXPORTER = ModuleCapability(
|
| 122 |
+
can_export_json=True,
|
| 123 |
+
supported_input_formats={'dict', 'list', 'dataframe'},
|
| 124 |
+
supported_output_formats={'json'},
|
| 125 |
+
max_concurrent_requests=1,
|
| 126 |
+
estimated_speed="fast",
|
| 127 |
+
memory_usage="low"
|
| 128 |
+
)
|
app/modules/mock_modules.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Mock Modules for Testing
|
| 3 |
+
These simulate real modules until Phase 2 modules are built
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any, Optional
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
from app.modules.base import (
|
| 10 |
+
BaseModule,
|
| 11 |
+
ModuleType,
|
| 12 |
+
ModuleCapability,
|
| 13 |
+
ModuleResult
|
| 14 |
+
)
|
| 15 |
+
from app.modules.capabilities import (
|
| 16 |
+
ScrapingCapability,
|
| 17 |
+
ProcessingCapability,
|
| 18 |
+
VisualizationCapability,
|
| 19 |
+
OutputCapability
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
from app.core.logging import get_logger
|
| 23 |
+
from app.modules.registry import ModuleRegistry
|
| 24 |
+
|
| 25 |
+
logger = get_logger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class MockStaticScraper(BaseModule):
|
| 29 |
+
"""Mock static web scraper"""
|
| 30 |
+
|
| 31 |
+
def __init__(self):
|
| 32 |
+
super().__init__(name="static_scraper", module_type=ModuleType.SCRAPER)
|
| 33 |
+
|
| 34 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 35 |
+
return ScrapingCapability.STATIC
|
| 36 |
+
|
| 37 |
+
async def execute(
|
| 38 |
+
self,
|
| 39 |
+
parameters: Dict[str, Any],
|
| 40 |
+
context: Optional[Dict[str, Any]] = None
|
| 41 |
+
) -> ModuleResult:
|
| 42 |
+
"""Mock scraping execution"""
|
| 43 |
+
self.logger.info(f"[MOCK] Scraping static HTML from: {parameters.get('url')}")
|
| 44 |
+
|
| 45 |
+
# Simulate work
|
| 46 |
+
await self._simulate_work(1.0)
|
| 47 |
+
|
| 48 |
+
# Mock data
|
| 49 |
+
mock_data = [
|
| 50 |
+
{'name': 'Product 1', 'price': '$10.99'},
|
| 51 |
+
{'name': 'Product 2', 'price': '$24.99'},
|
| 52 |
+
{'name': 'Product 3', 'price': '$15.49'}
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
+
return ModuleResult(
|
| 56 |
+
success=True,
|
| 57 |
+
data=mock_data,
|
| 58 |
+
metadata={'rows': len(mock_data), 'source': 'mock'},
|
| 59 |
+
execution_time=1.0
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
async def _simulate_work(self, seconds: float):
|
| 63 |
+
"""Simulate async work"""
|
| 64 |
+
import asyncio
|
| 65 |
+
await asyncio.sleep(seconds)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class MockDynamicScraper(BaseModule):
|
| 69 |
+
"""Mock dynamic web scraper (Playwright)"""
|
| 70 |
+
|
| 71 |
+
def __init__(self):
|
| 72 |
+
super().__init__(name="dynamic_scraper", module_type=ModuleType.SCRAPER)
|
| 73 |
+
|
| 74 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 75 |
+
return ScrapingCapability.DYNAMIC
|
| 76 |
+
|
| 77 |
+
async def execute(
|
| 78 |
+
self,
|
| 79 |
+
parameters: Dict[str, Any],
|
| 80 |
+
context: Optional[Dict[str, Any]] = None
|
| 81 |
+
) -> ModuleResult:
|
| 82 |
+
"""Mock dynamic scraping"""
|
| 83 |
+
self.logger.info(f"[MOCK] Scraping with JavaScript from: {parameters.get('url')}")
|
| 84 |
+
|
| 85 |
+
await self._simulate_work(2.0)
|
| 86 |
+
|
| 87 |
+
mock_data = [
|
| 88 |
+
{'name': 'Dynamic Product 1', 'price': '$29.99'},
|
| 89 |
+
{'name': 'Dynamic Product 2', 'price': '$49.99'}
|
| 90 |
+
]
|
| 91 |
+
|
| 92 |
+
return ModuleResult(
|
| 93 |
+
success=True,
|
| 94 |
+
data=mock_data,
|
| 95 |
+
metadata={'rows': len(mock_data), 'method': 'playwright'},
|
| 96 |
+
execution_time=2.0
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
async def _simulate_work(self, seconds: float):
|
| 100 |
+
import asyncio
|
| 101 |
+
await asyncio.sleep(seconds)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class MockDataProcessor(BaseModule):
|
| 105 |
+
"""Mock data processor"""
|
| 106 |
+
|
| 107 |
+
def __init__(self):
|
| 108 |
+
super().__init__(name="data_processor", module_type=ModuleType.PROCESSOR)
|
| 109 |
+
|
| 110 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 111 |
+
return ProcessingCapability.DATA_TRANSFORMER
|
| 112 |
+
|
| 113 |
+
async def execute(
|
| 114 |
+
self,
|
| 115 |
+
parameters: Dict[str, Any],
|
| 116 |
+
context: Optional[Dict[str, Any]] = None
|
| 117 |
+
) -> ModuleResult:
|
| 118 |
+
"""Mock data processing"""
|
| 119 |
+
self.logger.info("[MOCK] Processing data with filters and aggregations")
|
| 120 |
+
|
| 121 |
+
# Get input data from context
|
| 122 |
+
input_data = context.get('data', []) if context else []
|
| 123 |
+
|
| 124 |
+
# Mock filtering
|
| 125 |
+
if parameters.get('filters'):
|
| 126 |
+
self.logger.info(f"[MOCK] Applying {len(parameters['filters'])} filters")
|
| 127 |
+
|
| 128 |
+
# Mock aggregation
|
| 129 |
+
if parameters.get('aggregations'):
|
| 130 |
+
self.logger.info(f"[MOCK] Applying {len(parameters['aggregations'])} aggregations")
|
| 131 |
+
|
| 132 |
+
mock_result = {
|
| 133 |
+
'filtered_rows': len(input_data),
|
| 134 |
+
'aggregated': True,
|
| 135 |
+
'data': input_data
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
return ModuleResult(
|
| 139 |
+
success=True,
|
| 140 |
+
data=mock_result,
|
| 141 |
+
metadata={'processed': True},
|
| 142 |
+
execution_time=0.5
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class MockChartCreator(BaseModule):
|
| 147 |
+
"""Mock chart/visualization creator"""
|
| 148 |
+
|
| 149 |
+
def __init__(self):
|
| 150 |
+
super().__init__(name="chart_creator", module_type=ModuleType.VISUALIZER)
|
| 151 |
+
|
| 152 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 153 |
+
return VisualizationCapability.CHART_CREATOR
|
| 154 |
+
|
| 155 |
+
async def execute(
|
| 156 |
+
self,
|
| 157 |
+
parameters: Dict[str, Any],
|
| 158 |
+
context: Optional[Dict[str, Any]] = None
|
| 159 |
+
) -> ModuleResult:
|
| 160 |
+
"""Mock chart creation"""
|
| 161 |
+
chart_type = parameters.get('chart_type', 'bar')
|
| 162 |
+
self.logger.info(f"[MOCK] Creating {chart_type} chart")
|
| 163 |
+
|
| 164 |
+
mock_chart = {
|
| 165 |
+
'type': chart_type,
|
| 166 |
+
'file': f'/tmp/chart_{chart_type}.png',
|
| 167 |
+
'created': True
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
return ModuleResult(
|
| 171 |
+
success=True,
|
| 172 |
+
data=mock_chart,
|
| 173 |
+
metadata={'chart_type': chart_type},
|
| 174 |
+
execution_time=0.8
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
class MockCSVExporter(BaseModule):
|
| 179 |
+
"""Mock CSV exporter"""
|
| 180 |
+
|
| 181 |
+
def __init__(self):
|
| 182 |
+
super().__init__(name="csv_exporter", module_type=ModuleType.EXPORTER)
|
| 183 |
+
|
| 184 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 185 |
+
return OutputCapability.CSV_EXPORTER
|
| 186 |
+
|
| 187 |
+
async def execute(
|
| 188 |
+
self,
|
| 189 |
+
parameters: Dict[str, Any],
|
| 190 |
+
context: Optional[Dict[str, Any]] = None
|
| 191 |
+
) -> ModuleResult:
|
| 192 |
+
"""Mock CSV export"""
|
| 193 |
+
filename = parameters.get('filename', 'output.csv')
|
| 194 |
+
self.logger.info(f"[MOCK] Exporting to CSV: {filename}")
|
| 195 |
+
|
| 196 |
+
mock_export = {
|
| 197 |
+
'filename': filename,
|
| 198 |
+
'path': f'/tmp/{filename}',
|
| 199 |
+
'rows': 10,
|
| 200 |
+
'exported': True
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
return ModuleResult(
|
| 204 |
+
success=True,
|
| 205 |
+
data=mock_export,
|
| 206 |
+
metadata={'format': 'csv'},
|
| 207 |
+
execution_time=0.3
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class MockAPIClient(BaseModule):
|
| 212 |
+
"""Mock API client"""
|
| 213 |
+
|
| 214 |
+
def __init__(self):
|
| 215 |
+
super().__init__(name="api_client", module_type=ModuleType.API_CLIENT)
|
| 216 |
+
|
| 217 |
+
def get_capabilities(self) -> ModuleCapability:
|
| 218 |
+
return ScrapingCapability.API_CLIENT
|
| 219 |
+
|
| 220 |
+
async def execute(
|
| 221 |
+
self,
|
| 222 |
+
parameters: Dict[str, Any],
|
| 223 |
+
context: Optional[Dict[str, Any]] = None
|
| 224 |
+
) -> ModuleResult:
|
| 225 |
+
"""Mock API call"""
|
| 226 |
+
api_url = parameters.get('url', 'https://api.example.com')
|
| 227 |
+
self.logger.info(f"[MOCK] Calling API: {api_url}")
|
| 228 |
+
|
| 229 |
+
await self._simulate_work(0.5)
|
| 230 |
+
|
| 231 |
+
mock_data = {
|
| 232 |
+
'status': 'success',
|
| 233 |
+
'data': [
|
| 234 |
+
{'id': 1, 'name': 'Item 1'},
|
| 235 |
+
{'id': 2, 'name': 'Item 2'}
|
| 236 |
+
]
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
return ModuleResult(
|
| 240 |
+
success=True,
|
| 241 |
+
data=mock_data,
|
| 242 |
+
metadata={'api_url': api_url},
|
| 243 |
+
execution_time=0.5
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
async def _simulate_work(self, seconds: float):
|
| 247 |
+
import asyncio
|
| 248 |
+
await asyncio.sleep(seconds)
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def register_mock_modules(registry: Optional['ModuleRegistry'] = None):
|
| 252 |
+
"""
|
| 253 |
+
Register all mock modules for testing
|
| 254 |
+
|
| 255 |
+
Args:
|
| 256 |
+
registry: Registry to register to (creates new if None)
|
| 257 |
+
"""
|
| 258 |
+
from app.modules.registry import ModuleRegistry
|
| 259 |
+
|
| 260 |
+
if registry is None:
|
| 261 |
+
registry = ModuleRegistry()
|
| 262 |
+
|
| 263 |
+
# Register all mock modules
|
| 264 |
+
registry.register(MockStaticScraper())
|
| 265 |
+
registry.register(MockDynamicScraper())
|
| 266 |
+
registry.register(MockDataProcessor())
|
| 267 |
+
registry.register(MockChartCreator())
|
| 268 |
+
registry.register(MockCSVExporter())
|
| 269 |
+
registry.register(MockAPIClient())
|
| 270 |
+
|
| 271 |
+
logger.info("β
Registered 6 mock modules for testing")
|
| 272 |
+
|
| 273 |
+
return registry
|
app/modules/registry.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Module Registry
|
| 3 |
+
Central registry for all processing modules
|
| 4 |
+
Handles module registration, discovery, and selection
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, List, Optional, Type, Set
|
| 8 |
+
from collections import defaultdict
|
| 9 |
+
|
| 10 |
+
from app.modules.base import BaseModule, ModuleType, ModuleCapability
|
| 11 |
+
from app.orchestrator.parameter_models import ExtractedParameters
|
| 12 |
+
from app.orchestrator.models import TaskClassification
|
| 13 |
+
from app.core.logging import get_logger
|
| 14 |
+
|
| 15 |
+
logger = get_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ModuleRegistry:
|
| 19 |
+
"""
|
| 20 |
+
Central registry for all processing modules
|
| 21 |
+
Singleton pattern - only one registry exists
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
_instance = None
|
| 25 |
+
|
| 26 |
+
def __new__(cls):
|
| 27 |
+
if cls._instance is None:
|
| 28 |
+
cls._instance = super().__new__(cls)
|
| 29 |
+
cls._instance._initialized = False
|
| 30 |
+
return cls._instance
|
| 31 |
+
|
| 32 |
+
def __init__(self):
|
| 33 |
+
"""Initialize module registry"""
|
| 34 |
+
if self._initialized:
|
| 35 |
+
return
|
| 36 |
+
|
| 37 |
+
self.modules: Dict[str, BaseModule] = {}
|
| 38 |
+
self.modules_by_type: Dict[ModuleType, List[BaseModule]] = defaultdict(list)
|
| 39 |
+
self._initialized = True
|
| 40 |
+
|
| 41 |
+
logger.info("ModuleRegistry initialized")
|
| 42 |
+
|
| 43 |
+
def register(self, module: BaseModule):
|
| 44 |
+
"""
|
| 45 |
+
Register a module
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
module: Module to register
|
| 49 |
+
"""
|
| 50 |
+
if module.name in self.modules:
|
| 51 |
+
logger.warning(f"Module '{module.name}' already registered, replacing")
|
| 52 |
+
|
| 53 |
+
self.modules[module.name] = module
|
| 54 |
+
self.modules_by_type[module.module_type].append(module)
|
| 55 |
+
|
| 56 |
+
logger.info(
|
| 57 |
+
f"β Registered module: {module.name} "
|
| 58 |
+
f"(type: {module.module_type.value})"
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
def unregister(self, module_name: str) -> bool:
|
| 62 |
+
"""
|
| 63 |
+
Unregister a module
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
module_name: Name of module to unregister
|
| 67 |
+
|
| 68 |
+
Returns:
|
| 69 |
+
bool: True if unregistered
|
| 70 |
+
"""
|
| 71 |
+
if module_name not in self.modules:
|
| 72 |
+
return False
|
| 73 |
+
|
| 74 |
+
module = self.modules[module_name]
|
| 75 |
+
del self.modules[module_name]
|
| 76 |
+
self.modules_by_type[module.module_type].remove(module)
|
| 77 |
+
|
| 78 |
+
logger.info(f"Unregistered module: {module_name}")
|
| 79 |
+
return True
|
| 80 |
+
|
| 81 |
+
def get_module(self, name: str) -> Optional[BaseModule]:
|
| 82 |
+
"""Get module by name"""
|
| 83 |
+
return self.modules.get(name)
|
| 84 |
+
|
| 85 |
+
def get_modules_by_type(self, module_type: ModuleType) -> List[BaseModule]:
|
| 86 |
+
"""Get all modules of a specific type"""
|
| 87 |
+
return self.modules_by_type.get(module_type, [])
|
| 88 |
+
|
| 89 |
+
def get_all_modules(self) -> List[BaseModule]:
|
| 90 |
+
"""Get all registered modules"""
|
| 91 |
+
return list(self.modules.values())
|
| 92 |
+
|
| 93 |
+
def list_modules(self) -> Dict[str, Dict]:
|
| 94 |
+
"""
|
| 95 |
+
List all registered modules with their info
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
Dict: Module information
|
| 99 |
+
"""
|
| 100 |
+
result = {}
|
| 101 |
+
|
| 102 |
+
for name, module in self.modules.items():
|
| 103 |
+
capabilities = module.get_capabilities()
|
| 104 |
+
|
| 105 |
+
result[name] = {
|
| 106 |
+
'type': module.module_type.value,
|
| 107 |
+
'initialized': module.is_initialized(),
|
| 108 |
+
'capabilities': capabilities.dict()
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
return result
|
| 112 |
+
|
| 113 |
+
def clear(self):
|
| 114 |
+
"""Clear all registered modules (for testing)"""
|
| 115 |
+
self.modules.clear()
|
| 116 |
+
self.modules_by_type.clear()
|
| 117 |
+
logger.info("Registry cleared")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class ModuleSelector:
|
| 121 |
+
"""
|
| 122 |
+
Selects appropriate modules based on task requirements
|
| 123 |
+
Uses classification and parameters to find best modules
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
def __init__(self, registry: Optional[ModuleRegistry] = None):
|
| 127 |
+
"""
|
| 128 |
+
Initialize module selector
|
| 129 |
+
|
| 130 |
+
Args:
|
| 131 |
+
registry: Module registry to use (creates new if None)
|
| 132 |
+
"""
|
| 133 |
+
self.registry = registry or ModuleRegistry()
|
| 134 |
+
logger.debug("ModuleSelector initialized")
|
| 135 |
+
|
| 136 |
+
def select_modules(
|
| 137 |
+
self,
|
| 138 |
+
classification: TaskClassification,
|
| 139 |
+
parameters: ExtractedParameters
|
| 140 |
+
) -> List[BaseModule]:
|
| 141 |
+
"""
|
| 142 |
+
Select appropriate modules for task
|
| 143 |
+
|
| 144 |
+
Args:
|
| 145 |
+
classification: Task classification
|
| 146 |
+
parameters: Extracted parameters
|
| 147 |
+
|
| 148 |
+
Returns:
|
| 149 |
+
List[BaseModule]: Selected modules in execution order
|
| 150 |
+
"""
|
| 151 |
+
logger.info("π Selecting modules for task")
|
| 152 |
+
logger.debug(
|
| 153 |
+
f"Task type: {classification.primary_task.value}, "
|
| 154 |
+
f"Complexity: {classification.complexity.value}"
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
selected = []
|
| 158 |
+
|
| 159 |
+
# Step 1: Select data sourcing module
|
| 160 |
+
sourcing_module = self._select_sourcing_module(classification, parameters)
|
| 161 |
+
if sourcing_module:
|
| 162 |
+
selected.append(sourcing_module)
|
| 163 |
+
|
| 164 |
+
# Step 2: Select processing modules
|
| 165 |
+
processing_modules = self._select_processing_modules(classification, parameters)
|
| 166 |
+
selected.extend(processing_modules)
|
| 167 |
+
|
| 168 |
+
# Step 3: Select visualization module (if needed)
|
| 169 |
+
viz_module = self._select_visualization_module(classification, parameters)
|
| 170 |
+
if viz_module:
|
| 171 |
+
selected.append(viz_module)
|
| 172 |
+
|
| 173 |
+
# Step 4: Select output/export module
|
| 174 |
+
output_module = self._select_output_module(classification, parameters)
|
| 175 |
+
if output_module:
|
| 176 |
+
selected.append(output_module)
|
| 177 |
+
|
| 178 |
+
logger.info(
|
| 179 |
+
f"β
Selected {len(selected)} modules: "
|
| 180 |
+
f"{[m.name for m in selected]}"
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
return selected
|
| 184 |
+
|
| 185 |
+
def _select_sourcing_module(
|
| 186 |
+
self,
|
| 187 |
+
classification: TaskClassification,
|
| 188 |
+
parameters: ExtractedParameters
|
| 189 |
+
) -> Optional[BaseModule]:
|
| 190 |
+
"""Select data sourcing module (scraper, API client, etc.)"""
|
| 191 |
+
|
| 192 |
+
# No data sources, no module needed
|
| 193 |
+
if not parameters.data_sources:
|
| 194 |
+
logger.debug("No data sources, skipping sourcing module")
|
| 195 |
+
return None
|
| 196 |
+
|
| 197 |
+
data_source = parameters.data_sources[0] # Use first source
|
| 198 |
+
|
| 199 |
+
# API data source
|
| 200 |
+
if data_source.type == 'api':
|
| 201 |
+
logger.debug("Selecting API client module")
|
| 202 |
+
candidates = self.registry.get_modules_by_type(ModuleType.API_CLIENT)
|
| 203 |
+
|
| 204 |
+
for module in candidates:
|
| 205 |
+
if module.get_capabilities().can_handle_api:
|
| 206 |
+
logger.info(f"β Selected API client: {module.name}")
|
| 207 |
+
return module
|
| 208 |
+
|
| 209 |
+
# URL/web scraping
|
| 210 |
+
if data_source.type == 'url':
|
| 211 |
+
# Check if JavaScript needed from CLASSIFICATION (primary source)
|
| 212 |
+
needs_javascript = classification.requires_javascript # β Fixed: Check classification first
|
| 213 |
+
|
| 214 |
+
# Also check parameters.urls if present
|
| 215 |
+
if parameters.urls:
|
| 216 |
+
needs_javascript = needs_javascript or any(
|
| 217 |
+
u.requires_javascript for u in parameters.urls
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
if needs_javascript:
|
| 221 |
+
logger.debug("JavaScript required, selecting dynamic scraper")
|
| 222 |
+
candidates = self.registry.get_modules_by_type(ModuleType.SCRAPER)
|
| 223 |
+
|
| 224 |
+
for module in candidates:
|
| 225 |
+
if module.get_capabilities().can_scrape_dynamic:
|
| 226 |
+
logger.info(f"β Selected dynamic scraper: {module.name}")
|
| 227 |
+
return module
|
| 228 |
+
|
| 229 |
+
logger.warning("JavaScript needed but no dynamic scraper available")
|
| 230 |
+
|
| 231 |
+
# Static HTML scraping
|
| 232 |
+
logger.debug("Selecting static scraper")
|
| 233 |
+
candidates = self.registry.get_modules_by_type(ModuleType.SCRAPER)
|
| 234 |
+
|
| 235 |
+
for module in candidates:
|
| 236 |
+
if module.get_capabilities().can_scrape_static:
|
| 237 |
+
logger.info(f"β Selected static scraper: {module.name}")
|
| 238 |
+
return module
|
| 239 |
+
|
| 240 |
+
logger.warning("No suitable sourcing module found")
|
| 241 |
+
return None
|
| 242 |
+
|
| 243 |
+
def _select_processing_modules(
|
| 244 |
+
self,
|
| 245 |
+
classification: TaskClassification,
|
| 246 |
+
parameters: ExtractedParameters
|
| 247 |
+
) -> List[BaseModule]:
|
| 248 |
+
"""Select data processing modules"""
|
| 249 |
+
|
| 250 |
+
selected = []
|
| 251 |
+
|
| 252 |
+
# Data cleaning (if filters present)
|
| 253 |
+
if parameters.filters:
|
| 254 |
+
logger.debug("Filters detected, need data processor")
|
| 255 |
+
candidates = self.registry.get_modules_by_type(ModuleType.PROCESSOR)
|
| 256 |
+
|
| 257 |
+
for module in candidates:
|
| 258 |
+
caps = module.get_capabilities()
|
| 259 |
+
if caps.can_filter or caps.can_transform_data:
|
| 260 |
+
logger.info(f"β Selected processor: {module.name}")
|
| 261 |
+
selected.append(module)
|
| 262 |
+
break
|
| 263 |
+
|
| 264 |
+
# Aggregation (if aggregations present)
|
| 265 |
+
if parameters.aggregations:
|
| 266 |
+
logger.debug("Aggregations detected")
|
| 267 |
+
candidates = self.registry.get_modules_by_type(ModuleType.PROCESSOR)
|
| 268 |
+
|
| 269 |
+
for module in candidates:
|
| 270 |
+
if module.get_capabilities().can_aggregate:
|
| 271 |
+
if module not in selected:
|
| 272 |
+
logger.info(f"β Selected aggregator: {module.name}")
|
| 273 |
+
selected.append(module)
|
| 274 |
+
break
|
| 275 |
+
|
| 276 |
+
return selected
|
| 277 |
+
|
| 278 |
+
def _select_visualization_module(
|
| 279 |
+
self,
|
| 280 |
+
classification: TaskClassification,
|
| 281 |
+
parameters: ExtractedParameters
|
| 282 |
+
) -> Optional[BaseModule]:
|
| 283 |
+
"""Select visualization module"""
|
| 284 |
+
|
| 285 |
+
if not parameters.visualizations:
|
| 286 |
+
return None
|
| 287 |
+
|
| 288 |
+
viz_req = parameters.visualizations[0] # Use first visualization
|
| 289 |
+
|
| 290 |
+
# Map visualization
|
| 291 |
+
if viz_req.type == 'map':
|
| 292 |
+
logger.debug("Map visualization needed")
|
| 293 |
+
candidates = self.registry.get_modules_by_type(ModuleType.VISUALIZER)
|
| 294 |
+
|
| 295 |
+
for module in candidates:
|
| 296 |
+
if module.get_capabilities().can_create_maps:
|
| 297 |
+
logger.info(f"β Selected map creator: {module.name}")
|
| 298 |
+
return module
|
| 299 |
+
|
| 300 |
+
# Chart visualization
|
| 301 |
+
if viz_req.type == 'chart':
|
| 302 |
+
logger.debug("Chart visualization needed")
|
| 303 |
+
candidates = self.registry.get_modules_by_type(ModuleType.VISUALIZER)
|
| 304 |
+
|
| 305 |
+
for module in candidates:
|
| 306 |
+
if module.get_capabilities().can_create_charts:
|
| 307 |
+
logger.info(f"β Selected chart creator: {module.name}")
|
| 308 |
+
return module
|
| 309 |
+
|
| 310 |
+
return None
|
| 311 |
+
|
| 312 |
+
def _select_output_module(
|
| 313 |
+
self,
|
| 314 |
+
classification: TaskClassification,
|
| 315 |
+
parameters: ExtractedParameters
|
| 316 |
+
) -> Optional[BaseModule]:
|
| 317 |
+
"""Select output/export module"""
|
| 318 |
+
|
| 319 |
+
if not parameters.output:
|
| 320 |
+
logger.debug("No output format specified, using default CSV")
|
| 321 |
+
output_format = 'csv'
|
| 322 |
+
else:
|
| 323 |
+
output_format = parameters.output.format
|
| 324 |
+
|
| 325 |
+
candidates = self.registry.get_modules_by_type(ModuleType.EXPORTER)
|
| 326 |
+
|
| 327 |
+
# Match by format
|
| 328 |
+
for module in candidates:
|
| 329 |
+
caps = module.get_capabilities()
|
| 330 |
+
|
| 331 |
+
if output_format == 'csv' and caps.can_export_csv:
|
| 332 |
+
logger.info(f"β Selected CSV exporter: {module.name}")
|
| 333 |
+
return module
|
| 334 |
+
|
| 335 |
+
elif output_format == 'excel' and caps.can_export_excel:
|
| 336 |
+
logger.info(f"β Selected Excel exporter: {module.name}")
|
| 337 |
+
return module
|
| 338 |
+
|
| 339 |
+
elif output_format == 'json' and caps.can_export_json:
|
| 340 |
+
logger.info(f"β Selected JSON exporter: {module.name}")
|
| 341 |
+
return module
|
| 342 |
+
|
| 343 |
+
logger.warning(f"No exporter for format: {output_format}")
|
| 344 |
+
return None
|
| 345 |
+
|
| 346 |
+
def can_execute_task(
|
| 347 |
+
self,
|
| 348 |
+
classification: TaskClassification,
|
| 349 |
+
parameters: ExtractedParameters
|
| 350 |
+
) -> bool:
|
| 351 |
+
"""
|
| 352 |
+
Check if task can be executed with available modules
|
| 353 |
+
|
| 354 |
+
Args:
|
| 355 |
+
classification: Task classification
|
| 356 |
+
parameters: Extracted parameters
|
| 357 |
+
|
| 358 |
+
Returns:
|
| 359 |
+
bool: True if task can be executed
|
| 360 |
+
"""
|
| 361 |
+
selected = self.select_modules(classification, parameters)
|
| 362 |
+
|
| 363 |
+
# Need at least one module to execute
|
| 364 |
+
if not selected:
|
| 365 |
+
logger.warning("No modules selected, cannot execute task")
|
| 366 |
+
return False
|
| 367 |
+
|
| 368 |
+
# Check if we have sourcing module (if data sources present)
|
| 369 |
+
if parameters.data_sources and not any(
|
| 370 |
+
m.module_type in [ModuleType.SCRAPER, ModuleType.API_CLIENT]
|
| 371 |
+
for m in selected
|
| 372 |
+
):
|
| 373 |
+
logger.warning("Data sources present but no sourcing module")
|
| 374 |
+
return False
|
| 375 |
+
|
| 376 |
+
return True
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
# Convenience function
|
| 380 |
+
def get_module_registry() -> ModuleRegistry:
|
| 381 |
+
"""Get global module registry instance"""
|
| 382 |
+
return ModuleRegistry()
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
def get_module_selector() -> ModuleSelector:
|
| 386 |
+
"""Get module selector instance"""
|
| 387 |
+
return ModuleSelector()
|
app/orchestrator/models.py
CHANGED
|
@@ -41,6 +41,7 @@ class OutputFormat(str, Enum):
|
|
| 41 |
CSV = "csv"
|
| 42 |
IMAGE = "image"
|
| 43 |
CHART = "chart"
|
|
|
|
| 44 |
HTML = "html"
|
| 45 |
PDF = "pdf"
|
| 46 |
UNKNOWN = "unknown"
|
|
|
|
| 41 |
CSV = "csv"
|
| 42 |
IMAGE = "image"
|
| 43 |
CHART = "chart"
|
| 44 |
+
EXCEL = "excel"
|
| 45 |
HTML = "html"
|
| 46 |
PDF = "pdf"
|
| 47 |
UNKNOWN = "unknown"
|
test/test_module_registry.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test Module Registry
|
| 3 |
+
Comprehensive tests for module registration and selection
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Add project root to Python path
|
| 11 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
| 12 |
+
sys.path.insert(0, project_root)
|
| 13 |
+
from app.modules.registry import ModuleRegistry, ModuleSelector
|
| 14 |
+
from app.modules.mock_modules import register_mock_modules
|
| 15 |
+
from app.orchestrator.models import TaskClassification, TaskType, ComplexityLevel, OutputFormat
|
| 16 |
+
from app.orchestrator.parameter_models import (
|
| 17 |
+
ExtractedParameters,
|
| 18 |
+
DataSource,
|
| 19 |
+
FilterCondition,
|
| 20 |
+
VisualizationRequirement,
|
| 21 |
+
OutputRequirement
|
| 22 |
+
)
|
| 23 |
+
from app.core.logging import setup_logging, get_logger
|
| 24 |
+
|
| 25 |
+
setup_logging()
|
| 26 |
+
logger = get_logger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_registry_registration():
|
| 30 |
+
"""Test module registration"""
|
| 31 |
+
|
| 32 |
+
print("\n" + "=" * 60)
|
| 33 |
+
print("Test 1: Module Registration")
|
| 34 |
+
print("=" * 60)
|
| 35 |
+
|
| 36 |
+
# Clear registry
|
| 37 |
+
registry = ModuleRegistry()
|
| 38 |
+
registry.clear()
|
| 39 |
+
|
| 40 |
+
# Register mock modules
|
| 41 |
+
register_mock_modules(registry)
|
| 42 |
+
|
| 43 |
+
# Check registration
|
| 44 |
+
all_modules = registry.get_all_modules()
|
| 45 |
+
|
| 46 |
+
print(f"\nβ Registered {len(all_modules)} modules:")
|
| 47 |
+
for module in all_modules:
|
| 48 |
+
print(f" - {module.name} ({module.module_type.value})")
|
| 49 |
+
|
| 50 |
+
# List with details
|
| 51 |
+
module_info = registry.list_modules()
|
| 52 |
+
|
| 53 |
+
print(f"\nπ Module Details:")
|
| 54 |
+
for name, info in module_info.items():
|
| 55 |
+
print(f"\n {name}:")
|
| 56 |
+
print(f" Type: {info['type']}")
|
| 57 |
+
print(f" Initialized: {info['initialized']}")
|
| 58 |
+
|
| 59 |
+
caps = info['capabilities']
|
| 60 |
+
cap_list = [k for k, v in caps.items() if v and k.startswith('can_')]
|
| 61 |
+
if cap_list:
|
| 62 |
+
print(f" Capabilities: {', '.join(cap_list[:3])}...")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_simple_scraping_selection():
|
| 66 |
+
"""Test module selection for simple scraping task"""
|
| 67 |
+
|
| 68 |
+
print("\n" + "=" * 60)
|
| 69 |
+
print("Test 2: Simple Scraping Task")
|
| 70 |
+
print("=" * 60)
|
| 71 |
+
|
| 72 |
+
# Setup
|
| 73 |
+
registry = ModuleRegistry()
|
| 74 |
+
registry.clear()
|
| 75 |
+
register_mock_modules(registry)
|
| 76 |
+
|
| 77 |
+
selector = ModuleSelector(registry)
|
| 78 |
+
|
| 79 |
+
# Create simple scraping task
|
| 80 |
+
classification = TaskClassification(
|
| 81 |
+
primary_task=TaskType.WEB_SCRAPING,
|
| 82 |
+
secondary_tasks=[],
|
| 83 |
+
complexity=ComplexityLevel.SIMPLE,
|
| 84 |
+
estimated_steps=2,
|
| 85 |
+
requires_javascript=False,
|
| 86 |
+
requires_authentication=False,
|
| 87 |
+
output_format=OutputFormat.CSV,
|
| 88 |
+
confidence=0.9, # β Added
|
| 89 |
+
reasoning="Simple static web scraping task" # β Added
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
parameters = ExtractedParameters(
|
| 93 |
+
data_sources=[
|
| 94 |
+
DataSource(
|
| 95 |
+
type='url',
|
| 96 |
+
location='https://example.com/products',
|
| 97 |
+
format='html',
|
| 98 |
+
description='Product listing page'
|
| 99 |
+
)
|
| 100 |
+
],
|
| 101 |
+
output=OutputRequirement(
|
| 102 |
+
format='csv',
|
| 103 |
+
description='Export as CSV'
|
| 104 |
+
)
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
print("\nπ Task:")
|
| 108 |
+
print(" Type: Simple web scraping")
|
| 109 |
+
print(" JavaScript: No")
|
| 110 |
+
print(" Output: CSV")
|
| 111 |
+
print("-" * 60)
|
| 112 |
+
|
| 113 |
+
# Select modules
|
| 114 |
+
selected = selector.select_modules(classification, parameters)
|
| 115 |
+
|
| 116 |
+
print(f"\nβ
Selected {len(selected)} modules:")
|
| 117 |
+
for i, module in enumerate(selected, 1):
|
| 118 |
+
print(f" {i}. {module.name} ({module.module_type.value})")
|
| 119 |
+
|
| 120 |
+
# Verify
|
| 121 |
+
assert len(selected) >= 2, "Should select at least scraper + exporter"
|
| 122 |
+
assert any(m.name == 'static_scraper' for m in selected), "Should use static scraper"
|
| 123 |
+
assert any(m.name == 'csv_exporter' for m in selected), "Should use CSV exporter"
|
| 124 |
+
|
| 125 |
+
print("\nβ Correct modules selected!")
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def test_dynamic_scraping_selection():
|
| 129 |
+
"""Test module selection for dynamic scraping (JavaScript)"""
|
| 130 |
+
|
| 131 |
+
print("\n" + "=" * 60)
|
| 132 |
+
print("Test 3: Dynamic Scraping Task (JavaScript)")
|
| 133 |
+
print("=" * 60)
|
| 134 |
+
|
| 135 |
+
registry = ModuleRegistry()
|
| 136 |
+
registry.clear()
|
| 137 |
+
register_mock_modules(registry)
|
| 138 |
+
|
| 139 |
+
selector = ModuleSelector(registry)
|
| 140 |
+
|
| 141 |
+
# Create dynamic scraping task
|
| 142 |
+
classification = TaskClassification(
|
| 143 |
+
primary_task=TaskType.WEB_SCRAPING,
|
| 144 |
+
secondary_tasks=[],
|
| 145 |
+
complexity=ComplexityLevel.MEDIUM,
|
| 146 |
+
estimated_steps=3,
|
| 147 |
+
requires_javascript=True,
|
| 148 |
+
requires_authentication=False,
|
| 149 |
+
output_format=OutputFormat.JSON,
|
| 150 |
+
confidence=0.85, # β Added
|
| 151 |
+
reasoning="Dynamic web scraping with JavaScript" # β Added
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
parameters = ExtractedParameters(
|
| 155 |
+
data_sources=[
|
| 156 |
+
DataSource(
|
| 157 |
+
type='url',
|
| 158 |
+
location='https://example.com/dynamic-products',
|
| 159 |
+
format='html',
|
| 160 |
+
description='Dynamic product listing (JavaScript)'
|
| 161 |
+
)
|
| 162 |
+
]
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
print("\nπ Task:")
|
| 166 |
+
print(" Type: Dynamic web scraping")
|
| 167 |
+
print(" JavaScript: Yes")
|
| 168 |
+
print(" Output: JSON")
|
| 169 |
+
print("-" * 60)
|
| 170 |
+
|
| 171 |
+
# Select modules
|
| 172 |
+
selected = selector.select_modules(classification, parameters)
|
| 173 |
+
|
| 174 |
+
print(f"\nβ
Selected {len(selected)} modules:")
|
| 175 |
+
for i, module in enumerate(selected, 1):
|
| 176 |
+
print(f" {i}. {module.name} ({module.module_type.value})")
|
| 177 |
+
|
| 178 |
+
# Verify dynamic scraper selected
|
| 179 |
+
assert any(m.name == 'dynamic_scraper' for m in selected), \
|
| 180 |
+
"Should use dynamic scraper for JavaScript"
|
| 181 |
+
|
| 182 |
+
print("\nβ Dynamic scraper selected for JavaScript task!")
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def test_complex_analysis_selection():
|
| 186 |
+
"""Test module selection for complex data analysis"""
|
| 187 |
+
|
| 188 |
+
print("\n" + "=" * 60)
|
| 189 |
+
print("Test 4: Complex Data Analysis Task")
|
| 190 |
+
print("=" * 60)
|
| 191 |
+
|
| 192 |
+
registry = ModuleRegistry()
|
| 193 |
+
registry.clear()
|
| 194 |
+
register_mock_modules(registry)
|
| 195 |
+
|
| 196 |
+
selector = ModuleSelector(registry)
|
| 197 |
+
|
| 198 |
+
# Create complex analysis task
|
| 199 |
+
classification = TaskClassification(
|
| 200 |
+
primary_task=TaskType.ML_ANALYSIS,
|
| 201 |
+
secondary_tasks=[TaskType.VISUALIZATION],
|
| 202 |
+
complexity=ComplexityLevel.COMPLEX,
|
| 203 |
+
estimated_steps=5,
|
| 204 |
+
requires_javascript=False,
|
| 205 |
+
output_format=OutputFormat.EXCEL,
|
| 206 |
+
confidence=0.88, # β Added
|
| 207 |
+
reasoning="Complex data analysis with filtering and visualization" # β Added
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
parameters = ExtractedParameters(
|
| 211 |
+
data_sources=[
|
| 212 |
+
DataSource(
|
| 213 |
+
type='url',
|
| 214 |
+
location='https://example.com/sales.csv',
|
| 215 |
+
format='csv',
|
| 216 |
+
description='Sales data'
|
| 217 |
+
)
|
| 218 |
+
],
|
| 219 |
+
filters=[
|
| 220 |
+
FilterCondition(
|
| 221 |
+
field='region',
|
| 222 |
+
operator='equals',
|
| 223 |
+
value='North',
|
| 224 |
+
description='Filter for North region'
|
| 225 |
+
)
|
| 226 |
+
],
|
| 227 |
+
visualizations=[
|
| 228 |
+
VisualizationRequirement(
|
| 229 |
+
type='chart',
|
| 230 |
+
chart_type='bar',
|
| 231 |
+
description='Bar chart of sales by category'
|
| 232 |
+
)
|
| 233 |
+
]
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
print("\nπ Task:")
|
| 237 |
+
print(" Type: Data analysis + visualization")
|
| 238 |
+
print(" Has filters: Yes")
|
| 239 |
+
print(" Has visualizations: Yes")
|
| 240 |
+
print(" Output: Excel")
|
| 241 |
+
print("-" * 60)
|
| 242 |
+
|
| 243 |
+
# Select modules
|
| 244 |
+
selected = selector.select_modules(classification, parameters)
|
| 245 |
+
|
| 246 |
+
print(f"\nβ
Selected {len(selected)} modules:")
|
| 247 |
+
for i, module in enumerate(selected, 1):
|
| 248 |
+
caps = module.get_capabilities()
|
| 249 |
+
cap_names = [k for k, v in caps.dict().items() if v and k.startswith('can_')]
|
| 250 |
+
print(f" {i}. {module.name} ({module.module_type.value})")
|
| 251 |
+
print(f" Capabilities: {', '.join(cap_names[:2])}...")
|
| 252 |
+
|
| 253 |
+
# Verify correct module types
|
| 254 |
+
module_types = [m.module_type.value for m in selected]
|
| 255 |
+
|
| 256 |
+
print(f"\nπ Module Pipeline:")
|
| 257 |
+
print(f" Scraper: {'β' if 'scraper' in module_types else 'β'}")
|
| 258 |
+
print(f" Processor: {'β' if 'processor' in module_types else 'β'}")
|
| 259 |
+
print(f" Visualizer: {'β' if 'visualizer' in module_types else 'β'}")
|
| 260 |
+
print(f" Exporter: {'β' if 'exporter' in module_types else 'β'}")
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def test_api_task_selection():
|
| 264 |
+
"""Test module selection for API-based task"""
|
| 265 |
+
|
| 266 |
+
print("\n" + "=" * 60)
|
| 267 |
+
print("Test 5: API Data Fetching Task")
|
| 268 |
+
print("=" * 60)
|
| 269 |
+
|
| 270 |
+
registry = ModuleRegistry()
|
| 271 |
+
registry.clear()
|
| 272 |
+
register_mock_modules(registry)
|
| 273 |
+
|
| 274 |
+
selector = ModuleSelector(registry)
|
| 275 |
+
|
| 276 |
+
# Create API task
|
| 277 |
+
classification = TaskClassification(
|
| 278 |
+
primary_task=TaskType.WEB_SCRAPING,
|
| 279 |
+
secondary_tasks=[],
|
| 280 |
+
complexity=ComplexityLevel.SIMPLE,
|
| 281 |
+
estimated_steps=2,
|
| 282 |
+
output_format=OutputFormat.JSON,
|
| 283 |
+
confidence=0.92, # β Added
|
| 284 |
+
reasoning="API data fetching task" # β Added
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
parameters = ExtractedParameters(
|
| 288 |
+
data_sources=[
|
| 289 |
+
DataSource(
|
| 290 |
+
type='api',
|
| 291 |
+
location='https://api.example.com/users',
|
| 292 |
+
format='json',
|
| 293 |
+
description='User data API'
|
| 294 |
+
)
|
| 295 |
+
]
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
print("\nπ Task:")
|
| 299 |
+
print(" Type: API data fetching")
|
| 300 |
+
print(" Source: REST API")
|
| 301 |
+
print(" Output: JSON")
|
| 302 |
+
print("-" * 60)
|
| 303 |
+
|
| 304 |
+
# Select modules
|
| 305 |
+
selected = selector.select_modules(classification, parameters)
|
| 306 |
+
|
| 307 |
+
print(f"\nβ
Selected {len(selected)} modules:")
|
| 308 |
+
for module in selected:
|
| 309 |
+
print(f" - {module.name} ({module.module_type.value})")
|
| 310 |
+
|
| 311 |
+
# Verify API client selected
|
| 312 |
+
assert any(m.name == 'api_client' for m in selected), \
|
| 313 |
+
"Should use API client for API data source"
|
| 314 |
+
|
| 315 |
+
print("\nβ API client selected for API task!")
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
async def test_module_execution():
|
| 319 |
+
"""Test actually executing a selected module"""
|
| 320 |
+
|
| 321 |
+
print("\n" + "=" * 60)
|
| 322 |
+
print("Test 6: Module Execution")
|
| 323 |
+
print("=" * 60)
|
| 324 |
+
|
| 325 |
+
registry = ModuleRegistry()
|
| 326 |
+
registry.clear()
|
| 327 |
+
register_mock_modules(registry)
|
| 328 |
+
|
| 329 |
+
# Get a module
|
| 330 |
+
scraper = registry.get_module('static_scraper')
|
| 331 |
+
|
| 332 |
+
print(f"\nπ§ Testing module: {scraper.name}")
|
| 333 |
+
print("-" * 60)
|
| 334 |
+
|
| 335 |
+
# Execute
|
| 336 |
+
parameters = {
|
| 337 |
+
'url': 'https://example.com/test',
|
| 338 |
+
'columns': ['name', 'price']
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
print(f"\nExecuting with parameters:")
|
| 342 |
+
print(f" URL: {parameters['url']}")
|
| 343 |
+
print(f" Columns: {parameters['columns']}")
|
| 344 |
+
|
| 345 |
+
result = await scraper.execute(parameters)
|
| 346 |
+
|
| 347 |
+
print(f"\nβ
Execution Result:")
|
| 348 |
+
print(f" Success: {result.success}")
|
| 349 |
+
print(f" Execution Time: {result.execution_time}s")
|
| 350 |
+
print(f" Data rows: {len(result.data) if result.data else 0}")
|
| 351 |
+
|
| 352 |
+
if result.data:
|
| 353 |
+
print(f"\nπ Sample Data:")
|
| 354 |
+
for item in result.data[:3]:
|
| 355 |
+
print(f" - {item}")
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def run_all_tests():
|
| 359 |
+
"""Run all registry tests"""
|
| 360 |
+
|
| 361 |
+
print("\n" + "=" * 80)
|
| 362 |
+
print(" " * 20 + "MODULE REGISTRY TEST SUITE")
|
| 363 |
+
print("=" * 80)
|
| 364 |
+
|
| 365 |
+
try:
|
| 366 |
+
# Synchronous tests
|
| 367 |
+
test_registry_registration()
|
| 368 |
+
test_simple_scraping_selection()
|
| 369 |
+
test_dynamic_scraping_selection()
|
| 370 |
+
test_complex_analysis_selection()
|
| 371 |
+
test_api_task_selection()
|
| 372 |
+
|
| 373 |
+
# Async test
|
| 374 |
+
asyncio.run(test_module_execution())
|
| 375 |
+
|
| 376 |
+
print("\n" + "=" * 80)
|
| 377 |
+
print(" " * 30 + "ALL TESTS PASSED")
|
| 378 |
+
print("=" * 80)
|
| 379 |
+
print("\nβ
Module registry tests complete!")
|
| 380 |
+
print("\nπ Summary:")
|
| 381 |
+
print(" β Module registration working")
|
| 382 |
+
print(" β Module selection logic working")
|
| 383 |
+
print(" β Different task types handled correctly")
|
| 384 |
+
print(" β Module execution working")
|
| 385 |
+
|
| 386 |
+
except AssertionError as e:
|
| 387 |
+
print(f"\nβ Assertion failed: {e}")
|
| 388 |
+
logger.error("Test assertion failed", exc_info=True)
|
| 389 |
+
raise
|
| 390 |
+
|
| 391 |
+
except Exception as e:
|
| 392 |
+
print(f"\nβ Test failed: {e}")
|
| 393 |
+
logger.error("Test suite failed", exc_info=True)
|
| 394 |
+
raise
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
if __name__ == "__main__":
|
| 398 |
+
run_all_tests()
|