23f3003322 commited on
Commit
3e70073
Β·
1 Parent(s): c752e2d

module registry

Browse files
app/modules/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Processing Modules Package
3
+ Module registry and base interfaces
4
+ """
5
+
6
+ from app.modules.base import BaseModule, ModuleCapability, ModuleResult
7
+ from app.modules.registry import ModuleRegistry, ModuleSelector
8
+ from app.modules.capabilities import (
9
+ ScrapingCapability,
10
+ ProcessingCapability,
11
+ VisualizationCapability,
12
+ OutputCapability
13
+ )
14
+
15
+ __all__ = [
16
+ "BaseModule",
17
+ "ModuleCapability",
18
+ "ModuleResult",
19
+ "ModuleRegistry",
20
+ "ModuleSelector",
21
+ "ScrapingCapability",
22
+ "ProcessingCapability",
23
+ "VisualizationCapability",
24
+ "OutputCapability"
25
+ ]
app/modules/base.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Module Interface
3
+ All processing modules inherit from this
4
+ """
5
+
6
+ from typing import Dict, Any, List, Optional, Set
7
+ from abc import ABC, abstractmethod
8
+ from enum import Enum
9
+ from pydantic import BaseModel, Field
10
+
11
+ from app.core.logging import get_logger
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
+ class ModuleType(str, Enum):
17
+ """Types of processing modules"""
18
+ SCRAPER = "scraper"
19
+ PROCESSOR = "processor"
20
+ ANALYZER = "analyzer"
21
+ VISUALIZER = "visualizer"
22
+ EXPORTER = "exporter"
23
+ API_CLIENT = "api_client"
24
+
25
+
26
+ class ModuleCapability(BaseModel):
27
+ """Capability definition for a module"""
28
+
29
+ # What can this module do?
30
+ can_scrape_static: bool = False
31
+ can_scrape_dynamic: bool = False
32
+ can_handle_javascript: bool = False
33
+ can_authenticate: bool = False
34
+ can_handle_api: bool = False
35
+ can_process_data: bool = False
36
+ can_clean_data: bool = False
37
+ can_transform_data: bool = False
38
+ can_aggregate: bool = False
39
+ can_filter: bool = False
40
+ can_sort: bool = False
41
+ can_visualize: bool = False
42
+ can_create_charts: bool = False
43
+ can_create_maps: bool = False
44
+ can_export_csv: bool = False
45
+ can_export_json: bool = False
46
+ can_export_excel: bool = False
47
+ can_export_pdf: bool = False
48
+
49
+ # What data formats can it handle?
50
+ supported_input_formats: Set[str] = Field(default_factory=set)
51
+ supported_output_formats: Set[str] = Field(default_factory=set)
52
+
53
+ # Performance characteristics
54
+ max_concurrent_requests: int = 1
55
+ estimated_speed: str = "medium" # fast, medium, slow
56
+ memory_usage: str = "medium" # low, medium, high
57
+
58
+ # Requirements
59
+ requires_browser: bool = False
60
+ requires_api_key: bool = False
61
+ requires_database: bool = False
62
+
63
+
64
+ class ModuleResult(BaseModel):
65
+ """Result from module execution"""
66
+
67
+ success: bool
68
+ data: Any = None
69
+ error: Optional[str] = None
70
+ warnings: List[str] = Field(default_factory=list)
71
+ metadata: Dict[str, Any] = Field(default_factory=dict)
72
+ execution_time: float = 0.0
73
+
74
+ class Config:
75
+ arbitrary_types_allowed = True
76
+
77
+
78
+ class BaseModule(ABC):
79
+ """
80
+ Base class for all processing modules
81
+ All modules must inherit from this and implement required methods
82
+ """
83
+
84
+ def __init__(self, name: str, module_type: ModuleType):
85
+ """
86
+ Initialize base module
87
+
88
+ Args:
89
+ name: Module name
90
+ module_type: Type of module
91
+ """
92
+ self.name = name
93
+ self.module_type = module_type
94
+ self.logger = get_logger(f"module.{name}")
95
+ self._is_initialized = False
96
+
97
+ @abstractmethod
98
+ def get_capabilities(self) -> ModuleCapability:
99
+ """
100
+ Return module capabilities
101
+
102
+ Returns:
103
+ ModuleCapability: What this module can do
104
+ """
105
+ pass
106
+
107
+ @abstractmethod
108
+ async def execute(
109
+ self,
110
+ parameters: Dict[str, Any],
111
+ context: Optional[Dict[str, Any]] = None
112
+ ) -> ModuleResult:
113
+ """
114
+ Execute module with given parameters
115
+
116
+ Args:
117
+ parameters: Execution parameters
118
+ context: Optional execution context
119
+
120
+ Returns:
121
+ ModuleResult: Execution result
122
+ """
123
+ pass
124
+
125
+ async def initialize(self) -> bool:
126
+ """
127
+ Initialize module (load models, connect to services, etc.)
128
+ Override this if your module needs initialization
129
+
130
+ Returns:
131
+ bool: True if initialization successful
132
+ """
133
+ self._is_initialized = True
134
+ return True
135
+
136
+ async def cleanup(self):
137
+ """
138
+ Clean up module resources
139
+ Override this if your module needs cleanup
140
+ """
141
+ self._is_initialized = False
142
+
143
+ def is_initialized(self) -> bool:
144
+ """Check if module is initialized"""
145
+ return self._is_initialized
146
+
147
+ def can_handle(self, parameters: Dict[str, Any]) -> bool:
148
+ """
149
+ Check if this module can handle given parameters
150
+ Override this for custom logic
151
+
152
+ Args:
153
+ parameters: Parameters to check
154
+
155
+ Returns:
156
+ bool: True if module can handle these parameters
157
+ """
158
+ return True
159
+
160
+ def estimate_cost(self, parameters: Dict[str, Any]) -> float:
161
+ """
162
+ Estimate execution cost/time for given parameters
163
+ Used for module selection
164
+
165
+ Args:
166
+ parameters: Parameters to estimate for
167
+
168
+ Returns:
169
+ float: Estimated cost (lower is better)
170
+ """
171
+ return 1.0
172
+
173
+ def __repr__(self) -> str:
174
+ return f"<{self.__class__.__name__}(name='{self.name}', type='{self.module_type}')>"
app/modules/capabilities.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module Capability Definitions
3
+ Pre-defined capability sets for different module types
4
+ """
5
+
6
+ from app.modules.base import ModuleCapability
7
+
8
+
9
+ class ScrapingCapability:
10
+ """Capability definitions for scraping modules"""
11
+
12
+ STATIC = ModuleCapability(
13
+ can_scrape_static=True,
14
+ can_scrape_dynamic=False,
15
+ can_handle_javascript=False,
16
+ can_authenticate=False,
17
+ supported_input_formats={'html', 'xml'},
18
+ supported_output_formats={'json', 'csv', 'dict'},
19
+ max_concurrent_requests=5,
20
+ estimated_speed="fast",
21
+ memory_usage="low",
22
+ requires_browser=False
23
+ )
24
+
25
+ DYNAMIC = ModuleCapability(
26
+ can_scrape_static=True,
27
+ can_scrape_dynamic=True,
28
+ can_handle_javascript=True,
29
+ can_authenticate=True,
30
+ supported_input_formats={'html', 'javascript'},
31
+ supported_output_formats={'json', 'csv', 'dict'},
32
+ max_concurrent_requests=2,
33
+ estimated_speed="medium",
34
+ memory_usage="high",
35
+ requires_browser=True
36
+ )
37
+
38
+ API_CLIENT = ModuleCapability(
39
+ can_handle_api=True,
40
+ supported_input_formats={'api', 'rest', 'graphql'},
41
+ supported_output_formats={'json', 'dict'},
42
+ max_concurrent_requests=10,
43
+ estimated_speed="fast",
44
+ memory_usage="low",
45
+ requires_api_key=True
46
+ )
47
+
48
+
49
+ class ProcessingCapability:
50
+ """Capability definitions for data processing modules"""
51
+
52
+ DATA_CLEANER = ModuleCapability(
53
+ can_process_data=True,
54
+ can_clean_data=True,
55
+ supported_input_formats={'csv', 'json', 'dict', 'dataframe'},
56
+ supported_output_formats={'csv', 'json', 'dict', 'dataframe'},
57
+ max_concurrent_requests=1,
58
+ estimated_speed="fast",
59
+ memory_usage="medium"
60
+ )
61
+
62
+ DATA_TRANSFORMER = ModuleCapability(
63
+ can_process_data=True,
64
+ can_transform_data=True,
65
+ can_aggregate=True,
66
+ can_filter=True,
67
+ can_sort=True,
68
+ supported_input_formats={'csv', 'json', 'dict', 'dataframe'},
69
+ supported_output_formats={'csv', 'json', 'dict', 'dataframe'},
70
+ max_concurrent_requests=1,
71
+ estimated_speed="medium",
72
+ memory_usage="medium"
73
+ )
74
+
75
+
76
+ class VisualizationCapability:
77
+ """Capability definitions for visualization modules"""
78
+
79
+ CHART_CREATOR = ModuleCapability(
80
+ can_visualize=True,
81
+ can_create_charts=True,
82
+ supported_input_formats={'csv', 'json', 'dict', 'dataframe'},
83
+ supported_output_formats={'png', 'jpg', 'svg', 'html'},
84
+ max_concurrent_requests=1,
85
+ estimated_speed="medium",
86
+ memory_usage="medium"
87
+ )
88
+
89
+ MAP_CREATOR = ModuleCapability(
90
+ can_visualize=True,
91
+ can_create_maps=True,
92
+ supported_input_formats={'csv', 'json', 'dict', 'geojson'},
93
+ supported_output_formats={'html', 'png'},
94
+ max_concurrent_requests=1,
95
+ estimated_speed="slow",
96
+ memory_usage="high"
97
+ )
98
+
99
+
100
+ class OutputCapability:
101
+ """Capability definitions for output/export modules"""
102
+
103
+ CSV_EXPORTER = ModuleCapability(
104
+ can_export_csv=True,
105
+ supported_input_formats={'dict', 'list', 'dataframe'},
106
+ supported_output_formats={'csv'},
107
+ max_concurrent_requests=1,
108
+ estimated_speed="fast",
109
+ memory_usage="low"
110
+ )
111
+
112
+ EXCEL_EXPORTER = ModuleCapability(
113
+ can_export_excel=True,
114
+ supported_input_formats={'dict', 'list', 'dataframe'},
115
+ supported_output_formats={'xlsx', 'xls'},
116
+ max_concurrent_requests=1,
117
+ estimated_speed="medium",
118
+ memory_usage="medium"
119
+ )
120
+
121
+ JSON_EXPORTER = ModuleCapability(
122
+ can_export_json=True,
123
+ supported_input_formats={'dict', 'list', 'dataframe'},
124
+ supported_output_formats={'json'},
125
+ max_concurrent_requests=1,
126
+ estimated_speed="fast",
127
+ memory_usage="low"
128
+ )
app/modules/mock_modules.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mock Modules for Testing
3
+ These simulate real modules until Phase 2 modules are built
4
+ """
5
+
6
+ from typing import Dict, Any, Optional
7
+ import time
8
+
9
+ from app.modules.base import (
10
+ BaseModule,
11
+ ModuleType,
12
+ ModuleCapability,
13
+ ModuleResult
14
+ )
15
+ from app.modules.capabilities import (
16
+ ScrapingCapability,
17
+ ProcessingCapability,
18
+ VisualizationCapability,
19
+ OutputCapability
20
+ )
21
+
22
+ from app.core.logging import get_logger
23
+ from app.modules.registry import ModuleRegistry
24
+
25
+ logger = get_logger(__name__)
26
+
27
+
28
+ class MockStaticScraper(BaseModule):
29
+ """Mock static web scraper"""
30
+
31
+ def __init__(self):
32
+ super().__init__(name="static_scraper", module_type=ModuleType.SCRAPER)
33
+
34
+ def get_capabilities(self) -> ModuleCapability:
35
+ return ScrapingCapability.STATIC
36
+
37
+ async def execute(
38
+ self,
39
+ parameters: Dict[str, Any],
40
+ context: Optional[Dict[str, Any]] = None
41
+ ) -> ModuleResult:
42
+ """Mock scraping execution"""
43
+ self.logger.info(f"[MOCK] Scraping static HTML from: {parameters.get('url')}")
44
+
45
+ # Simulate work
46
+ await self._simulate_work(1.0)
47
+
48
+ # Mock data
49
+ mock_data = [
50
+ {'name': 'Product 1', 'price': '$10.99'},
51
+ {'name': 'Product 2', 'price': '$24.99'},
52
+ {'name': 'Product 3', 'price': '$15.49'}
53
+ ]
54
+
55
+ return ModuleResult(
56
+ success=True,
57
+ data=mock_data,
58
+ metadata={'rows': len(mock_data), 'source': 'mock'},
59
+ execution_time=1.0
60
+ )
61
+
62
+ async def _simulate_work(self, seconds: float):
63
+ """Simulate async work"""
64
+ import asyncio
65
+ await asyncio.sleep(seconds)
66
+
67
+
68
+ class MockDynamicScraper(BaseModule):
69
+ """Mock dynamic web scraper (Playwright)"""
70
+
71
+ def __init__(self):
72
+ super().__init__(name="dynamic_scraper", module_type=ModuleType.SCRAPER)
73
+
74
+ def get_capabilities(self) -> ModuleCapability:
75
+ return ScrapingCapability.DYNAMIC
76
+
77
+ async def execute(
78
+ self,
79
+ parameters: Dict[str, Any],
80
+ context: Optional[Dict[str, Any]] = None
81
+ ) -> ModuleResult:
82
+ """Mock dynamic scraping"""
83
+ self.logger.info(f"[MOCK] Scraping with JavaScript from: {parameters.get('url')}")
84
+
85
+ await self._simulate_work(2.0)
86
+
87
+ mock_data = [
88
+ {'name': 'Dynamic Product 1', 'price': '$29.99'},
89
+ {'name': 'Dynamic Product 2', 'price': '$49.99'}
90
+ ]
91
+
92
+ return ModuleResult(
93
+ success=True,
94
+ data=mock_data,
95
+ metadata={'rows': len(mock_data), 'method': 'playwright'},
96
+ execution_time=2.0
97
+ )
98
+
99
+ async def _simulate_work(self, seconds: float):
100
+ import asyncio
101
+ await asyncio.sleep(seconds)
102
+
103
+
104
+ class MockDataProcessor(BaseModule):
105
+ """Mock data processor"""
106
+
107
+ def __init__(self):
108
+ super().__init__(name="data_processor", module_type=ModuleType.PROCESSOR)
109
+
110
+ def get_capabilities(self) -> ModuleCapability:
111
+ return ProcessingCapability.DATA_TRANSFORMER
112
+
113
+ async def execute(
114
+ self,
115
+ parameters: Dict[str, Any],
116
+ context: Optional[Dict[str, Any]] = None
117
+ ) -> ModuleResult:
118
+ """Mock data processing"""
119
+ self.logger.info("[MOCK] Processing data with filters and aggregations")
120
+
121
+ # Get input data from context
122
+ input_data = context.get('data', []) if context else []
123
+
124
+ # Mock filtering
125
+ if parameters.get('filters'):
126
+ self.logger.info(f"[MOCK] Applying {len(parameters['filters'])} filters")
127
+
128
+ # Mock aggregation
129
+ if parameters.get('aggregations'):
130
+ self.logger.info(f"[MOCK] Applying {len(parameters['aggregations'])} aggregations")
131
+
132
+ mock_result = {
133
+ 'filtered_rows': len(input_data),
134
+ 'aggregated': True,
135
+ 'data': input_data
136
+ }
137
+
138
+ return ModuleResult(
139
+ success=True,
140
+ data=mock_result,
141
+ metadata={'processed': True},
142
+ execution_time=0.5
143
+ )
144
+
145
+
146
+ class MockChartCreator(BaseModule):
147
+ """Mock chart/visualization creator"""
148
+
149
+ def __init__(self):
150
+ super().__init__(name="chart_creator", module_type=ModuleType.VISUALIZER)
151
+
152
+ def get_capabilities(self) -> ModuleCapability:
153
+ return VisualizationCapability.CHART_CREATOR
154
+
155
+ async def execute(
156
+ self,
157
+ parameters: Dict[str, Any],
158
+ context: Optional[Dict[str, Any]] = None
159
+ ) -> ModuleResult:
160
+ """Mock chart creation"""
161
+ chart_type = parameters.get('chart_type', 'bar')
162
+ self.logger.info(f"[MOCK] Creating {chart_type} chart")
163
+
164
+ mock_chart = {
165
+ 'type': chart_type,
166
+ 'file': f'/tmp/chart_{chart_type}.png',
167
+ 'created': True
168
+ }
169
+
170
+ return ModuleResult(
171
+ success=True,
172
+ data=mock_chart,
173
+ metadata={'chart_type': chart_type},
174
+ execution_time=0.8
175
+ )
176
+
177
+
178
+ class MockCSVExporter(BaseModule):
179
+ """Mock CSV exporter"""
180
+
181
+ def __init__(self):
182
+ super().__init__(name="csv_exporter", module_type=ModuleType.EXPORTER)
183
+
184
+ def get_capabilities(self) -> ModuleCapability:
185
+ return OutputCapability.CSV_EXPORTER
186
+
187
+ async def execute(
188
+ self,
189
+ parameters: Dict[str, Any],
190
+ context: Optional[Dict[str, Any]] = None
191
+ ) -> ModuleResult:
192
+ """Mock CSV export"""
193
+ filename = parameters.get('filename', 'output.csv')
194
+ self.logger.info(f"[MOCK] Exporting to CSV: {filename}")
195
+
196
+ mock_export = {
197
+ 'filename': filename,
198
+ 'path': f'/tmp/{filename}',
199
+ 'rows': 10,
200
+ 'exported': True
201
+ }
202
+
203
+ return ModuleResult(
204
+ success=True,
205
+ data=mock_export,
206
+ metadata={'format': 'csv'},
207
+ execution_time=0.3
208
+ )
209
+
210
+
211
+ class MockAPIClient(BaseModule):
212
+ """Mock API client"""
213
+
214
+ def __init__(self):
215
+ super().__init__(name="api_client", module_type=ModuleType.API_CLIENT)
216
+
217
+ def get_capabilities(self) -> ModuleCapability:
218
+ return ScrapingCapability.API_CLIENT
219
+
220
+ async def execute(
221
+ self,
222
+ parameters: Dict[str, Any],
223
+ context: Optional[Dict[str, Any]] = None
224
+ ) -> ModuleResult:
225
+ """Mock API call"""
226
+ api_url = parameters.get('url', 'https://api.example.com')
227
+ self.logger.info(f"[MOCK] Calling API: {api_url}")
228
+
229
+ await self._simulate_work(0.5)
230
+
231
+ mock_data = {
232
+ 'status': 'success',
233
+ 'data': [
234
+ {'id': 1, 'name': 'Item 1'},
235
+ {'id': 2, 'name': 'Item 2'}
236
+ ]
237
+ }
238
+
239
+ return ModuleResult(
240
+ success=True,
241
+ data=mock_data,
242
+ metadata={'api_url': api_url},
243
+ execution_time=0.5
244
+ )
245
+
246
+ async def _simulate_work(self, seconds: float):
247
+ import asyncio
248
+ await asyncio.sleep(seconds)
249
+
250
+
251
+ def register_mock_modules(registry: Optional['ModuleRegistry'] = None):
252
+ """
253
+ Register all mock modules for testing
254
+
255
+ Args:
256
+ registry: Registry to register to (creates new if None)
257
+ """
258
+ from app.modules.registry import ModuleRegistry
259
+
260
+ if registry is None:
261
+ registry = ModuleRegistry()
262
+
263
+ # Register all mock modules
264
+ registry.register(MockStaticScraper())
265
+ registry.register(MockDynamicScraper())
266
+ registry.register(MockDataProcessor())
267
+ registry.register(MockChartCreator())
268
+ registry.register(MockCSVExporter())
269
+ registry.register(MockAPIClient())
270
+
271
+ logger.info("βœ… Registered 6 mock modules for testing")
272
+
273
+ return registry
app/modules/registry.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module Registry
3
+ Central registry for all processing modules
4
+ Handles module registration, discovery, and selection
5
+ """
6
+
7
+ from typing import Dict, List, Optional, Type, Set
8
+ from collections import defaultdict
9
+
10
+ from app.modules.base import BaseModule, ModuleType, ModuleCapability
11
+ from app.orchestrator.parameter_models import ExtractedParameters
12
+ from app.orchestrator.models import TaskClassification
13
+ from app.core.logging import get_logger
14
+
15
+ logger = get_logger(__name__)
16
+
17
+
18
+ class ModuleRegistry:
19
+ """
20
+ Central registry for all processing modules
21
+ Singleton pattern - only one registry exists
22
+ """
23
+
24
+ _instance = None
25
+
26
+ def __new__(cls):
27
+ if cls._instance is None:
28
+ cls._instance = super().__new__(cls)
29
+ cls._instance._initialized = False
30
+ return cls._instance
31
+
32
+ def __init__(self):
33
+ """Initialize module registry"""
34
+ if self._initialized:
35
+ return
36
+
37
+ self.modules: Dict[str, BaseModule] = {}
38
+ self.modules_by_type: Dict[ModuleType, List[BaseModule]] = defaultdict(list)
39
+ self._initialized = True
40
+
41
+ logger.info("ModuleRegistry initialized")
42
+
43
+ def register(self, module: BaseModule):
44
+ """
45
+ Register a module
46
+
47
+ Args:
48
+ module: Module to register
49
+ """
50
+ if module.name in self.modules:
51
+ logger.warning(f"Module '{module.name}' already registered, replacing")
52
+
53
+ self.modules[module.name] = module
54
+ self.modules_by_type[module.module_type].append(module)
55
+
56
+ logger.info(
57
+ f"βœ“ Registered module: {module.name} "
58
+ f"(type: {module.module_type.value})"
59
+ )
60
+
61
+ def unregister(self, module_name: str) -> bool:
62
+ """
63
+ Unregister a module
64
+
65
+ Args:
66
+ module_name: Name of module to unregister
67
+
68
+ Returns:
69
+ bool: True if unregistered
70
+ """
71
+ if module_name not in self.modules:
72
+ return False
73
+
74
+ module = self.modules[module_name]
75
+ del self.modules[module_name]
76
+ self.modules_by_type[module.module_type].remove(module)
77
+
78
+ logger.info(f"Unregistered module: {module_name}")
79
+ return True
80
+
81
+ def get_module(self, name: str) -> Optional[BaseModule]:
82
+ """Get module by name"""
83
+ return self.modules.get(name)
84
+
85
+ def get_modules_by_type(self, module_type: ModuleType) -> List[BaseModule]:
86
+ """Get all modules of a specific type"""
87
+ return self.modules_by_type.get(module_type, [])
88
+
89
+ def get_all_modules(self) -> List[BaseModule]:
90
+ """Get all registered modules"""
91
+ return list(self.modules.values())
92
+
93
+ def list_modules(self) -> Dict[str, Dict]:
94
+ """
95
+ List all registered modules with their info
96
+
97
+ Returns:
98
+ Dict: Module information
99
+ """
100
+ result = {}
101
+
102
+ for name, module in self.modules.items():
103
+ capabilities = module.get_capabilities()
104
+
105
+ result[name] = {
106
+ 'type': module.module_type.value,
107
+ 'initialized': module.is_initialized(),
108
+ 'capabilities': capabilities.dict()
109
+ }
110
+
111
+ return result
112
+
113
+ def clear(self):
114
+ """Clear all registered modules (for testing)"""
115
+ self.modules.clear()
116
+ self.modules_by_type.clear()
117
+ logger.info("Registry cleared")
118
+
119
+
120
+ class ModuleSelector:
121
+ """
122
+ Selects appropriate modules based on task requirements
123
+ Uses classification and parameters to find best modules
124
+ """
125
+
126
+ def __init__(self, registry: Optional[ModuleRegistry] = None):
127
+ """
128
+ Initialize module selector
129
+
130
+ Args:
131
+ registry: Module registry to use (creates new if None)
132
+ """
133
+ self.registry = registry or ModuleRegistry()
134
+ logger.debug("ModuleSelector initialized")
135
+
136
+ def select_modules(
137
+ self,
138
+ classification: TaskClassification,
139
+ parameters: ExtractedParameters
140
+ ) -> List[BaseModule]:
141
+ """
142
+ Select appropriate modules for task
143
+
144
+ Args:
145
+ classification: Task classification
146
+ parameters: Extracted parameters
147
+
148
+ Returns:
149
+ List[BaseModule]: Selected modules in execution order
150
+ """
151
+ logger.info("πŸ” Selecting modules for task")
152
+ logger.debug(
153
+ f"Task type: {classification.primary_task.value}, "
154
+ f"Complexity: {classification.complexity.value}"
155
+ )
156
+
157
+ selected = []
158
+
159
+ # Step 1: Select data sourcing module
160
+ sourcing_module = self._select_sourcing_module(classification, parameters)
161
+ if sourcing_module:
162
+ selected.append(sourcing_module)
163
+
164
+ # Step 2: Select processing modules
165
+ processing_modules = self._select_processing_modules(classification, parameters)
166
+ selected.extend(processing_modules)
167
+
168
+ # Step 3: Select visualization module (if needed)
169
+ viz_module = self._select_visualization_module(classification, parameters)
170
+ if viz_module:
171
+ selected.append(viz_module)
172
+
173
+ # Step 4: Select output/export module
174
+ output_module = self._select_output_module(classification, parameters)
175
+ if output_module:
176
+ selected.append(output_module)
177
+
178
+ logger.info(
179
+ f"βœ… Selected {len(selected)} modules: "
180
+ f"{[m.name for m in selected]}"
181
+ )
182
+
183
+ return selected
184
+
185
+ def _select_sourcing_module(
186
+ self,
187
+ classification: TaskClassification,
188
+ parameters: ExtractedParameters
189
+ ) -> Optional[BaseModule]:
190
+ """Select data sourcing module (scraper, API client, etc.)"""
191
+
192
+ # No data sources, no module needed
193
+ if not parameters.data_sources:
194
+ logger.debug("No data sources, skipping sourcing module")
195
+ return None
196
+
197
+ data_source = parameters.data_sources[0] # Use first source
198
+
199
+ # API data source
200
+ if data_source.type == 'api':
201
+ logger.debug("Selecting API client module")
202
+ candidates = self.registry.get_modules_by_type(ModuleType.API_CLIENT)
203
+
204
+ for module in candidates:
205
+ if module.get_capabilities().can_handle_api:
206
+ logger.info(f"βœ“ Selected API client: {module.name}")
207
+ return module
208
+
209
+ # URL/web scraping
210
+ if data_source.type == 'url':
211
+ # Check if JavaScript needed from CLASSIFICATION (primary source)
212
+ needs_javascript = classification.requires_javascript # ← Fixed: Check classification first
213
+
214
+ # Also check parameters.urls if present
215
+ if parameters.urls:
216
+ needs_javascript = needs_javascript or any(
217
+ u.requires_javascript for u in parameters.urls
218
+ )
219
+
220
+ if needs_javascript:
221
+ logger.debug("JavaScript required, selecting dynamic scraper")
222
+ candidates = self.registry.get_modules_by_type(ModuleType.SCRAPER)
223
+
224
+ for module in candidates:
225
+ if module.get_capabilities().can_scrape_dynamic:
226
+ logger.info(f"βœ“ Selected dynamic scraper: {module.name}")
227
+ return module
228
+
229
+ logger.warning("JavaScript needed but no dynamic scraper available")
230
+
231
+ # Static HTML scraping
232
+ logger.debug("Selecting static scraper")
233
+ candidates = self.registry.get_modules_by_type(ModuleType.SCRAPER)
234
+
235
+ for module in candidates:
236
+ if module.get_capabilities().can_scrape_static:
237
+ logger.info(f"βœ“ Selected static scraper: {module.name}")
238
+ return module
239
+
240
+ logger.warning("No suitable sourcing module found")
241
+ return None
242
+
243
+ def _select_processing_modules(
244
+ self,
245
+ classification: TaskClassification,
246
+ parameters: ExtractedParameters
247
+ ) -> List[BaseModule]:
248
+ """Select data processing modules"""
249
+
250
+ selected = []
251
+
252
+ # Data cleaning (if filters present)
253
+ if parameters.filters:
254
+ logger.debug("Filters detected, need data processor")
255
+ candidates = self.registry.get_modules_by_type(ModuleType.PROCESSOR)
256
+
257
+ for module in candidates:
258
+ caps = module.get_capabilities()
259
+ if caps.can_filter or caps.can_transform_data:
260
+ logger.info(f"βœ“ Selected processor: {module.name}")
261
+ selected.append(module)
262
+ break
263
+
264
+ # Aggregation (if aggregations present)
265
+ if parameters.aggregations:
266
+ logger.debug("Aggregations detected")
267
+ candidates = self.registry.get_modules_by_type(ModuleType.PROCESSOR)
268
+
269
+ for module in candidates:
270
+ if module.get_capabilities().can_aggregate:
271
+ if module not in selected:
272
+ logger.info(f"βœ“ Selected aggregator: {module.name}")
273
+ selected.append(module)
274
+ break
275
+
276
+ return selected
277
+
278
+ def _select_visualization_module(
279
+ self,
280
+ classification: TaskClassification,
281
+ parameters: ExtractedParameters
282
+ ) -> Optional[BaseModule]:
283
+ """Select visualization module"""
284
+
285
+ if not parameters.visualizations:
286
+ return None
287
+
288
+ viz_req = parameters.visualizations[0] # Use first visualization
289
+
290
+ # Map visualization
291
+ if viz_req.type == 'map':
292
+ logger.debug("Map visualization needed")
293
+ candidates = self.registry.get_modules_by_type(ModuleType.VISUALIZER)
294
+
295
+ for module in candidates:
296
+ if module.get_capabilities().can_create_maps:
297
+ logger.info(f"βœ“ Selected map creator: {module.name}")
298
+ return module
299
+
300
+ # Chart visualization
301
+ if viz_req.type == 'chart':
302
+ logger.debug("Chart visualization needed")
303
+ candidates = self.registry.get_modules_by_type(ModuleType.VISUALIZER)
304
+
305
+ for module in candidates:
306
+ if module.get_capabilities().can_create_charts:
307
+ logger.info(f"βœ“ Selected chart creator: {module.name}")
308
+ return module
309
+
310
+ return None
311
+
312
+ def _select_output_module(
313
+ self,
314
+ classification: TaskClassification,
315
+ parameters: ExtractedParameters
316
+ ) -> Optional[BaseModule]:
317
+ """Select output/export module"""
318
+
319
+ if not parameters.output:
320
+ logger.debug("No output format specified, using default CSV")
321
+ output_format = 'csv'
322
+ else:
323
+ output_format = parameters.output.format
324
+
325
+ candidates = self.registry.get_modules_by_type(ModuleType.EXPORTER)
326
+
327
+ # Match by format
328
+ for module in candidates:
329
+ caps = module.get_capabilities()
330
+
331
+ if output_format == 'csv' and caps.can_export_csv:
332
+ logger.info(f"βœ“ Selected CSV exporter: {module.name}")
333
+ return module
334
+
335
+ elif output_format == 'excel' and caps.can_export_excel:
336
+ logger.info(f"βœ“ Selected Excel exporter: {module.name}")
337
+ return module
338
+
339
+ elif output_format == 'json' and caps.can_export_json:
340
+ logger.info(f"βœ“ Selected JSON exporter: {module.name}")
341
+ return module
342
+
343
+ logger.warning(f"No exporter for format: {output_format}")
344
+ return None
345
+
346
+ def can_execute_task(
347
+ self,
348
+ classification: TaskClassification,
349
+ parameters: ExtractedParameters
350
+ ) -> bool:
351
+ """
352
+ Check if task can be executed with available modules
353
+
354
+ Args:
355
+ classification: Task classification
356
+ parameters: Extracted parameters
357
+
358
+ Returns:
359
+ bool: True if task can be executed
360
+ """
361
+ selected = self.select_modules(classification, parameters)
362
+
363
+ # Need at least one module to execute
364
+ if not selected:
365
+ logger.warning("No modules selected, cannot execute task")
366
+ return False
367
+
368
+ # Check if we have sourcing module (if data sources present)
369
+ if parameters.data_sources and not any(
370
+ m.module_type in [ModuleType.SCRAPER, ModuleType.API_CLIENT]
371
+ for m in selected
372
+ ):
373
+ logger.warning("Data sources present but no sourcing module")
374
+ return False
375
+
376
+ return True
377
+
378
+
379
+ # Convenience function
380
+ def get_module_registry() -> ModuleRegistry:
381
+ """Get global module registry instance"""
382
+ return ModuleRegistry()
383
+
384
+
385
+ def get_module_selector() -> ModuleSelector:
386
+ """Get module selector instance"""
387
+ return ModuleSelector()
app/orchestrator/models.py CHANGED
@@ -41,6 +41,7 @@ class OutputFormat(str, Enum):
41
  CSV = "csv"
42
  IMAGE = "image"
43
  CHART = "chart"
 
44
  HTML = "html"
45
  PDF = "pdf"
46
  UNKNOWN = "unknown"
 
41
  CSV = "csv"
42
  IMAGE = "image"
43
  CHART = "chart"
44
+ EXCEL = "excel"
45
  HTML = "html"
46
  PDF = "pdf"
47
  UNKNOWN = "unknown"
test/test_module_registry.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test Module Registry
3
+ Comprehensive tests for module registration and selection
4
+ """
5
+
6
+ import asyncio
7
+ import sys
8
+ import os
9
+
10
+ # Add project root to Python path
11
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12
+ sys.path.insert(0, project_root)
13
+ from app.modules.registry import ModuleRegistry, ModuleSelector
14
+ from app.modules.mock_modules import register_mock_modules
15
+ from app.orchestrator.models import TaskClassification, TaskType, ComplexityLevel, OutputFormat
16
+ from app.orchestrator.parameter_models import (
17
+ ExtractedParameters,
18
+ DataSource,
19
+ FilterCondition,
20
+ VisualizationRequirement,
21
+ OutputRequirement
22
+ )
23
+ from app.core.logging import setup_logging, get_logger
24
+
25
+ setup_logging()
26
+ logger = get_logger(__name__)
27
+
28
+
29
+ def test_registry_registration():
30
+ """Test module registration"""
31
+
32
+ print("\n" + "=" * 60)
33
+ print("Test 1: Module Registration")
34
+ print("=" * 60)
35
+
36
+ # Clear registry
37
+ registry = ModuleRegistry()
38
+ registry.clear()
39
+
40
+ # Register mock modules
41
+ register_mock_modules(registry)
42
+
43
+ # Check registration
44
+ all_modules = registry.get_all_modules()
45
+
46
+ print(f"\nβœ“ Registered {len(all_modules)} modules:")
47
+ for module in all_modules:
48
+ print(f" - {module.name} ({module.module_type.value})")
49
+
50
+ # List with details
51
+ module_info = registry.list_modules()
52
+
53
+ print(f"\nπŸ“Š Module Details:")
54
+ for name, info in module_info.items():
55
+ print(f"\n {name}:")
56
+ print(f" Type: {info['type']}")
57
+ print(f" Initialized: {info['initialized']}")
58
+
59
+ caps = info['capabilities']
60
+ cap_list = [k for k, v in caps.items() if v and k.startswith('can_')]
61
+ if cap_list:
62
+ print(f" Capabilities: {', '.join(cap_list[:3])}...")
63
+
64
+
65
+ def test_simple_scraping_selection():
66
+ """Test module selection for simple scraping task"""
67
+
68
+ print("\n" + "=" * 60)
69
+ print("Test 2: Simple Scraping Task")
70
+ print("=" * 60)
71
+
72
+ # Setup
73
+ registry = ModuleRegistry()
74
+ registry.clear()
75
+ register_mock_modules(registry)
76
+
77
+ selector = ModuleSelector(registry)
78
+
79
+ # Create simple scraping task
80
+ classification = TaskClassification(
81
+ primary_task=TaskType.WEB_SCRAPING,
82
+ secondary_tasks=[],
83
+ complexity=ComplexityLevel.SIMPLE,
84
+ estimated_steps=2,
85
+ requires_javascript=False,
86
+ requires_authentication=False,
87
+ output_format=OutputFormat.CSV,
88
+ confidence=0.9, # ← Added
89
+ reasoning="Simple static web scraping task" # ← Added
90
+ )
91
+
92
+ parameters = ExtractedParameters(
93
+ data_sources=[
94
+ DataSource(
95
+ type='url',
96
+ location='https://example.com/products',
97
+ format='html',
98
+ description='Product listing page'
99
+ )
100
+ ],
101
+ output=OutputRequirement(
102
+ format='csv',
103
+ description='Export as CSV'
104
+ )
105
+ )
106
+
107
+ print("\nπŸ“‹ Task:")
108
+ print(" Type: Simple web scraping")
109
+ print(" JavaScript: No")
110
+ print(" Output: CSV")
111
+ print("-" * 60)
112
+
113
+ # Select modules
114
+ selected = selector.select_modules(classification, parameters)
115
+
116
+ print(f"\nβœ… Selected {len(selected)} modules:")
117
+ for i, module in enumerate(selected, 1):
118
+ print(f" {i}. {module.name} ({module.module_type.value})")
119
+
120
+ # Verify
121
+ assert len(selected) >= 2, "Should select at least scraper + exporter"
122
+ assert any(m.name == 'static_scraper' for m in selected), "Should use static scraper"
123
+ assert any(m.name == 'csv_exporter' for m in selected), "Should use CSV exporter"
124
+
125
+ print("\nβœ“ Correct modules selected!")
126
+
127
+
128
+ def test_dynamic_scraping_selection():
129
+ """Test module selection for dynamic scraping (JavaScript)"""
130
+
131
+ print("\n" + "=" * 60)
132
+ print("Test 3: Dynamic Scraping Task (JavaScript)")
133
+ print("=" * 60)
134
+
135
+ registry = ModuleRegistry()
136
+ registry.clear()
137
+ register_mock_modules(registry)
138
+
139
+ selector = ModuleSelector(registry)
140
+
141
+ # Create dynamic scraping task
142
+ classification = TaskClassification(
143
+ primary_task=TaskType.WEB_SCRAPING,
144
+ secondary_tasks=[],
145
+ complexity=ComplexityLevel.MEDIUM,
146
+ estimated_steps=3,
147
+ requires_javascript=True,
148
+ requires_authentication=False,
149
+ output_format=OutputFormat.JSON,
150
+ confidence=0.85, # ← Added
151
+ reasoning="Dynamic web scraping with JavaScript" # ← Added
152
+ )
153
+
154
+ parameters = ExtractedParameters(
155
+ data_sources=[
156
+ DataSource(
157
+ type='url',
158
+ location='https://example.com/dynamic-products',
159
+ format='html',
160
+ description='Dynamic product listing (JavaScript)'
161
+ )
162
+ ]
163
+ )
164
+
165
+ print("\nπŸ“‹ Task:")
166
+ print(" Type: Dynamic web scraping")
167
+ print(" JavaScript: Yes")
168
+ print(" Output: JSON")
169
+ print("-" * 60)
170
+
171
+ # Select modules
172
+ selected = selector.select_modules(classification, parameters)
173
+
174
+ print(f"\nβœ… Selected {len(selected)} modules:")
175
+ for i, module in enumerate(selected, 1):
176
+ print(f" {i}. {module.name} ({module.module_type.value})")
177
+
178
+ # Verify dynamic scraper selected
179
+ assert any(m.name == 'dynamic_scraper' for m in selected), \
180
+ "Should use dynamic scraper for JavaScript"
181
+
182
+ print("\nβœ“ Dynamic scraper selected for JavaScript task!")
183
+
184
+
185
+ def test_complex_analysis_selection():
186
+ """Test module selection for complex data analysis"""
187
+
188
+ print("\n" + "=" * 60)
189
+ print("Test 4: Complex Data Analysis Task")
190
+ print("=" * 60)
191
+
192
+ registry = ModuleRegistry()
193
+ registry.clear()
194
+ register_mock_modules(registry)
195
+
196
+ selector = ModuleSelector(registry)
197
+
198
+ # Create complex analysis task
199
+ classification = TaskClassification(
200
+ primary_task=TaskType.ML_ANALYSIS,
201
+ secondary_tasks=[TaskType.VISUALIZATION],
202
+ complexity=ComplexityLevel.COMPLEX,
203
+ estimated_steps=5,
204
+ requires_javascript=False,
205
+ output_format=OutputFormat.EXCEL,
206
+ confidence=0.88, # ← Added
207
+ reasoning="Complex data analysis with filtering and visualization" # ← Added
208
+ )
209
+
210
+ parameters = ExtractedParameters(
211
+ data_sources=[
212
+ DataSource(
213
+ type='url',
214
+ location='https://example.com/sales.csv',
215
+ format='csv',
216
+ description='Sales data'
217
+ )
218
+ ],
219
+ filters=[
220
+ FilterCondition(
221
+ field='region',
222
+ operator='equals',
223
+ value='North',
224
+ description='Filter for North region'
225
+ )
226
+ ],
227
+ visualizations=[
228
+ VisualizationRequirement(
229
+ type='chart',
230
+ chart_type='bar',
231
+ description='Bar chart of sales by category'
232
+ )
233
+ ]
234
+ )
235
+
236
+ print("\nπŸ“‹ Task:")
237
+ print(" Type: Data analysis + visualization")
238
+ print(" Has filters: Yes")
239
+ print(" Has visualizations: Yes")
240
+ print(" Output: Excel")
241
+ print("-" * 60)
242
+
243
+ # Select modules
244
+ selected = selector.select_modules(classification, parameters)
245
+
246
+ print(f"\nβœ… Selected {len(selected)} modules:")
247
+ for i, module in enumerate(selected, 1):
248
+ caps = module.get_capabilities()
249
+ cap_names = [k for k, v in caps.dict().items() if v and k.startswith('can_')]
250
+ print(f" {i}. {module.name} ({module.module_type.value})")
251
+ print(f" Capabilities: {', '.join(cap_names[:2])}...")
252
+
253
+ # Verify correct module types
254
+ module_types = [m.module_type.value for m in selected]
255
+
256
+ print(f"\nπŸ“Š Module Pipeline:")
257
+ print(f" Scraper: {'βœ“' if 'scraper' in module_types else 'βœ—'}")
258
+ print(f" Processor: {'βœ“' if 'processor' in module_types else 'βœ—'}")
259
+ print(f" Visualizer: {'βœ“' if 'visualizer' in module_types else 'βœ—'}")
260
+ print(f" Exporter: {'βœ“' if 'exporter' in module_types else 'βœ—'}")
261
+
262
+
263
+ def test_api_task_selection():
264
+ """Test module selection for API-based task"""
265
+
266
+ print("\n" + "=" * 60)
267
+ print("Test 5: API Data Fetching Task")
268
+ print("=" * 60)
269
+
270
+ registry = ModuleRegistry()
271
+ registry.clear()
272
+ register_mock_modules(registry)
273
+
274
+ selector = ModuleSelector(registry)
275
+
276
+ # Create API task
277
+ classification = TaskClassification(
278
+ primary_task=TaskType.WEB_SCRAPING,
279
+ secondary_tasks=[],
280
+ complexity=ComplexityLevel.SIMPLE,
281
+ estimated_steps=2,
282
+ output_format=OutputFormat.JSON,
283
+ confidence=0.92, # ← Added
284
+ reasoning="API data fetching task" # ← Added
285
+ )
286
+
287
+ parameters = ExtractedParameters(
288
+ data_sources=[
289
+ DataSource(
290
+ type='api',
291
+ location='https://api.example.com/users',
292
+ format='json',
293
+ description='User data API'
294
+ )
295
+ ]
296
+ )
297
+
298
+ print("\nπŸ“‹ Task:")
299
+ print(" Type: API data fetching")
300
+ print(" Source: REST API")
301
+ print(" Output: JSON")
302
+ print("-" * 60)
303
+
304
+ # Select modules
305
+ selected = selector.select_modules(classification, parameters)
306
+
307
+ print(f"\nβœ… Selected {len(selected)} modules:")
308
+ for module in selected:
309
+ print(f" - {module.name} ({module.module_type.value})")
310
+
311
+ # Verify API client selected
312
+ assert any(m.name == 'api_client' for m in selected), \
313
+ "Should use API client for API data source"
314
+
315
+ print("\nβœ“ API client selected for API task!")
316
+
317
+
318
+ async def test_module_execution():
319
+ """Test actually executing a selected module"""
320
+
321
+ print("\n" + "=" * 60)
322
+ print("Test 6: Module Execution")
323
+ print("=" * 60)
324
+
325
+ registry = ModuleRegistry()
326
+ registry.clear()
327
+ register_mock_modules(registry)
328
+
329
+ # Get a module
330
+ scraper = registry.get_module('static_scraper')
331
+
332
+ print(f"\nπŸ”§ Testing module: {scraper.name}")
333
+ print("-" * 60)
334
+
335
+ # Execute
336
+ parameters = {
337
+ 'url': 'https://example.com/test',
338
+ 'columns': ['name', 'price']
339
+ }
340
+
341
+ print(f"\nExecuting with parameters:")
342
+ print(f" URL: {parameters['url']}")
343
+ print(f" Columns: {parameters['columns']}")
344
+
345
+ result = await scraper.execute(parameters)
346
+
347
+ print(f"\nβœ… Execution Result:")
348
+ print(f" Success: {result.success}")
349
+ print(f" Execution Time: {result.execution_time}s")
350
+ print(f" Data rows: {len(result.data) if result.data else 0}")
351
+
352
+ if result.data:
353
+ print(f"\nπŸ“Š Sample Data:")
354
+ for item in result.data[:3]:
355
+ print(f" - {item}")
356
+
357
+
358
+ def run_all_tests():
359
+ """Run all registry tests"""
360
+
361
+ print("\n" + "=" * 80)
362
+ print(" " * 20 + "MODULE REGISTRY TEST SUITE")
363
+ print("=" * 80)
364
+
365
+ try:
366
+ # Synchronous tests
367
+ test_registry_registration()
368
+ test_simple_scraping_selection()
369
+ test_dynamic_scraping_selection()
370
+ test_complex_analysis_selection()
371
+ test_api_task_selection()
372
+
373
+ # Async test
374
+ asyncio.run(test_module_execution())
375
+
376
+ print("\n" + "=" * 80)
377
+ print(" " * 30 + "ALL TESTS PASSED")
378
+ print("=" * 80)
379
+ print("\nβœ… Module registry tests complete!")
380
+ print("\nπŸ“Š Summary:")
381
+ print(" βœ“ Module registration working")
382
+ print(" βœ“ Module selection logic working")
383
+ print(" βœ“ Different task types handled correctly")
384
+ print(" βœ“ Module execution working")
385
+
386
+ except AssertionError as e:
387
+ print(f"\n❌ Assertion failed: {e}")
388
+ logger.error("Test assertion failed", exc_info=True)
389
+ raise
390
+
391
+ except Exception as e:
392
+ print(f"\n❌ Test failed: {e}")
393
+ logger.error("Test suite failed", exc_info=True)
394
+ raise
395
+
396
+
397
+ if __name__ == "__main__":
398
+ run_all_tests()