| """ | |
| Test Dynamic Web Scraper | |
| Comprehensive tests for Playwright scraper | |
| """ | |
| import sys | |
| import os | |
| # Add project root to Python path | |
| project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) | |
| sys.path.insert(0, project_root) | |
| import pytest | |
| pytest_plugins = ('pytest_asyncio',) | |
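# NOTE (assumption): with pytest-asyncio in its default "strict" mode, bare
# `async def` tests are not collected; this file presumes either
# `asyncio_mode = auto` in the pytest config or explicit @pytest.mark.asyncio
# markers when run via pytest. Standalone execution via `python <this file>`
# does not depend on pytest at all.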
| """ | |
| Test Dynamic Web Scraper | |
| Comprehensive tests for Playwright-based scraper | |
| """ | |
| """ | |
| Test Dynamic Web Scraper | |
| Comprehensive tests for Playwright scraper | |
| No skipping - all tests must pass | |
| """ | |
import asyncio

from app.modules.scrapers.dynamic_scraper import DynamicScraper
from app.modules.scrapers.browser_config import BrowserConfig, BrowserPresets, get_browser_config
from app.core.logging import setup_logging, get_logger

setup_logging()
logger = get_logger(__name__)
async def test_playwright_installation():
    """Test that Playwright is properly installed"""
    print("\n" + "=" * 60)
    print("Test 0: Playwright Installation Check")
    print("=" * 60)

    try:
        from playwright.async_api import async_playwright
        print("✅ Playwright Python package: Installed")

        # Try to start Playwright
        async with async_playwright() as p:
            print("✅ Playwright started successfully")

            # Check browser availability
            try:
                browser = await p.chromium.launch(headless=True)
                print("✅ Chromium browser: Available")
                await browser.close()
            except Exception as e:
                print("❌ Chromium browser: Not installed")
                print(f"   Error: {e}")
                print("\n🔧 Fix: Run 'playwright install chromium'")
                raise

        print("\n✅ Playwright setup verified")
        return True
    except ImportError as e:
        print(f"❌ Playwright not installed: {e}")
        print("\n🔧 Fix: Run 'pip install playwright'")
        raise
    except Exception as e:
        print(f"❌ Playwright setup error: {e}")
        raise
async def test_scraper_initialization():
    """Test scraper initialization"""
    print("\n" + "=" * 60)
    print("Test 1: Scraper Initialization")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)

    print(f"  Name: {scraper.name}")
    print(f"  Config: {scraper.config is not None}")
    print(f"  Headless: {scraper.config.headless}")
    print(f"  Timeout: {scraper.config.timeout}ms")
    print(f"  Browser manager: {scraper.browser_manager}")

    assert scraper.name == "dynamic_scraper", "Wrong scraper name"
    assert scraper.config is not None, "Config not initialized"
    assert scraper.browser_manager is None, "Browser manager should be None before initialize()"

    print("\n✅ Scraper initialized successfully")
async def test_browser_startup():
    """Test browser manager initialization"""
    print("\n" + "=" * 60)
    print("Test 2: Browser Startup")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)

    print("  Initializing browser...")
    await scraper.initialize()

    print(f"  Browser manager: {scraper.browser_manager}")
    print(f"  Is initialized: {scraper.browser_manager.is_initialized()}")
    print(f"  Browser type: {scraper.config.browser_type.value}")

    assert scraper.browser_manager is not None, "Browser manager not created"
    assert scraper.browser_manager.is_initialized(), "Browser not initialized"

    print("\n✅ Browser started successfully")

    # Cleanup
    await scraper.cleanup()
    print("  Browser closed")
async def test_scrape_example_site():
    """Test scraping example.com with a real browser"""
    print("\n" + "=" * 60)
    print("Test 3: Scrape Example.com")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)
    url = "https://example.com"

    print(f"  Scraping: {url}")
    result = await scraper.scrape_url(url)

    print("\n  Result:")
    print(f"    Success: {result.success}")
    print(f"    Status code: {result.status_code}")
    print(f"    Rows extracted: {result.rows_extracted}")
    print(f"    Columns: {result.columns_extracted}")
    print(f"    Response time: {result.response_time:.2f}s")

    if not result.success:
        print(f"\n  ❌ Error: {result.error}")
        print(f"  Warnings: {result.warnings}")
        raise AssertionError(f"Scraping failed: {result.error}")

    assert result.status_code == 200, f"Expected 200, got {result.status_code}"
    assert result.rows_extracted > 0, "Should extract at least one row"

    print("\n  Sample data (first 2 rows):")
    for i, row in enumerate(result.data[:2], 1):
        print(f"    {i}. {row}")

    print("\n✅ Scraping successful")
    await scraper.cleanup()
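
# Network-bound tests like the one above can fail transiently. A small retry
# wrapper is one way to harden them; this sketch uses only asyncio and the
# scrape_url()/.success API already exercised in this file:
async def scrape_with_retry(scraper, url, attempts=3, delay=1.0):
    """Retry scrape_url() up to `attempts` times, sleeping `delay`s between tries."""
    result = None
    for _ in range(attempts):
        result = await scraper.scrape_url(url)
        if result.success:
            break
        await asyncio.sleep(delay)
    return result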
async def test_scrape_with_wait():
    """Test scraping with a wait-for selector"""
    print("\n" + "=" * 60)
    print("Test 4: Scrape with Wait Selector")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)
    url = "https://example.com"
    wait_for = "h1"

    print(f"  Scraping: {url}")
    print(f"  Waiting for: {wait_for}")

    result = await scraper.scrape_url(
        url=url,
        wait_for=wait_for
    )

    print("\n  Result:")
    print(f"    Success: {result.success}")
    print(f"    Rows: {result.rows_extracted}")

    assert result.success, "Should succeed with wait_for"
    assert result.rows_extracted > 0, "Should extract data"

    print("\n✅ Wait selector working")
    await scraper.cleanup()
async def test_scrape_with_execute():
    """Test scraping via the execute() module interface"""
    print("\n" + "=" * 60)
    print("Test 5: Execute Method (Module Interface)")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)
    parameters = {
        'url': 'https://example.com',
        'screenshot': False,
        'scroll': False
    }

    print(f"  Parameters: {parameters}")
    result = await scraper.execute(parameters)

    print("\n  Result:")
    print(f"    Success: {result.success}")
    print(f"    Execution time: {result.execution_time:.2f}s")
    print(f"    Data rows: {len(result.data) if result.data else 0}")
    print(f"    Metadata: {result.metadata}")

    assert result.success, "Execute should succeed"
    assert result.data is not None, "Should return data"
    assert 'browser' in result.metadata, "Should include browser metadata"
    assert result.metadata['javascript'] is True, "Should indicate JavaScript rendering"

    if result.data:
        print("\n  Sample data:")
        for row in result.data[:2]:
            print(f"    {row}")

    print("\n✅ Execute method working")
    await scraper.cleanup()
async def test_browser_configs():
    """Test the different browser configuration presets"""
    print("\n" + "=" * 60)
    print("Test 6: Browser Configurations")
    print("=" * 60)

    configs = {
        'optimized': get_browser_config('optimized'),
        'fast': get_browser_config('fast'),
        'mobile': get_browser_config('mobile'),
        'desktop': get_browser_config('desktop')
    }

    for name, config in configs.items():
        print(f"\n  Config: {name}")
        print(f"    Headless: {config.headless}")
        print(f"    Viewport: {config.viewport_width}x{config.viewport_height}")
        print(f"    Timeout: {config.timeout}ms")
        print(f"    Is mobile: {config.is_mobile}")

        scraper = DynamicScraper(config=config, use_pool=False)
        assert scraper.config.headless == config.headless
        assert scraper.config.viewport_width == config.viewport_width

    print("\n✅ All configs loaded correctly")
async def test_mobile_emulation():
    """Test mobile device emulation"""
    print("\n" + "=" * 60)
    print("Test 7: Mobile Emulation")
    print("=" * 60)

    config = get_browser_config('mobile')
    scraper = DynamicScraper(config=config, use_pool=False)

    print(f"  Viewport: {config.viewport_width}x{config.viewport_height}")
    print(f"  Is mobile: {config.is_mobile}")
    print(f"  Has touch: {config.has_touch}")

    result = await scraper.scrape_url("https://example.com")

    print("\n  Result:")
    print(f"    Success: {result.success}")
    print(f"    Rows: {result.rows_extracted}")

    assert result.success, "Mobile scraping should succeed"

    print("\n✅ Mobile emulation working")
    await scraper.cleanup()
async def test_multiple_urls():
    """Test scraping multiple URLs in a batch"""
    print("\n" + "=" * 60)
    print("Test 8: Multiple URLs (Batch Scraping)")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)
    urls = [
        "https://example.com",
        "https://example.org"
    ]

    print(f"  Scraping {len(urls)} URLs...")
    results = await scraper.scrape_multiple(urls)

    print("\n  Results:")
    for i, result in enumerate(results, 1):
        print(f"    {i}. {urls[i - 1]}")
        print(f"       Success: {result.success}")
        print(f"       Rows: {result.rows_extracted}")
        print(f"       Status: {result.status_code}")

    assert len(results) == len(urls), "Should return one result per URL"
    assert all(r.success for r in results), "All scrapes should succeed"

    print("\n✅ Multiple URLs working")
    await scraper.cleanup()
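
# scrape_multiple() is the batch entry point used above. If concurrent
# fetching is wanted and the underlying browser manager supports parallel
# pages (an assumption, not a documented guarantee of this API), an
# asyncio.gather sketch over scrape_url() yields the same result shape:
#
#     results = await asyncio.gather(*(scraper.scrape_url(u) for u in urls))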
async def test_error_handling():
    """Test error handling with an unresolvable URL"""
    print("\n" + "=" * 60)
    print("Test 9: Error Handling")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)

    # A domain that should not resolve
    invalid_url = "https://this-domain-definitely-does-not-exist-12345.com"
    print(f"  Testing with invalid URL: {invalid_url}")

    result = await scraper.scrape_url(invalid_url)

    print("\n  Result:")
    print(f"    Success: {result.success}")
    print(f"    Error: {result.error}")

    assert not result.success, "Should fail for an invalid URL"
    assert result.error is not None, "Should have an error message"

    print("\n✅ Error handling working")
    await scraper.cleanup()
async def test_capabilities():
    """Test the module capabilities report"""
    print("\n" + "=" * 60)
    print("Test 10: Module Capabilities")
    print("=" * 60)

    scraper = DynamicScraper(use_pool=False)
    capabilities = scraper.get_capabilities()

    print(f"  Capabilities: {capabilities}")

    assert capabilities is not None, "Should return capabilities"

    print("\n✅ Capabilities working")
async def run_all_tests():
    """Run all tests and print a summary"""
    print("\n" + "=" * 80)
    print(" " * 15 + "DYNAMIC SCRAPER TEST SUITE (PRODUCTION)")
    print("=" * 80)

    tests = [
        ("Playwright Installation", test_playwright_installation),
        ("Scraper Initialization", test_scraper_initialization),
        ("Browser Startup", test_browser_startup),
        ("Scrape Example Site", test_scrape_example_site),
        ("Wait for Selector", test_scrape_with_wait),
        ("Execute Method", test_scrape_with_execute),
        ("Browser Configs", test_browser_configs),
        ("Mobile Emulation", test_mobile_emulation),
        ("Multiple URLs", test_multiple_urls),
        ("Error Handling", test_error_handling),
        ("Module Capabilities", test_capabilities),
    ]

    passed = 0
    failed = 0
    errors = []

    for name, test_func in tests:
        try:
            await test_func()
            passed += 1
        except AssertionError as e:
            failed += 1
            errors.append(f"{name}: {e}")
            print(f"\n❌ TEST FAILED: {name}")
            print(f"   Error: {e}")
        except Exception as e:
            failed += 1
            errors.append(f"{name}: {e}")
            print(f"\n❌ TEST ERROR: {name}")
            print(f"   Error: {e}")
            import traceback
            traceback.print_exc()

    # Summary
    print("\n" + "=" * 80)
    print(" " * 30 + "TEST SUMMARY")
    print("=" * 80)
    print(f"\n  Total Tests: {len(tests)}")
    print(f"  ✅ Passed: {passed}")
    print(f"  ❌ Failed: {failed}")

    if errors:
        print("\n  Failed Tests:")
        for error in errors:
            print(f"    - {error}")

    if failed == 0:
        print("\n" + "=" * 80)
        print(" " * 25 + "🎉 ALL TESTS PASSED 🎉")
        print("=" * 80)
        print("\n✅ Dynamic scraper is production ready!")
        print("\n📋 Verified:")
        print("  ✅ Playwright installation")
        print("  ✅ Browser startup and shutdown")
        print("  ✅ Page navigation")
        print("  ✅ JavaScript execution")
        print("  ✅ Data extraction")
        print("  ✅ Mobile emulation")
        print("  ✅ Batch scraping")
        print("  ✅ Error handling")
        print("  ✅ Module interface")
    else:
        print("\n" + "=" * 80)
        print(" " * 25 + "❌ SOME TESTS FAILED ❌")
        print("=" * 80)
        raise AssertionError(f"{failed} test(s) failed")
if __name__ == "__main__":
    asyncio.run(run_all_tests())
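
# To run under pytest instead of standalone execution (assuming pytest-asyncio
# is configured as noted near the top of this file), something like:
#
#     pytest path/to/this/test/file.py -v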