feat: add dynamic rendering

This commit is contained in:
Marco Vinciguerra 2024-10-09 10:45:47 +02:00
parent 4f816f3b04
commit 88ba2310ac

View File

@ -1,5 +1,5 @@
"""
chromium module
"""
chromiumloader module
"""
import asyncio
from typing import Any, AsyncIterator, Iterator, List, Optional
@ -83,7 +83,7 @@ class ChromiumLoader(BaseLoader):
async with async_timeout.timeout(self.TIMEOUT):
driver = uc.Chrome(headless=self.headless)
driver.get(url)
results = driver.page_content
results = driver.page_source
logger.info(f"Successfully scraped {url}")
break
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
@ -137,6 +137,45 @@ class ChromiumLoader(BaseLoader):
return results
async def ascrape_with_js_support(self, url: str) -> str:
    """
    Asynchronously scrape a URL after letting Playwright render its JavaScript.

    Each attempt launches a fresh Chromium instance, navigates to ``url`` and
    waits for the ``networkidle`` state before reading the page HTML. Failed
    attempts are retried until ``self.RETRY_LIMIT`` attempts have been made.

    Args:
        url (str): The URL to scrape.

    Returns:
        str: The rendered HTML content on success, or a string starting with
        ``"Error: Network error after"`` once every attempt has failed.
        An empty string is returned if ``self.RETRY_LIMIT`` is not positive.
    """
    # Imported lazily so Playwright is only required when this method is used.
    from playwright.async_api import async_playwright

    logger.info(f"Starting scraping with JavaScript support for {url}...")
    results = ""
    attempt = 0

    while attempt < self.RETRY_LIMIT:
        try:
            async with async_playwright() as p:
                # Reset per attempt: the cleanup below must never touch an
                # unbound name or a browser left over from a previous attempt.
                browser = None
                try:
                    async with async_timeout.timeout(self.TIMEOUT):
                        browser = await p.chromium.launch(
                            headless=self.headless,
                            proxy=self.proxy,
                            **self.browser_config,
                        )
                        context = await browser.new_context()
                        page = await context.new_page()
                        await page.goto(url, wait_until="networkidle")
                        results = await page.content()
                        logger.info("Content scraped after JavaScript rendering")
                finally:
                    # Close while the Playwright driver is still running, and
                    # outside the timeout scope so cleanup is not cut short.
                    if browser is not None:
                        await browser.close()
            break
        except Exception as e:
            # ``Exception`` already covers aiohttp.ClientError and
            # asyncio.TimeoutError, which the original listed separately.
            attempt += 1
            logger.error(f"Attempt {attempt} failed: {e}")
            if attempt == self.RETRY_LIMIT:
                results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"

    return results
def lazy_load(self) -> Iterator[Document]:
"""
Lazily load text content from the provided URLs.