mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat:add dynamic rendering
This commit is contained in:
parent
4f816f3b04
commit
88ba2310ac
@ -1,5 +1,5 @@
|
||||
""""
|
||||
chromium module
|
||||
"""
|
||||
chromiumloader module
|
||||
"""
|
||||
import asyncio
|
||||
from typing import Any, AsyncIterator, Iterator, List, Optional
|
||||
@ -83,7 +83,7 @@ class ChromiumLoader(BaseLoader):
|
||||
async with async_timeout.timeout(self.TIMEOUT):
|
||||
driver = uc.Chrome(headless=self.headless)
|
||||
driver.get(url)
|
||||
results = driver.page_content
|
||||
results = driver.page_source
|
||||
logger.info(f"Successfully scraped {url}")
|
||||
break
|
||||
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
||||
@ -137,6 +137,45 @@ class ChromiumLoader(BaseLoader):
|
||||
|
||||
return results
|
||||
|
||||
async def ascrape_with_js_support(self, url: str) -> str:
|
||||
"""
|
||||
Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
|
||||
|
||||
Args:
|
||||
url (str): The URL to scrape.
|
||||
|
||||
Returns:
|
||||
str: The fully rendered HTML content after JavaScript execution,
|
||||
or an error message if an exception occurs.
|
||||
"""
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
logger.info(f"Starting scraping with JavaScript support for {url}...")
|
||||
results = ""
|
||||
attempt = 0
|
||||
|
||||
while attempt < self.RETRY_LIMIT:
|
||||
try:
|
||||
async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
|
||||
browser = await p.chromium.launch(
|
||||
headless=self.headless, proxy=self.proxy, **self.browser_config
|
||||
)
|
||||
context = await browser.new_context()
|
||||
page = await context.new_page()
|
||||
await page.goto(url, wait_until="networkidle")
|
||||
results = await page.content()
|
||||
logger.info("Content scraped after JavaScript rendering")
|
||||
break
|
||||
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
|
||||
attempt += 1
|
||||
logger.error(f"Attempt {attempt} failed: {e}")
|
||||
if attempt == self.RETRY_LIMIT:
|
||||
results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
return results
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
"""
|
||||
Lazily load text content from the provided URLs.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user