mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge pull request #368 from stevenmichaelthomas/wait-for-network-idle
Add the ability to specify load state
This commit is contained in:
commit
fa951b4c8b
@ -29,6 +29,7 @@ class ChromiumLoader(BaseLoader):
|
||||
backend: str = "playwright",
|
||||
headless: bool = True,
|
||||
proxy: Optional[Proxy] = None,
|
||||
load_state: str = "domcontentloaded",
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Initialize the loader with a list of URL paths.
|
||||
@ -55,6 +56,7 @@ class ChromiumLoader(BaseLoader):
|
||||
self.headless = headless
|
||||
self.proxy = parse_or_search_proxy(proxy) if proxy else None
|
||||
self.urls = urls
|
||||
self.load_state = load_state
|
||||
|
||||
async def ascrape_playwright(self, url: str) -> str:
|
||||
"""
|
||||
@ -81,6 +83,7 @@ class ChromiumLoader(BaseLoader):
|
||||
await Malenia.apply_stealth(context)
|
||||
page = await context.new_page()
|
||||
await page.goto(url)
|
||||
await page.wait_for_load_state(self.load_state)
|
||||
results = await page.content() # Simply get the HTML content
|
||||
logger.info("Content scraped")
|
||||
except Exception as e:
|
||||
|
||||
@ -83,6 +83,9 @@ class SearchLinkNode(BaseNode):
|
||||
|
||||
Assume relevance broadly, including any links that might be related or potentially useful
|
||||
in relation to the task.
|
||||
|
||||
Sort it in order of importance, the first one should be the most important one, the last one
|
||||
the least important
|
||||
|
||||
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
|
||||
whether the content at the link is directly relevant.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user