Merge pull request #368 from stevenmichaelthomas/wait-for-network-idle

Add the ability to specify load state
2026-06-23 21:00:30 +08:00 · 2024-06-11 19:59:23 +02:00 · 2024-06-11 19:59:23 +02:00 · fa951b4c8b
commit fa951b4c8b
parent e5bb5ae473 8f405ff87a
2 changed files with 6 additions and 0 deletions
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -29,6 +29,7 @@ class ChromiumLoader(BaseLoader):
        backend: str = "playwright",
        headless: bool = True,
        proxy: Optional[Proxy] = None,
+        load_state: str = "domcontentloaded",
        **kwargs: Any,
    ):
        """Initialize the loader with a list of URL paths.
@@ -55,6 +56,7 @@ class ChromiumLoader(BaseLoader):
        self.headless = headless
        self.proxy = parse_or_search_proxy(proxy) if proxy else None
        self.urls = urls
+        self.load_state = load_state

    async def ascrape_playwright(self, url: str) -> str:
        """
@@ -81,6 +83,7 @@ class ChromiumLoader(BaseLoader):
                await Malenia.apply_stealth(context)
                page = await context.new_page()
                await page.goto(url)
+                await page.wait_for_load_state(self.load_state)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@@ -83,6 +83,9 @@ class SearchLinkNode(BaseNode):
            
            Assume relevance broadly, including any links that might be related or potentially useful 
            in relation to the task.
+
+            Sort it in order of importance, the first one should be the most important one, the last one
+            the least important
            
            Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
            whether the content at the link is directly relevant.