feat(fetch): added playwright support

2026-06-23 21:00:30 +08:00 · 2024-04-30 04:02:58 +02:00 · 2024-04-30 04:02:58 +02:00 · 42ab0aa1d2
commit 42ab0aa1d2
parent 450291f52e
8 changed files with 28 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of pypy:
 ```bash
 pip install scrapegraphai
 ```
+You will also need to install the Playwright browser binaries for JavaScript-based scraping:
+```bash
+playwright install
+```
 ## 🔍 Demo
 Official streamlit demo:

--- a/examples/mixed_models/smart_scraper_mixed.py
+++ b/examples/mixed_models/smart_scraper_mixed.py
@ -24,7 +24,8 @@ graph_config = {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
-    }
+    },
+    "headless": False
 }

 # ************************************************
--- a/pyproject.toml
+++ b/pyproject.toml
@ -39,6 +39,7 @@ google = "3.0.0"
 minify-html = "0.15.0"
 free-proxy = "1.1.1"
 langchain-groq = "0.1.3"
+playwright = "^1.43.0"

 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"
--- a/requirements.txt
+++ b/requirements.txt
@ -13,3 +13,4 @@ google==3.0.0
 minify-html==0.15.0
 free-proxy==1.1.1
 langchain-groq==0.1.3
+playwright==1.43.0
--- a/scrapegraphai/graphs/search_graph.py
+++ b/scrapegraphai/graphs/search_graph.py
@ -29,6 +29,7 @@ class SearchGraph(AbstractGraph):
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
+            node_config={"headless": True if self.config is None else self.config.get("headless", True)}
        )
        parse_node = ParseNode(
            input="doc",
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -25,6 +25,7 @@ class SmartScraperGraph(AbstractGraph):

        self.input_key = "url" if source.startswith("http") else "local_dir"

+
    def _create_graph(self):
        """
        Creates the graph of nodes representing the workflow for web scraping.
@ -32,6 +33,7 @@ class SmartScraperGraph(AbstractGraph):
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
+            node_config={"headless": True if self.config is None else self.config.get("headless", True)}
        )
        parse_node = ParseNode(
            input="doc",
--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@ -35,6 +35,7 @@ class SpeechGraph(AbstractGraph):
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
+            node_config={"headless": True if self.config is None else self.config.get("headless", True)}
        )
        parse_node = ParseNode(
            input="doc",
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -2,8 +2,8 @@
 Module for fetching the HTML node
 """

-from typing import List
-from langchain_community.document_loaders import AsyncHtmlLoader
+from typing import List, Optional
+from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.remover import remover
@ -37,7 +37,7 @@ class FetchNode(BaseNode):
                        to succeed.
    """

-    def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
+    def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
        """
        Initializes the FetchHTMLNode with a node name and node type.
        Arguments:
@ -46,6 +46,8 @@ class FetchNode(BaseNode):
        """
        super().__init__(node_name, "node", input, output, 1)

+        self.headless = True if node_config is None else node_config.get("headless", True)
+
    def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
@ -79,14 +81,21 @@ class FetchNode(BaseNode):

        else:
            if self.node_config is not None and self.node_config.get("endpoint") is not None:
-                loader = AsyncHtmlLoader(
-                    source, proxies={"http": self.node_config["endpoint"]})
+                
+                loader = AsyncChromiumLoader(
+                    [source],
+                    proxies={"http": self.node_config["endpoint"]},
+                    headless=self.headless,
+                )
            else:
-                loader = AsyncHtmlLoader(source)
+                loader = AsyncChromiumLoader(
+                    [source],
+                    headless=self.headless,
+                )

            document = loader.load()
            compressed_document = [
-                Document(page_content=remover(str(document)))]
+                Document(page_content=remover(str(document[0].page_content)))]

        state.update({self.output[0]: compressed_document})
        return state