feat: add scrape_do_integration

2026-06-28 21:01:55 +08:00 · 2024-09-06 16:51:48 +02:00 · 2024-09-06 16:51:48 +02:00 · 94e69a0515
commit 94e69a0515
parent 9e9c77551f
7 changed files with 85 additions and 6 deletions
--- a/examples/extras/browser_base_integration.py
+++ b/examples/extras/browser_base_integration.py
@ -18,7 +18,7 @@ load_dotenv()
 graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-4o",
+        "model": "openai/gpt-4o",
    },
    "browser_base": {
        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
--- a/examples/extras/scrape_do.py
+++ b/examples/extras/scrape_do.py
@ -0,0 +1,40 @@
+"""
+Example: run a SmartScraperGraph whose pages are fetched through scrape.do.
+"""
+
+import os
+import json
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+
+# Load environment variables (the API keys read below) before building the config.
+load_dotenv()
+
+# ************************************************
+# Graph configuration: LLM credentials plus the scrape.do API key
+# ************************************************
+
+llm_settings = {
+    "api_key": os.getenv("OPENAI_API_KEY"),
+    "model": "openai/gpt-4o",
+}
+scrape_do_settings = {"api_key": os.getenv("SCRAPE_DO_API_KEY")}
+
+graph_config = {
+    "llm": llm_settings,
+    "scrape_do": scrape_do_settings,
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Build the SmartScraperGraph, run it and print the answer as JSON
+# ************************************************
+
+scraper = SmartScraperGraph(
+    prompt="List me what does the company do, the name and a contact email.",
+    source="https://scrapegraphai.com/",
+    config=graph_config,
+)
+
+print(json.dumps(scraper.run(), indent=4))
--- a/scrapegraphai/docloaders/init.py
+++ b/scrapegraphai/docloaders/init.py
@ -2,3 +2,4 @@

 from .chromium import ChromiumLoader
 from .browser_base import browser_base_fetch
+from .scrape_do import scrape_do_fetch
--- a/scrapegraphai/docloaders/scrape_do.py
+++ b/scrapegraphai/docloaders/scrape_do.py
@ -0,0 +1,23 @@
+"""
+scrape_do module
+"""
+import urllib.parse
+import requests
+
+def scrape_do_fetch(token, target_url, timeout=120):
+    """
+    Fetch a web page through the scrape.do API and return the response body.
+
+    Args:
+        token (str): The API token for the scrape.do service.
+        target_url (str): The URL of the web page to fetch.
+        timeout (float): Seconds to wait for a response; requests raises on expiry.
+
+    Returns:
+        str: The body of the scrape.do response, decoded as text.
+    """
+    # HTTPS because the token rides in the query string; safe="" also escapes "/".
+    encoded_url = urllib.parse.quote(target_url, safe="")
+    url = f"https://api.scrape.do?token={token}&url={encoded_url}"
+    response = requests.get(url, timeout=timeout)  # without a timeout this can hang
+    return response.text
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@ -63,6 +63,7 @@ class AbstractGraph(ABC):
        self.loader_kwargs = self.config.get("loader_kwargs", {})
        self.cache_path = self.config.get("cache_path", False)
        self.browser_base = self.config.get("browser_base")
+        self.scrape_do = self.config.get("scrape_do")

        self.graph = self._create_graph()
        self.final_state = None
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -67,7 +67,8 @@ class SmartScraperGraph(AbstractGraph):
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
-                "browser_base": self.config.get("browser_base")
+                "browser_base": self.config.get("browser_base"),
+                "scrape_do": self.config.get("scrape_do")
            }
        )
        parse_node = ParseNode(
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -76,6 +76,10 @@ class FetchNode(BaseNode):
            None if node_config is None else node_config.get("browser_base", None)
        )

+        self.scrape_do = (
+            None if node_config is None else node_config.get("scrape_do", None)
+        )
+
    def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
@ -102,7 +106,7 @@ class FetchNode(BaseNode):

        source = input_data[0]
        input_type = input_keys[0]
-        
+
        handlers = {
            "json_dir": self.handle_directory,
            "xml_dir": self.handle_directory,
@ -271,11 +275,19 @@ class FetchNode(BaseNode):
                try:
                    from ..docloaders.browser_base import browser_base_fetch
                except ImportError:
-                    raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
+                    raise ImportError("The browserbase module is not installed. "
+                                      "Please install it using `pip install browserbase`.")

                data =  browser_base_fetch(self.browser_base.get("api_key"),
                                            self.browser_base.get("project_id"), [source])

+                document = [Document(page_content=content,
+                                    metadata={"source": source}) for content in data]
+            elif self.scrape_do is not None:
+                from ..docloaders.scrape_do import scrape_do_fetch
+                # Wrap the returned str: iterating it raw yields one Document per character.
+                data = [scrape_do_fetch(self.scrape_do.get("api_key"), source)]
+
                document = [Document(page_content=content,
                                    metadata={"source": source}) for content in data]
            else:
@ -283,7 +295,8 @@ class FetchNode(BaseNode):
                document = loader.load()

            if not document or not document[0].page_content.strip():
-                raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+                raise ValueError("No HTML body content found in "
+                                 "the document fetched by ChromiumLoader.")
            parsed_content = document[0].page_content

            if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI))  and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
@ -292,7 +305,7 @@ class FetchNode(BaseNode):
            compressed_document = [
                Document(page_content=parsed_content, metadata={"source": "html file"})
            ]
-        
+
        return self.update_state(state, compressed_document)

    def update_state(self, state, compressed_document):