feat: add scrape_do_integration

2026-06-28 21:01:55 +08:00 · 2024-09-06 16:51:48 +02:00 · 2024-09-06 16:51:48 +02:00 · 94e69a0515
commit 94e69a0515
parent 9e9c77551f
7 changed files with 85 additions and 6 deletions
--- a/examples/extras/browser_base_integration.py
+++ b/examples/extras/browser_base_integration.py
@ -18,7 +18,7 @@ load_dotenv()
 graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-4o",
+        "model": "openai/gpt-4o",
    },
    "browser_base": {
        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
--- a/examples/extras/scrape_do.py
+++ b/examples/extras/scrape_do.py
@ -0,0 +1,40 @@
 """ 
 Basic example of scraping pipeline using SmartScraper
 """
 import os
 import json
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 # Load environment variables; the API keys below are read from them.
 load_dotenv()
 # ------------------------------------------------
 # Graph configuration: LLM settings plus scrape.do credentials
 # ------------------------------------------------
 llm_settings = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "model": "openai/gpt-4o",
 }
 scrape_do_settings = {
    "api_key": os.getenv("SCRAPE_DO_API_KEY"),
 }
 graph_config = {
    "llm": llm_settings,
    "scrape_do": scrape_do_settings,
    "verbose": True,
    "headless": False,
 }
 # ------------------------------------------------
 # Build the SmartScraperGraph, run it and print the result as JSON
 # ------------------------------------------------
 smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
 )
 result = smart_scraper_graph.run()
 print(json.dumps(result, indent=4))
--- a/scrapegraphai/docloaders/init.py
+++ b/scrapegraphai/docloaders/init.py
@ -2,3 +2,4 @@
 from .chromium import ChromiumLoader
 from .browser_base import browser_base_fetch
 from .scrape_do import scrape_do_fetch
--- a/scrapegraphai/docloaders/scrape_do.py
+++ b/scrapegraphai/docloaders/scrape_do.py
@ -0,0 +1,23 @@
 """
 scrape_do module
 """
 import urllib.parse
 import requests
 def scrape_do_fetch(token, target_url, timeout=120):
    """
    Fetch a web page through the scrape.do API and return the response body.
    The target URL is percent-encoded and sent, together with the token,
    as query parameters of a GET request to api.scrape.do.
    Args:
        token (str): The API token for the scrape.do service.
        target_url (str): The URL of the web page to fetch.
        timeout (float | tuple | None): Timeout in seconds passed to
            `requests`; None waits indefinitely. Defaults to 120.
    Returns:
        str: The body of the scrape.do response as text. The HTTP status
            is not checked, so an error response is returned as text too.
            The value is one string, not a list of pages: iterating over
            it yields single characters.
    Raises:
        requests.exceptions.RequestException: If the request fails or
            the timeout expires.
    """
    # Percent-encode the target so its own "?", "&" and "=" characters
    # are not parsed as parameters of the scrape.do request.
    encoded_url = urllib.parse.quote(target_url)
    url = f"http://api.scrape.do?token={token}&url={encoded_url}"
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=timeout)
    return response.text
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@ -63,6 +63,7 @@ class AbstractGraph(ABC):
        self.loader_kwargs = self.config.get("loader_kwargs", {})
        self.cache_path = self.config.get("cache_path", False)
        self.browser_base = self.config.get("browser_base")
        self.scrape_do = self.config.get("scrape_do")
        self.graph = self._create_graph()
        self.final_state = None
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -67,7 +67,8 @@ class SmartScraperGraph(AbstractGraph):
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
-                "browser_base": self.config.get("browser_base")
+                "browser_base": self.config.get("browser_base"),
                "scrape_do": self.config.get("scrape_do")
            }
        )
        parse_node = ParseNode(
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -76,6 +76,10 @@ class FetchNode(BaseNode):
            None if node_config is None else node_config.get("browser_base", None)
        )
        self.scrape_do = (
            None if node_config is None else node_config.get("scrape_do", None)
        )
    def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
@ -102,7 +106,7 @@ class FetchNode(BaseNode):
        source = input_data[0]
        input_type = input_keys[0]
-        
+
        handlers = {
            "json_dir": self.handle_directory,
            "xml_dir": self.handle_directory,
@ -271,11 +275,19 @@ class FetchNode(BaseNode):
                try:
                    from ..docloaders.browser_base import browser_base_fetch
                except ImportError:
-                    raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
+                    raise ImportError("""The browserbase module is not installed. 
                                      Please install it using `pip install browserbase`.""")
                data =  browser_base_fetch(self.browser_base.get("api_key"),
                                            self.browser_base.get("project_id"), [source])
                document = [Document(page_content=content,
                                    metadata={"source": source}) for content in data]
            elif self.scrape_do is not None:
                from ..docloaders.scrape_do import scrape_do_fetch
                data =  scrape_do_fetch(self.scrape_do.get("api_key"),
                                            source)
                document = [Document(page_content=content,
                                    metadata={"source": source}) for content in data]
            else:
@ -283,7 +295,8 @@ class FetchNode(BaseNode):
                document = loader.load()
            if not document or not document[0].page_content.strip():
-                raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+                raise ValueError("""No HTML body content found in 
                                 the document fetched by ChromiumLoader.""")
            parsed_content = document[0].page_content
            if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI))  and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
@ -292,7 +305,7 @@ class FetchNode(BaseNode):
            compressed_document = [
                Document(page_content=parsed_content, metadata={"source": "html file"})
            ]
-        
+
        return self.update_state(state, compressed_document)
    def update_state(self, state, compressed_document):