diff --git a/examples/extras/browser_base_integration.py b/examples/extras/browser_base_integration.py index 61363024..7030e101 100644 --- a/examples/extras/browser_base_integration.py +++ b/examples/extras/browser_base_integration.py @@ -18,7 +18,7 @@ load_dotenv() graph_config = { "llm": { "api_key": os.getenv("OPENAI_API_KEY"), - "model": "gpt-4o", + "model": "openai/gpt-4o", }, "browser_base": { "api_key": os.getenv("BROWSER_BASE_API_KEY"), diff --git a/examples/extras/scrape_do.py b/examples/extras/scrape_do.py new file mode 100644 index 00000000..7f173a99 --- /dev/null +++ b/examples/extras/scrape_do.py @@ -0,0 +1,40 @@ +""" +Basic example of a scraping pipeline using SmartScraperGraph with the scrape.do integration +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + }, + "scrape_do": { + "api_key": os.getenv("SCRAPE_DO_API_KEY"), + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 45a3783d..1010a6be 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -2,3 +2,4 @@ from .chromium import ChromiumLoader from .browser_base import browser_base_fetch +from .scrape_do import scrape_do_fetch diff --git 
a/scrapegraphai/docloaders/scrape_do.py b/scrapegraphai/docloaders/scrape_do.py new file mode 100644 index 00000000..5ed288a8 --- /dev/null +++ b/scrapegraphai/docloaders/scrape_do.py @@ -0,0 +1,23 @@ +""" +scrape_do module +""" +import urllib.parse +import requests + +def scrape_do_fetch(token, target_url): + """ + Fetch the content of a web page through the scrape.do API. + The target URL is percent-encoded and sent, with the token, as query parameters. + + Args: + token (str): The API token for scrape.do service. + target_url (str): A valid web page URL to fetch through scrape.do. + + Returns: + str: The body of the scrape.do response, decoded as text. + """ + + encoded_url = urllib.parse.quote(target_url) + url = f"https://api.scrape.do?token={token}&url={encoded_url}" + response = requests.request("GET", url) + return response.text diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index ae5cc496..357ff94d 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -63,6 +63,7 @@ class AbstractGraph(ABC): self.loader_kwargs = self.config.get("loader_kwargs", {}) self.cache_path = self.config.get("cache_path", False) self.browser_base = self.config.get("browser_base") + self.scrape_do = self.config.get("scrape_do") self.graph = self._create_graph() self.final_state = None diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 5fbba71d..045f31ac 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -67,7 +67,8 @@ class SmartScraperGraph(AbstractGraph): "force": self.config.get("force", False), "cut": self.config.get("cut", True), "loader_kwargs": self.config.get("loader_kwargs", {}), - "browser_base": self.config.get("browser_base") + "browser_base": self.config.get("browser_base"), + "scrape_do": self.config.get("scrape_do") } ) parse_node = ParseNode( diff --git 
a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index f015278d..d727d873 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -76,6 +76,10 @@ class FetchNode(BaseNode): None if node_config is None else node_config.get("browser_base", None) ) + self.scrape_do = ( + None if node_config is None else node_config.get("scrape_do", None) + ) + def execute(self, state): """ Executes the node's logic to fetch HTML content from a specified URL and @@ -102,7 +106,7 @@ class FetchNode(BaseNode): source = input_data[0] input_type = input_keys[0] - + handlers = { "json_dir": self.handle_directory, "xml_dir": self.handle_directory, @@ -271,11 +275,19 @@ class FetchNode(BaseNode): try: from ..docloaders.browser_base import browser_base_fetch except ImportError: - raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.") + raise ImportError("The browserbase module is not installed. " + "Please install it using `pip install browserbase`.") data = browser_base_fetch(self.browser_base.get("api_key"), self.browser_base.get("project_id"), [source]) + document = [Document(page_content=content, + metadata={"source": source}) for content in data] + elif self.scrape_do is not None: + from ..docloaders.scrape_do import scrape_do_fetch + data = scrape_do_fetch(self.scrape_do.get("api_key"), + source) + # scrape_do_fetch returns the whole page as one str: wrap it as a single Document instead of iterating it character by character - document = [Document(page_content=content, - metadata={"source": source}) for content in data] + document = [Document(page_content=data, + metadata={"source": source})] else: @@ -283,7 +295,8 @@ class FetchNode(BaseNode): document = loader.load() if not document or not document[0].page_content.strip(): - raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.") + raise ValueError("No HTML body content found in " + "the document fetched by ChromiumLoader.") parsed_content = document[0].page_content if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or 
self.force and not self.script_creator and not self.openai_md_enabled: @@ -292,7 +305,7 @@ class FetchNode(BaseNode): compressed_document = [ Document(page_content=parsed_content, metadata={"source": "html file"}) ] - + return self.update_state(state, compressed_document) def update_state(self, state, compressed_document):