feat: add scrape_do_integration

This commit is contained in:
Marco Vinciguerra 2024-09-06 16:51:48 +02:00
parent 9e9c77551f
commit 94e69a0515
7 changed files with 85 additions and 6 deletions

View File

@ -18,7 +18,7 @@ load_dotenv()
graph_config = {
"llm": {
"api_key": os.getenv("OPENAI_API_KEY"),
"model": "gpt-4o",
"model": "openai/gpt-4o",
},
"browser_base": {
"api_key": os.getenv("BROWSER_BASE_API_KEY"),

View File

@ -0,0 +1,40 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
load_dotenv()

# ------------------------------------------------
# Graph configuration: LLM credentials plus the scrape.do API key,
# both read from the environment populated by load_dotenv() above.
# ------------------------------------------------
llm_settings = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "model": "openai/gpt-4o",
}
scrape_do_settings = {"api_key": os.getenv("SCRAPE_DO_API_KEY")}

graph_config = {
    "llm": llm_settings,
    "scrape_do": scrape_do_settings,
    "verbose": True,
    "headless": False,
}

# ------------------------------------------------
# Build the SmartScraperGraph, run it and print the result as JSON
# ------------------------------------------------
smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

result = smart_scraper_graph.run()
formatted_result = json.dumps(result, indent=4)
print(formatted_result)

View File

@ -2,3 +2,4 @@
from .chromium import ChromiumLoader
from .browser_base import browser_base_fetch
from .scrape_do import scrape_do_fetch

View File

@ -0,0 +1,23 @@
"""
scrape_do module
"""
import urllib.parse
import requests
def scrape_do_fetch(token, target_url, timeout=120):
    """
    Fetch a web page through the scrape.do API and return the response body.

    The token and the target URL are percent-encoded and sent as the
    ``token`` and ``url`` query parameters of a single GET request.

    Args:
        token (str): The API token for the scrape.do service.
        target_url (str): The URL of the web page to fetch.
        timeout (float | None): Seconds to wait for the server before
            ``requests`` gives up. Pass ``None`` to wait indefinitely,
            which was the behaviour before this parameter existed.

    Returns:
        str: The text of the response. This is ONE string; wrap it
        (do not iterate over it) when a list of documents is needed,
        because iterating a string yields single characters.
        The HTTP status code is not checked, so an error body returned
        by the service is passed back unchanged.

    Raises:
        ValueError: If ``token`` is empty or ``None``.
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # Fail early with a clear message instead of sending "token=None".
    if not token:
        raise ValueError("A scrape.do API token is required.")

    # safe="" also escapes "/" so that the whole value is an opaque query
    # parameter and cannot be confused with the API's own path or query.
    encoded_token = urllib.parse.quote(str(token), safe="")
    encoded_url = urllib.parse.quote(target_url, safe="")

    # HTTPS keeps the API token out of clear-text traffic.
    url = f"https://api.scrape.do/?token={encoded_token}&url={encoded_url}"

    # requests applies no timeout by default; without one a stalled
    # connection would block the caller forever.
    response = requests.get(url, timeout=timeout)
    return response.text

View File

@ -63,6 +63,7 @@ class AbstractGraph(ABC):
self.loader_kwargs = self.config.get("loader_kwargs", {})
self.cache_path = self.config.get("cache_path", False)
self.browser_base = self.config.get("browser_base")
self.scrape_do = self.config.get("scrape_do")
self.graph = self._create_graph()
self.final_state = None

View File

@ -67,7 +67,8 @@ class SmartScraperGraph(AbstractGraph):
"force": self.config.get("force", False),
"cut": self.config.get("cut", True),
"loader_kwargs": self.config.get("loader_kwargs", {}),
"browser_base": self.config.get("browser_base")
"browser_base": self.config.get("browser_base"),
"scrape_do": self.config.get("scrape_do")
}
)
parse_node = ParseNode(

View File

@ -76,6 +76,10 @@ class FetchNode(BaseNode):
None if node_config is None else node_config.get("browser_base", None)
)
self.scrape_do = (
None if node_config is None else node_config.get("scrape_do", None)
)
def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
@ -102,7 +106,7 @@ class FetchNode(BaseNode):
source = input_data[0]
input_type = input_keys[0]
handlers = {
"json_dir": self.handle_directory,
"xml_dir": self.handle_directory,
@ -271,11 +275,19 @@ class FetchNode(BaseNode):
try:
from ..docloaders.browser_base import browser_base_fetch
except ImportError:
raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
raise ImportError("""The browserbase module is not installed.
Please install it using `pip install browserbase`.""")
data = browser_base_fetch(self.browser_base.get("api_key"),
self.browser_base.get("project_id"), [source])
document = [Document(page_content=content,
metadata={"source": source}) for content in data]
elif self.scrape_do is not None:
from ..docloaders.scrape_do import scrape_do_fetch
data = scrape_do_fetch(self.scrape_do.get("api_key"),
source)
document = [Document(page_content=content,
metadata={"source": source}) for content in data]
else:
@ -283,7 +295,8 @@ class FetchNode(BaseNode):
document = loader.load()
if not document or not document[0].page_content.strip():
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
raise ValueError("""No HTML body content found in
the document fetched by ChromiumLoader.""")
parsed_content = document[0].page_content
if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
@ -292,7 +305,7 @@ class FetchNode(BaseNode):
compressed_document = [
Document(page_content=parsed_content, metadata={"source": "html file"})
]
return self.update_state(state, compressed_document)
def update_state(self, state, compressed_document):