mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
feat: add scrape_do_integration
This commit is contained in:
parent
9e9c77551f
commit
94e69a0515
@ -18,7 +18,7 @@ load_dotenv()
|
|||||||
graph_config = {
|
graph_config = {
|
||||||
"llm": {
|
"llm": {
|
||||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||||
"model": "gpt-4o",
|
"model": "openai/gpt-4o",
|
||||||
},
|
},
|
||||||
"browser_base": {
|
"browser_base": {
|
||||||
"api_key": os.getenv("BROWSER_BASE_API_KEY"),
|
"api_key": os.getenv("BROWSER_BASE_API_KEY"),
|
||||||
|
|||||||
40
examples/extras/scrape_do.py
Normal file
40
examples/extras/scrape_do.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
"""
|
||||||
|
Basic example of scraping pipeline using SmartScraper
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from scrapegraphai.graphs import SmartScraperGraph
|
||||||
|
|
||||||
|
load_dotenv()

# ------------------------------------------------
# Graph configuration: LLM settings plus the scrape.do API key,
# both read from environment variables loaded above.
# ------------------------------------------------

llm_settings = {
    "api_key": os.getenv("OPENAI_API_KEY"),
    "model": "openai/gpt-4o",
}
scrape_do_settings = {
    "api_key": os.getenv("SCRAPE_DO_API_KEY"),
}

graph_config = dict(
    llm=llm_settings,
    scrape_do=scrape_do_settings,
    verbose=True,
    headless=False,
)

# ------------------------------------------------
# Build the SmartScraperGraph, run it, and print the result as JSON.
# ------------------------------------------------

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))
|
||||||
@ -2,3 +2,4 @@
|
|||||||
|
|
||||||
from .chromium import ChromiumLoader
|
from .chromium import ChromiumLoader
|
||||||
from .browser_base import browser_base_fetch
|
from .browser_base import browser_base_fetch
|
||||||
|
from .scrape_do import scrape_do_fetch
|
||||||
|
|||||||
23
scrapegraphai/docloaders/scrape_do.py
Normal file
23
scrapegraphai/docloaders/scrape_do.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
"""
|
||||||
|
scrape_do module
|
||||||
|
"""
|
||||||
|
import urllib.parse
|
||||||
|
import requests
|
||||||
|
|
||||||
|
def scrape_do_fetch(token, target_url, timeout=120):
    """
    Fetch a web page through the scrape.do API and return the response body.

    The token and the target URL are percent-encoded and sent as the
    ``token`` and ``url`` query parameters of a GET request to the
    scrape.do endpoint.

    Args:
        token (str): The API token for the scrape.do service.
        target_url (str): The URL of the web page to fetch.
        timeout (float | tuple | None): Seconds to wait for the request
            before ``requests`` raises a timeout error. Pass ``None`` to
            wait indefinitely.

    Returns:
        str: The text of the scrape.do response. The HTTP status code is
        not checked, so error responses are returned as-is.
    """
    # Encode both parameters so that characters such as "&", "=" or "#"
    # in either value cannot corrupt the query string.
    query = urllib.parse.urlencode(
        {"token": token, "url": target_url},
        quote_via=urllib.parse.quote,
    )
    # NOTE(review): the endpoint is plain HTTP, so the token travels
    # unencrypted -- confirm whether an HTTPS endpoint should be used.
    url = f"http://api.scrape.do?{query}"
    # A bounded timeout keeps a stalled connection from blocking forever.
    response = requests.get(url, timeout=timeout)
    return response.text
|
||||||
@ -63,6 +63,7 @@ class AbstractGraph(ABC):
|
|||||||
self.loader_kwargs = self.config.get("loader_kwargs", {})
|
self.loader_kwargs = self.config.get("loader_kwargs", {})
|
||||||
self.cache_path = self.config.get("cache_path", False)
|
self.cache_path = self.config.get("cache_path", False)
|
||||||
self.browser_base = self.config.get("browser_base")
|
self.browser_base = self.config.get("browser_base")
|
||||||
|
self.scrape_do = self.config.get("scrape_do")
|
||||||
|
|
||||||
self.graph = self._create_graph()
|
self.graph = self._create_graph()
|
||||||
self.final_state = None
|
self.final_state = None
|
||||||
|
|||||||
@ -67,7 +67,8 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
"force": self.config.get("force", False),
|
"force": self.config.get("force", False),
|
||||||
"cut": self.config.get("cut", True),
|
"cut": self.config.get("cut", True),
|
||||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||||
"browser_base": self.config.get("browser_base")
|
"browser_base": self.config.get("browser_base"),
|
||||||
|
"scrape_do": self.config.get("scrape_do")
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
|
|||||||
@ -76,6 +76,10 @@ class FetchNode(BaseNode):
|
|||||||
None if node_config is None else node_config.get("browser_base", None)
|
None if node_config is None else node_config.get("browser_base", None)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.scrape_do = (
|
||||||
|
None if node_config is None else node_config.get("scrape_do", None)
|
||||||
|
)
|
||||||
|
|
||||||
def execute(self, state):
|
def execute(self, state):
|
||||||
"""
|
"""
|
||||||
Executes the node's logic to fetch HTML content from a specified URL and
|
Executes the node's logic to fetch HTML content from a specified URL and
|
||||||
@ -102,7 +106,7 @@ class FetchNode(BaseNode):
|
|||||||
|
|
||||||
source = input_data[0]
|
source = input_data[0]
|
||||||
input_type = input_keys[0]
|
input_type = input_keys[0]
|
||||||
|
|
||||||
handlers = {
|
handlers = {
|
||||||
"json_dir": self.handle_directory,
|
"json_dir": self.handle_directory,
|
||||||
"xml_dir": self.handle_directory,
|
"xml_dir": self.handle_directory,
|
||||||
@ -271,11 +275,19 @@ class FetchNode(BaseNode):
|
|||||||
try:
|
try:
|
||||||
from ..docloaders.browser_base import browser_base_fetch
|
from ..docloaders.browser_base import browser_base_fetch
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
|
raise ImportError("""The browserbase module is not installed.
|
||||||
|
Please install it using `pip install browserbase`.""")
|
||||||
|
|
||||||
data = browser_base_fetch(self.browser_base.get("api_key"),
|
data = browser_base_fetch(self.browser_base.get("api_key"),
|
||||||
self.browser_base.get("project_id"), [source])
|
self.browser_base.get("project_id"), [source])
|
||||||
|
|
||||||
|
document = [Document(page_content=content,
|
||||||
|
metadata={"source": source}) for content in data]
|
||||||
|
elif self.scrape_do is not None:
|
||||||
|
from ..docloaders.scrape_do import scrape_do_fetch
|
||||||
|
data = scrape_do_fetch(self.scrape_do.get("api_key"),
|
||||||
|
source)
|
||||||
|
|
||||||
document = [Document(page_content=content,
|
document = [Document(page_content=content,
|
||||||
metadata={"source": source}) for content in data]
|
metadata={"source": source}) for content in data]
|
||||||
else:
|
else:
|
||||||
@ -283,7 +295,8 @@ class FetchNode(BaseNode):
|
|||||||
document = loader.load()
|
document = loader.load()
|
||||||
|
|
||||||
if not document or not document[0].page_content.strip():
|
if not document or not document[0].page_content.strip():
|
||||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
raise ValueError("""No HTML body content found in
|
||||||
|
the document fetched by ChromiumLoader.""")
|
||||||
parsed_content = document[0].page_content
|
parsed_content = document[0].page_content
|
||||||
|
|
||||||
if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
||||||
@ -292,7 +305,7 @@ class FetchNode(BaseNode):
|
|||||||
compressed_document = [
|
compressed_document = [
|
||||||
Document(page_content=parsed_content, metadata={"source": "html file"})
|
Document(page_content=parsed_content, metadata={"source": "html file"})
|
||||||
]
|
]
|
||||||
|
|
||||||
return self.update_state(state, compressed_document)
|
return self.update_state(state, compressed_document)
|
||||||
|
|
||||||
def update_state(self, state, compressed_document):
|
def update_state(self, state, compressed_document):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user