mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
feat: add scrape_do_integration
This commit is contained in:
parent
9e9c77551f
commit
94e69a0515
@ -18,7 +18,7 @@ load_dotenv()
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
"model": "gpt-4o",
|
||||
"model": "openai/gpt-4o",
|
||||
},
|
||||
"browser_base": {
|
||||
"api_key": os.getenv("BROWSER_BASE_API_KEY"),
|
||||
|
||||
40
examples/extras/scrape_do.py
Normal file
40
examples/extras/scrape_do.py
Normal file
@ -0,0 +1,40 @@
|
||||
"""
|
||||
Basic example of a scraping pipeline using SmartScraperGraph with scrape.do configured to fetch the page
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
|
||||
# Load environment variables before the API keys are read below
load_dotenv()

# ------------------------------------------------
# Graph configuration: LLM settings plus the scrape.do credentials
# ------------------------------------------------

graph_config = dict(
    llm=dict(
        api_key=os.getenv("OPENAI_API_KEY"),
        model="openai/gpt-4o",
    ),
    scrape_do=dict(
        api_key=os.getenv("SCRAPE_DO_API_KEY"),
    ),
    verbose=True,
    headless=False,
)

# ------------------------------------------------
# Build the SmartScraperGraph, run it, and print the answer as JSON
# ------------------------------------------------

smart_scraper_graph = SmartScraperGraph(
    config=graph_config,
    source="https://scrapegraphai.com/",
    prompt="List me what does the company do, the name and a contact email.",
)

result = smart_scraper_graph.run()
formatted_result = json.dumps(result, indent=4)
print(formatted_result)
|
||||
@ -2,3 +2,4 @@
|
||||
|
||||
from .chromium import ChromiumLoader
|
||||
from .browser_base import browser_base_fetch
|
||||
from .scrape_do import scrape_do_fetch
|
||||
|
||||
23
scrapegraphai/docloaders/scrape_do.py
Normal file
23
scrapegraphai/docloaders/scrape_do.py
Normal file
@ -0,0 +1,23 @@
|
||||
"""
|
||||
scrape_do module: fetch the content of a web page through the scrape.do API
|
||||
"""
|
||||
import urllib.parse
|
||||
import requests
|
||||
|
||||
def scrape_do_fetch(token: str, target_url: str, timeout: float = 120.0) -> str:
    """
    Fetch a web page through the scrape.do API and return the response body.

    The token and the target URL are sent as query parameters of a single GET
    request to the scrape.do endpoint, and the body of that response is
    returned as text.

    Args:
        token (str): The API token for the scrape.do service.
        target_url (str): The URL of the web page to fetch.
        timeout (float): Seconds to wait on the connection before giving up.
            Defaults to a generous 120 seconds; pass a different value to tune.

    Returns:
        str: The response body as text. This is ONE string, not a list of
        pages: iterating over it yields single characters, so wrap it in a
        list before building one document per item.

    Raises:
        ValueError: If `token` or `target_url` is empty or None.
        requests.RequestException: If the request fails or times out.
    """
    # Fail early instead of sending the literal string "None" as a token,
    # which happens when the environment variable holding the key is unset.
    if not token:
        raise ValueError("A scrape.do API token is required.")
    if not target_url:
        raise ValueError("A target URL is required.")

    # requests percent-encodes the query values itself, so the target URL's
    # own "?", "&" and "=" cannot leak into the scrape.do query string.
    # HTTPS keeps the API token out of clear text on the wire.
    response = requests.get(
        "https://api.scrape.do/",
        params={"token": token, "url": target_url},
        timeout=timeout,
    )

    # The HTTP status is deliberately not checked: as before, whatever body
    # the service returns is handed back to the caller unchanged.
    return response.text
|
||||
@ -63,6 +63,7 @@ class AbstractGraph(ABC):
|
||||
self.loader_kwargs = self.config.get("loader_kwargs", {})
|
||||
self.cache_path = self.config.get("cache_path", False)
|
||||
self.browser_base = self.config.get("browser_base")
|
||||
self.scrape_do = self.config.get("scrape_do")
|
||||
|
||||
self.graph = self._create_graph()
|
||||
self.final_state = None
|
||||
|
||||
@ -67,7 +67,8 @@ class SmartScraperGraph(AbstractGraph):
|
||||
"force": self.config.get("force", False),
|
||||
"cut": self.config.get("cut", True),
|
||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||
"browser_base": self.config.get("browser_base")
|
||||
"browser_base": self.config.get("browser_base"),
|
||||
"scrape_do": self.config.get("scrape_do")
|
||||
}
|
||||
)
|
||||
parse_node = ParseNode(
|
||||
|
||||
@ -76,6 +76,10 @@ class FetchNode(BaseNode):
|
||||
None if node_config is None else node_config.get("browser_base", None)
|
||||
)
|
||||
|
||||
self.scrape_do = (
|
||||
None if node_config is None else node_config.get("scrape_do", None)
|
||||
)
|
||||
|
||||
def execute(self, state):
|
||||
"""
|
||||
Executes the node's logic to fetch HTML content from a specified URL and
|
||||
@ -102,7 +106,7 @@ class FetchNode(BaseNode):
|
||||
|
||||
source = input_data[0]
|
||||
input_type = input_keys[0]
|
||||
|
||||
|
||||
handlers = {
|
||||
"json_dir": self.handle_directory,
|
||||
"xml_dir": self.handle_directory,
|
||||
@ -271,11 +275,19 @@ class FetchNode(BaseNode):
|
||||
try:
|
||||
from ..docloaders.browser_base import browser_base_fetch
|
||||
except ImportError:
|
||||
raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
|
||||
raise ImportError("""The browserbase module is not installed.
|
||||
Please install it using `pip install browserbase`.""")
|
||||
|
||||
data = browser_base_fetch(self.browser_base.get("api_key"),
|
||||
self.browser_base.get("project_id"), [source])
|
||||
|
||||
document = [Document(page_content=content,
|
||||
metadata={"source": source}) for content in data]
|
||||
elif self.scrape_do is not None:
|
||||
from ..docloaders.scrape_do import scrape_do_fetch
|
||||
data = scrape_do_fetch(self.scrape_do.get("api_key"),
|
||||
source)
|
||||
|
||||
document = [Document(page_content=content,
|
||||
metadata={"source": source}) for content in data]
|
||||
else:
|
||||
@ -283,7 +295,8 @@ class FetchNode(BaseNode):
|
||||
document = loader.load()
|
||||
|
||||
if not document or not document[0].page_content.strip():
|
||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||
raise ValueError("""No HTML body content found in
|
||||
the document fetched by ChromiumLoader.""")
|
||||
parsed_content = document[0].page_content
|
||||
|
||||
if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
|
||||
@ -292,7 +305,7 @@ class FetchNode(BaseNode):
|
||||
compressed_document = [
|
||||
Document(page_content=parsed_content, metadata={"source": "html file"})
|
||||
]
|
||||
|
||||
|
||||
return self.update_state(state, compressed_document)
|
||||
|
||||
def update_state(self, state, compressed_document):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user