mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
Merge pull request #499 from ScrapeGraphAI/browserbase_integration
Browserbase integration
This commit is contained in:
commit
78343e50dc
49
examples/extras/browser_base_integration.py
Normal file
49
examples/extras/browser_base_integration.py
Normal file
@ -0,0 +1,49 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph with
Browserbase as the page-fetching backend.

Required environment variables (loaded from a local .env file if present):
    OPENAI_API_KEY            - key for the OpenAI model used by the graph
    BROWSER_BASE_API_KEY      - Browserbase API key
    BROWSER_BASE_PROJECT_ID   - Browserbase project identifier
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "gpt-3.5-turbo",
    },
    # Presence of the "browser_base" section is what routes fetching through
    # Browserbase instead of the local Chromium loader.
    "browser_base": {
        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
        # The project ID is a separate credential from the API key; reading
        # it from the API-key variable would send the wrong value.
        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
|
||||
@ -54,6 +54,8 @@ boto3==1.34.146
|
||||
botocore==1.34.146
|
||||
# via boto3
|
||||
# via s3transfer
|
||||
browserbase==0.3.0
|
||||
# via scrapegraphai
|
||||
burr==0.22.1
|
||||
# via scrapegraphai
|
||||
cachetools==5.4.0
|
||||
@ -208,6 +210,7 @@ httptools==0.6.1
|
||||
# via uvicorn
|
||||
httpx==0.27.0
|
||||
# via anthropic
|
||||
# via browserbase
|
||||
# via fastapi
|
||||
# via fireworks-ai
|
||||
# via groq
|
||||
@ -383,6 +386,7 @@ pillow==10.4.0
|
||||
platformdirs==4.2.2
|
||||
# via pylint
|
||||
playwright==1.45.0
|
||||
# via browserbase
|
||||
# via scrapegraphai
|
||||
# via undetected-playwright
|
||||
pluggy==1.5.0
|
||||
@ -412,6 +416,7 @@ pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pydantic==2.8.2
|
||||
# via anthropic
|
||||
# via browserbase
|
||||
# via burr
|
||||
# via fastapi
|
||||
# via fastapi-pagination
|
||||
|
||||
@ -37,6 +37,8 @@ boto3==1.34.146
|
||||
botocore==1.34.146
|
||||
# via boto3
|
||||
# via s3transfer
|
||||
browserbase==0.3.0
|
||||
# via scrapegraphai
|
||||
cachetools==5.4.0
|
||||
# via google-auth
|
||||
certifi==2024.7.4
|
||||
@ -153,6 +155,7 @@ httplib2==0.22.0
|
||||
# via google-auth-httplib2
|
||||
httpx==0.27.0
|
||||
# via anthropic
|
||||
# via browserbase
|
||||
# via fireworks-ai
|
||||
# via groq
|
||||
# via openai
|
||||
@ -275,6 +278,7 @@ pillow==10.4.0
|
||||
# via langchain-nvidia-ai-endpoints
|
||||
# via sentence-transformers
|
||||
playwright==1.45.0
|
||||
# via browserbase
|
||||
# via scrapegraphai
|
||||
# via undetected-playwright
|
||||
proto-plus==1.24.0
|
||||
@ -299,6 +303,7 @@ pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pydantic==2.8.2
|
||||
# via anthropic
|
||||
# via browserbase
|
||||
# via fireworks-ai
|
||||
# via google-cloud-aiplatform
|
||||
# via google-generativeai
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""__init__.py file for docloaders folder"""
|
||||
|
||||
from .chromium import ChromiumLoader
|
||||
from .broswer_base import browser_base_fetch
|
||||
from .browser_base import browser_base_fetch
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
"""
|
||||
browserbase integration module
|
||||
"""
|
||||
from typing import List
|
||||
from browserbase import Browserbase
|
||||
|
||||
def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
|
||||
def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
|
||||
"""
|
||||
BrowserBase Fetch
|
||||
|
||||
@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
|
||||
- `link`: The URL or link that you want to fetch data from.
|
||||
|
||||
It initializes a Browserbase object with the given API key and project ID,
|
||||
then uses this object to load the specified link. It returns the result of the loading operation.
|
||||
then uses this object to load the specified link.
|
||||
It returns the result of the loading operation.
|
||||
|
||||
Example usage:
|
||||
|
||||
@ -41,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
|
||||
|
||||
browserbase = Browserbase(api_key=api_key, project_id=project_id)
|
||||
|
||||
result = browserbase.load(link)
|
||||
result = browserbase.load([link])
|
||||
|
||||
return result
|
||||
@ -1,3 +1,6 @@
|
||||
"""
|
||||
Chromium module
|
||||
"""
|
||||
import asyncio
|
||||
from typing import Any, AsyncIterator, Iterator, List, Optional
|
||||
|
||||
|
||||
@ -59,10 +59,11 @@ class AbstractGraph(ABC):
|
||||
self.llm_model = self._create_llm(config["llm"])
|
||||
self.verbose = False if config is None else config.get(
|
||||
"verbose", False)
|
||||
self.headless = True if config is None else config.get(
|
||||
self.headless = True if self.config is None else config.get(
|
||||
"headless", True)
|
||||
self.loader_kwargs = config.get("loader_kwargs", {})
|
||||
self.cache_path = config.get("cache_path", False)
|
||||
self.loader_kwargs = self.config.get("loader_kwargs", {})
|
||||
self.cache_path = self.config.get("cache_path", False)
|
||||
self.browser_base = self.config.get("browser_base")
|
||||
|
||||
# Create the graph
|
||||
self.graph = self._create_graph()
|
||||
|
||||
@ -11,6 +11,7 @@ from langchain_community.document_loaders import PyPDFLoader
|
||||
from langchain_core.documents import Document
|
||||
from ..utils.cleanup_html import cleanup_html
|
||||
from ..docloaders import ChromiumLoader
|
||||
from ..docloaders.browser_base import browser_base_fetch
|
||||
from ..utils.convert_to_md import convert_to_md
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
@ -74,6 +75,8 @@ class FetchNode(BaseNode):
|
||||
False if node_config is None else node_config.get("cut", True)
|
||||
)
|
||||
|
||||
self.browser_base = node_config.get("browser_base")
|
||||
|
||||
def execute(self, state):
|
||||
"""
|
||||
Executes the node's logic to fetch HTML content from a specified URL and
|
||||
@ -164,7 +167,7 @@ class FetchNode(BaseNode):
|
||||
|
||||
parsed_content = source
|
||||
|
||||
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||
parsed_content = convert_to_md(source)
|
||||
|
||||
compressed_document = [
|
||||
@ -177,7 +180,7 @@ class FetchNode(BaseNode):
|
||||
if response.status_code == 200:
|
||||
if not response.text.strip():
|
||||
raise ValueError("No HTML body content found in the response.")
|
||||
|
||||
|
||||
parsed_content = response
|
||||
|
||||
if not self.cut:
|
||||
@ -198,8 +201,14 @@ class FetchNode(BaseNode):
|
||||
if self.node_config is not None:
|
||||
loader_kwargs = self.node_config.get("loader_kwargs", {})
|
||||
|
||||
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
||||
document = loader.load()
|
||||
if self.browser_base is not None:
|
||||
data = browser_base_fetch(self.browser_base.get("api_key"),
|
||||
self.browser_base.get("project_id"), [source])
|
||||
|
||||
document = [Document(page_content=content, metadata={"source": source}) for content in data]
|
||||
else:
|
||||
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
||||
document = loader.load()
|
||||
|
||||
if not document or not document[0].page_content.strip():
|
||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user