Merge pull request #499 from ScrapeGraphAI/browserbase_integration

Browserbase integration
This commit is contained in:
Federico Minutoli 2024-08-01 15:15:59 +02:00 committed by GitHub
commit 78343e50dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 85 additions and 11 deletions

View File

@ -0,0 +1,49 @@
"""
Basic example of scraping pipeline using SmartScraper
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
load_dotenv()
# ************************************************
# Define the configuration for the graph
# ************************************************
graph_config = {
    # LLM used to answer the prompt over the fetched page content.
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "gpt-3.5-turbo",
    },
    # Browserbase credentials. The API key and the project ID are two
    # distinct values, so each one is read from its own environment
    # variable (previously both were read from BROWSER_BASE_API_KEY).
    "browser_base": {
        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
    },
    "verbose": True,
    "headless": False,
}
# ------------------------------------------------
# Build the scraping graph, execute it and show the answer as JSON
# ------------------------------------------------
smart_scraper_graph = SmartScraperGraph(
    config=graph_config,
    source="https://scrapegraphai.com/",
    prompt="List me what does the company do, the name and a contact email.",
)

result = smart_scraper_graph.run()
formatted_result = json.dumps(result, indent=4)
print(formatted_result)

# ------------------------------------------------
# Report the execution statistics collected by the graph
# ------------------------------------------------
graph_exec_info = smart_scraper_graph.get_execution_info()
exec_report = prettify_exec_info(graph_exec_info)
print(exec_report)

View File

@ -54,6 +54,8 @@ boto3==1.34.146
botocore==1.34.146
# via boto3
# via s3transfer
browserbase==0.3.0
# via scrapegraphai
burr==0.22.1
# via scrapegraphai
cachetools==5.4.0
@ -208,6 +210,7 @@ httptools==0.6.1
# via uvicorn
httpx==0.27.0
# via anthropic
# via browserbase
# via fastapi
# via fireworks-ai
# via groq
@ -383,6 +386,7 @@ pillow==10.4.0
platformdirs==4.2.2
# via pylint
playwright==1.45.0
# via browserbase
# via scrapegraphai
# via undetected-playwright
pluggy==1.5.0
@ -412,6 +416,7 @@ pyasn1-modules==0.4.0
# via google-auth
pydantic==2.8.2
# via anthropic
# via browserbase
# via burr
# via fastapi
# via fastapi-pagination

View File

@ -37,6 +37,8 @@ boto3==1.34.146
botocore==1.34.146
# via boto3
# via s3transfer
browserbase==0.3.0
# via scrapegraphai
cachetools==5.4.0
# via google-auth
certifi==2024.7.4
@ -153,6 +155,7 @@ httplib2==0.22.0
# via google-auth-httplib2
httpx==0.27.0
# via anthropic
# via browserbase
# via fireworks-ai
# via groq
# via openai
@ -275,6 +278,7 @@ pillow==10.4.0
# via langchain-nvidia-ai-endpoints
# via sentence-transformers
playwright==1.45.0
# via browserbase
# via scrapegraphai
# via undetected-playwright
proto-plus==1.24.0
@ -299,6 +303,7 @@ pyasn1-modules==0.4.0
# via google-auth
pydantic==2.8.2
# via anthropic
# via browserbase
# via fireworks-ai
# via google-cloud-aiplatform
# via google-generativeai

View File

@ -1,4 +1,4 @@
"""__init__.py file for docloaders folder"""
from .chromium import ChromiumLoader
from .broswer_base import browser_base_fetch
from .browser_base import browser_base_fetch

View File

@ -1,9 +1,10 @@
"""
browserbase integration module
"""
from typing import List
from browserbase import Browserbase
def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]:
"""
BrowserBase Fetch
@ -15,7 +16,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
- `link`: The URL or link that you want to fetch data from.
It initializes a Browserbase object with the given API key and project ID,
then uses this object to load the specified link. It returns the result of the loading operation.
then uses this object to load the specified link.
It returns the result of the loading operation.
Example usage:
@ -41,6 +43,6 @@ def browser_base_fetch(api_key: str, project_id: str, link: str) -> object:
browserbase = Browserbase(api_key=api_key, project_id=project_id)
result = browserbase.load(link)
result = browserbase.load([link])
return result

View File

@ -1,3 +1,6 @@
"""
Chromium module
"""
import asyncio
from typing import Any, AsyncIterator, Iterator, List, Optional

View File

@ -59,10 +59,11 @@ class AbstractGraph(ABC):
self.llm_model = self._create_llm(config["llm"])
self.verbose = False if config is None else config.get(
"verbose", False)
self.headless = True if config is None else config.get(
self.headless = True if self.config is None else config.get(
"headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})
self.cache_path = config.get("cache_path", False)
self.loader_kwargs = self.config.get("loader_kwargs", {})
self.cache_path = self.config.get("cache_path", False)
self.browser_base = self.config.get("browser_base")
# Create the graph
self.graph = self._create_graph()

View File

@ -11,6 +11,7 @@ from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from ..utils.cleanup_html import cleanup_html
from ..docloaders import ChromiumLoader
from ..docloaders.browser_base import browser_base_fetch
from ..utils.convert_to_md import convert_to_md
from ..utils.logging import get_logger
from .base_node import BaseNode
@ -74,6 +75,8 @@ class FetchNode(BaseNode):
False if node_config is None else node_config.get("cut", True)
)
self.browser_base = node_config.get("browser_base")
def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
@ -164,7 +167,7 @@ class FetchNode(BaseNode):
parsed_content = source
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
parsed_content = convert_to_md(source)
compressed_document = [
@ -177,7 +180,7 @@ class FetchNode(BaseNode):
if response.status_code == 200:
if not response.text.strip():
raise ValueError("No HTML body content found in the response.")
parsed_content = response
if not self.cut:
@ -198,8 +201,14 @@ class FetchNode(BaseNode):
if self.node_config is not None:
loader_kwargs = self.node_config.get("loader_kwargs", {})
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
document = loader.load()
if self.browser_base is not None:
data = browser_base_fetch(self.browser_base.get("api_key"),
self.browser_base.get("project_id"), [source])
document = [Document(page_content=content, metadata={"source": source}) for content in data]
else:
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
document = loader.load()
if not document or not document[0].page_content.strip():
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")