From 5ecdbe715f4bb223fa1be834fda07ccea2a51cb9 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 1 Aug 2024 12:51:18 +0200 Subject: [PATCH] feat: add integration in the abstract graph --- ...ser_base.py => browser_base_integration.py} | 6 ++++-- scrapegraphai/docloaders/__init__.py | 2 +- scrapegraphai/graphs/abstract_graph.py | 15 ++++++++------- scrapegraphai/nodes/fetch_node.py | 18 ++++++++++++++---- 4 files changed, 27 insertions(+), 14 deletions(-) rename examples/extras/{browser_base.py => browser_base_integration.py} (98%) diff --git a/examples/extras/browser_base.py b/examples/extras/browser_base_integration.py similarity index 98% rename from examples/extras/browser_base.py rename to examples/extras/browser_base_integration.py index 465c80ba..97529879 100644 --- a/examples/extras/browser_base.py +++ b/examples/extras/browser_base_integration.py @@ -2,10 +2,12 @@ Basic example of scraping pipeline using SmartScraper """ -import os, json +import os +import json +from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info -from dotenv import load_dotenv + load_dotenv() # ************************************************ diff --git a/scrapegraphai/docloaders/__init__.py b/scrapegraphai/docloaders/__init__.py index 51561a42..45a3783d 100644 --- a/scrapegraphai/docloaders/__init__.py +++ b/scrapegraphai/docloaders/__init__.py @@ -1,4 +1,4 @@ """__init__.py file for docloaders folder""" from .chromium import ChromiumLoader -from .broswer_base import browser_base_fetch +from .browser_base import browser_base_fetch diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 50de0a94..2ccc988b 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -72,15 +72,16 @@ class AbstractGraph(ABC): self.source = source self.config = config self.schema = schema - self.llm_model = self._create_llm(config["llm"], 
chat=True) - self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder( - config["embeddings"]) - self.verbose = False if config is None else config.get( + self.llm_model = self._create_llm(self.config["llm"], chat=True) + self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder( + self.config["embeddings"]) + self.verbose = False if self.config is None else self.config.get( "verbose", False) - self.headless = True if config is None else config.get( + self.headless = True if self.config is None else config.get( "headless", True) - self.loader_kwargs = config.get("loader_kwargs", {}) - self.cache_path = config.get("cache_path", False) + self.loader_kwargs = self.config.get("loader_kwargs", {}) + self.cache_path = self.config.get("cache_path", False) + self.browser_base = self.config.get("browser_base") # Create the graph self.graph = self._create_graph() diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 64a80cfe..95561a66 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -11,6 +11,7 @@ from langchain_community.document_loaders import PyPDFLoader from langchain_core.documents import Document from ..utils.cleanup_html import cleanup_html from ..docloaders import ChromiumLoader +from ..docloaders.browser_base import browser_base_fetch from ..utils.convert_to_md import convert_to_md from ..utils.logging import get_logger from .base_node import BaseNode @@ -74,6 +75,8 @@ class FetchNode(BaseNode): False if node_config is None else node_config.get("cut", True) ) + self.browser_base = node_config.get("browser_base") + def execute(self, state): """ Executes the node's logic to fetch HTML content from a specified URL and @@ -164,7 +167,7 @@ class FetchNode(BaseNode): parsed_content = source - if isinstance(self.llm_model, ChatOpenAI) 
and not self.script_creator or self.force and not self.script_creator: + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator: parsed_content = convert_to_md(source) compressed_document = [ @@ -177,7 +180,7 @@ class FetchNode(BaseNode): if response.status_code == 200: if not response.text.strip(): raise ValueError("No HTML body content found in the response.") - + parsed_content = response if not self.cut: @@ -198,8 +201,15 @@ class FetchNode(BaseNode): if self.node_config is not None: loader_kwargs = self.node_config.get("loader_kwargs", {}) - loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) - document = loader.load() + if self.browser_base is not None: + document = [ + Document(page_content= browser_base_fetch(self.browser_base.get("api_key"), + self.browser_base.get("project_id"), source), + metadata={}) + ] + else: + loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs) + document = loader.load() if not document or not document[0].page_content.strip(): raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")