feat: add integration in the abstract graph

This commit is contained in:
Marco Vinciguerra 2024-08-01 12:51:18 +02:00
parent 7076ab12d3
commit 5ecdbe715f
4 changed files with 27 additions and 14 deletions

View File

@ -2,10 +2,12 @@
Basic example of scraping pipeline using SmartScraper
"""
import os, json
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from dotenv import load_dotenv
load_dotenv()
# ************************************************

View File

@ -1,4 +1,4 @@
"""__init__.py file for docloaders folder"""
from .chromium import ChromiumLoader
from .broswer_base import browser_base_fetch
from .browser_base import browser_base_fetch

View File

@ -72,15 +72,16 @@ class AbstractGraph(ABC):
self.source = source
self.config = config
self.schema = schema
self.llm_model = self._create_llm(config["llm"], chat=True)
self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder(
config["embeddings"])
self.verbose = False if config is None else config.get(
self.llm_model = self._create_llm(self.config["llm"], chat=True)
self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder(
self.config["embeddings"])
self.verbose = False if self.config is None else self.config.get(
"verbose", False)
self.headless = True if config is None else config.get(
self.headless = True if self.config is None else config.get(
"headless", True)
self.loader_kwargs = config.get("loader_kwargs", {})
self.cache_path = config.get("cache_path", False)
self.loader_kwargs = self.config.get("loader_kwargs", {})
self.cache_path = self.config.get("cache_path", False)
self.browser_base = self.config.get("browser_base")
# Create the graph
self.graph = self._create_graph()

View File

@ -11,6 +11,7 @@ from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from ..utils.cleanup_html import cleanup_html
from ..docloaders import ChromiumLoader
from ..docloaders.browser_base import browser_base_fetch
from ..utils.convert_to_md import convert_to_md
from ..utils.logging import get_logger
from .base_node import BaseNode
@ -74,6 +75,8 @@ class FetchNode(BaseNode):
False if node_config is None else node_config.get("cut", True)
)
self.browser_base = node_config.get("browser_base")
def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
@ -164,7 +167,7 @@ class FetchNode(BaseNode):
parsed_content = source
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
parsed_content = convert_to_md(source)
compressed_document = [
@ -177,7 +180,7 @@ class FetchNode(BaseNode):
if response.status_code == 200:
if not response.text.strip():
raise ValueError("No HTML body content found in the response.")
parsed_content = response
if not self.cut:
@ -198,8 +201,15 @@ class FetchNode(BaseNode):
if self.node_config is not None:
loader_kwargs = self.node_config.get("loader_kwargs", {})
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
document = loader.load()
if self.browser_base is not None:
document = [
Document(page_content= browser_base_fetch(self.browser_base.get("api_key"),
self.browser_base.get("project_id"), source),
metadata={})
]
else:
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
document = loader.load()
if not document or not document[0].page_content.strip():
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")