mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
feat: add integration in the abstract graph
This commit is contained in:
parent
7076ab12d3
commit
5ecdbe715f
@ -2,10 +2,12 @@
|
||||
Basic example of scraping pipeline using SmartScraper
|
||||
"""
|
||||
|
||||
import os, json
|
||||
import os
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SmartScraperGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# ************************************************
|
||||
@ -1,4 +1,4 @@
|
||||
"""__init__.py file for docloaders folder"""
|
||||
|
||||
from .chromium import ChromiumLoader
|
||||
from .broswer_base import browser_base_fetch
|
||||
from .browser_base import browser_base_fetch
|
||||
|
||||
@ -72,15 +72,16 @@ class AbstractGraph(ABC):
|
||||
self.source = source
|
||||
self.config = config
|
||||
self.schema = schema
|
||||
self.llm_model = self._create_llm(config["llm"], chat=True)
|
||||
self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder(
|
||||
config["embeddings"])
|
||||
self.verbose = False if config is None else config.get(
|
||||
self.llm_model = self._create_llm(self.config["llm"], chat=True)
|
||||
self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder(
|
||||
self.config["embeddings"])
|
||||
self.verbose = False if self.config is None else self.config.get(
|
||||
"verbose", False)
|
||||
self.headless = True if config is None else config.get(
|
||||
self.headless = True if self.config is None else config.get(
|
||||
"headless", True)
|
||||
self.loader_kwargs = config.get("loader_kwargs", {})
|
||||
self.cache_path = config.get("cache_path", False)
|
||||
self.loader_kwargs = self.config.get("loader_kwargs", {})
|
||||
self.cache_path = self.config.get("cache_path", False)
|
||||
self.browser_base = self.config.get("browser_base")
|
||||
|
||||
# Create the graph
|
||||
self.graph = self._create_graph()
|
||||
|
||||
@ -11,6 +11,7 @@ from langchain_community.document_loaders import PyPDFLoader
|
||||
from langchain_core.documents import Document
|
||||
from ..utils.cleanup_html import cleanup_html
|
||||
from ..docloaders import ChromiumLoader
|
||||
from ..docloaders.browser_base import browser_base_fetch
|
||||
from ..utils.convert_to_md import convert_to_md
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
@ -74,6 +75,8 @@ class FetchNode(BaseNode):
|
||||
False if node_config is None else node_config.get("cut", True)
|
||||
)
|
||||
|
||||
self.browser_base = node_config.get("browser_base")
|
||||
|
||||
def execute(self, state):
|
||||
"""
|
||||
Executes the node's logic to fetch HTML content from a specified URL and
|
||||
@ -164,7 +167,7 @@ class FetchNode(BaseNode):
|
||||
|
||||
parsed_content = source
|
||||
|
||||
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||
parsed_content = convert_to_md(source)
|
||||
|
||||
compressed_document = [
|
||||
@ -177,7 +180,7 @@ class FetchNode(BaseNode):
|
||||
if response.status_code == 200:
|
||||
if not response.text.strip():
|
||||
raise ValueError("No HTML body content found in the response.")
|
||||
|
||||
|
||||
parsed_content = response
|
||||
|
||||
if not self.cut:
|
||||
@ -198,8 +201,15 @@ class FetchNode(BaseNode):
|
||||
if self.node_config is not None:
|
||||
loader_kwargs = self.node_config.get("loader_kwargs", {})
|
||||
|
||||
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
||||
document = loader.load()
|
||||
if self.browser_base is not None:
|
||||
document = [
|
||||
Document(page_content= browser_base_fetch(self.browser_base.get("api_key"),
|
||||
self.browser_base.get("project_id"), source),
|
||||
metadata={})
|
||||
]
|
||||
else:
|
||||
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
||||
document = loader.load()
|
||||
|
||||
if not document or not document[0].page_content.strip():
|
||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user