mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
feat: add integration in the abstract graph
This commit is contained in:
parent
7076ab12d3
commit
5ecdbe715f
@ -2,10 +2,12 @@
|
|||||||
Basic example of scraping pipeline using SmartScraper
|
Basic example of scraping pipeline using SmartScraper
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os, json
|
import os
|
||||||
|
import json
|
||||||
|
from dotenv import load_dotenv
|
||||||
from scrapegraphai.graphs import SmartScraperGraph
|
from scrapegraphai.graphs import SmartScraperGraph
|
||||||
from scrapegraphai.utils import prettify_exec_info
|
from scrapegraphai.utils import prettify_exec_info
|
||||||
from dotenv import load_dotenv
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
@ -1,4 +1,4 @@
|
|||||||
"""__init__.py file for docloaders folder"""
|
"""__init__.py file for docloaders folder"""
|
||||||
|
|
||||||
from .chromium import ChromiumLoader
|
from .chromium import ChromiumLoader
|
||||||
from .broswer_base import browser_base_fetch
|
from .browser_base import browser_base_fetch
|
||||||
|
|||||||
@ -72,15 +72,16 @@ class AbstractGraph(ABC):
|
|||||||
self.source = source
|
self.source = source
|
||||||
self.config = config
|
self.config = config
|
||||||
self.schema = schema
|
self.schema = schema
|
||||||
self.llm_model = self._create_llm(config["llm"], chat=True)
|
self.llm_model = self._create_llm(self.config["llm"], chat=True)
|
||||||
self.embedder_model = self._create_default_embedder(llm_config=config["llm"]) if "embeddings" not in config else self._create_embedder(
|
self.embedder_model = self._create_default_embedder(llm_config=self.config["llm"]) if "embeddings" not in self.config else self._create_embedder(
|
||||||
config["embeddings"])
|
self.config["embeddings"])
|
||||||
self.verbose = False if config is None else config.get(
|
self.verbose = False if self.config is None else self.config.get(
|
||||||
"verbose", False)
|
"verbose", False)
|
||||||
self.headless = True if config is None else config.get(
|
self.headless = True if self.config is None else config.get(
|
||||||
"headless", True)
|
"headless", True)
|
||||||
self.loader_kwargs = config.get("loader_kwargs", {})
|
self.loader_kwargs = self.config.get("loader_kwargs", {})
|
||||||
self.cache_path = config.get("cache_path", False)
|
self.cache_path = self.config.get("cache_path", False)
|
||||||
|
self.browser_base = self.config.get("browser_base")
|
||||||
|
|
||||||
# Create the graph
|
# Create the graph
|
||||||
self.graph = self._create_graph()
|
self.graph = self._create_graph()
|
||||||
|
|||||||
@ -11,6 +11,7 @@ from langchain_community.document_loaders import PyPDFLoader
|
|||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from ..utils.cleanup_html import cleanup_html
|
from ..utils.cleanup_html import cleanup_html
|
||||||
from ..docloaders import ChromiumLoader
|
from ..docloaders import ChromiumLoader
|
||||||
|
from ..docloaders.browser_base import browser_base_fetch
|
||||||
from ..utils.convert_to_md import convert_to_md
|
from ..utils.convert_to_md import convert_to_md
|
||||||
from ..utils.logging import get_logger
|
from ..utils.logging import get_logger
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
@ -74,6 +75,8 @@ class FetchNode(BaseNode):
|
|||||||
False if node_config is None else node_config.get("cut", True)
|
False if node_config is None else node_config.get("cut", True)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.browser_base = node_config.get("browser_base")
|
||||||
|
|
||||||
def execute(self, state):
|
def execute(self, state):
|
||||||
"""
|
"""
|
||||||
Executes the node's logic to fetch HTML content from a specified URL and
|
Executes the node's logic to fetch HTML content from a specified URL and
|
||||||
@ -164,7 +167,7 @@ class FetchNode(BaseNode):
|
|||||||
|
|
||||||
parsed_content = source
|
parsed_content = source
|
||||||
|
|
||||||
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||||
parsed_content = convert_to_md(source)
|
parsed_content = convert_to_md(source)
|
||||||
|
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
@ -177,7 +180,7 @@ class FetchNode(BaseNode):
|
|||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
if not response.text.strip():
|
if not response.text.strip():
|
||||||
raise ValueError("No HTML body content found in the response.")
|
raise ValueError("No HTML body content found in the response.")
|
||||||
|
|
||||||
parsed_content = response
|
parsed_content = response
|
||||||
|
|
||||||
if not self.cut:
|
if not self.cut:
|
||||||
@ -198,8 +201,15 @@ class FetchNode(BaseNode):
|
|||||||
if self.node_config is not None:
|
if self.node_config is not None:
|
||||||
loader_kwargs = self.node_config.get("loader_kwargs", {})
|
loader_kwargs = self.node_config.get("loader_kwargs", {})
|
||||||
|
|
||||||
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
if self.browser_base is not None:
|
||||||
document = loader.load()
|
document = [
|
||||||
|
Document(page_content= browser_base_fetch(self.browser_base.get("api_key"),
|
||||||
|
self.browser_base.get("project_id"), source),
|
||||||
|
metadata={})
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
|
||||||
|
document = loader.load()
|
||||||
|
|
||||||
if not document or not document[0].page_content.strip():
|
if not document or not document[0].page_content.strip():
|
||||||
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user