mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
feat: add conditional node structure to the smart_scraper_graph and implement a structured way to check conditions
This commit is contained in:
parent
931b975d79
commit
cacd9cde00
@ -18,4 +18,5 @@ undetected-playwright>=0.3.0
|
|||||||
semchunk>=1.0.1
|
semchunk>=1.0.1
|
||||||
langchain-ollama>=0.1.3
|
langchain-ollama>=0.1.3
|
||||||
simpleeval>=0.9.13
|
simpleeval>=0.9.13
|
||||||
googlesearch-python>=1.2.5
|
googlesearch-python>=1.2.5
|
||||||
|
async_timeout>=4.0.3
|
||||||
@ -2,7 +2,6 @@
|
|||||||
SmartScraperGraph Module
|
SmartScraperGraph Module
|
||||||
"""
|
"""
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import logging
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from .base_graph import BaseGraph
|
from .base_graph import BaseGraph
|
||||||
from .abstract_graph import AbstractGraph
|
from .abstract_graph import AbstractGraph
|
||||||
@ -10,8 +9,10 @@ from ..nodes import (
|
|||||||
FetchNode,
|
FetchNode,
|
||||||
ParseNode,
|
ParseNode,
|
||||||
ReasoningNode,
|
ReasoningNode,
|
||||||
GenerateAnswerNode
|
GenerateAnswerNode,
|
||||||
|
ConditionalNode
|
||||||
)
|
)
|
||||||
|
from ..prompts import REGEN_ADDITIONAL_INFO
|
||||||
|
|
||||||
class SmartScraperGraph(AbstractGraph):
|
class SmartScraperGraph(AbstractGraph):
|
||||||
"""
|
"""
|
||||||
@ -89,6 +90,28 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cond_node = None
|
||||||
|
regen_node = None
|
||||||
|
if self.config.get("reattempt") is True:
|
||||||
|
cond_node = ConditionalNode(
|
||||||
|
input="results",
|
||||||
|
output=["results"],
|
||||||
|
node_name="ConditionalNode",
|
||||||
|
node_config={
|
||||||
|
"key_name": "results",
|
||||||
|
"condition": 'results and results!="NA"',
|
||||||
|
}
|
||||||
|
)
|
||||||
|
regen_node = GenerateAnswerNode(
|
||||||
|
input="user_prompt & results",
|
||||||
|
output=["answer"],
|
||||||
|
node_config={
|
||||||
|
"llm_model": self.llm_model,
|
||||||
|
"additional_info": REGEN_ADDITIONAL_INFO,
|
||||||
|
"schema": self.schema,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
if self.config.get("html_mode") is False:
|
if self.config.get("html_mode") is False:
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
input="doc",
|
input="doc",
|
||||||
@ -99,6 +122,7 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
reasoning_node = None
|
||||||
if self.config.get("reasoning"):
|
if self.config.get("reasoning"):
|
||||||
reasoning_node = ReasoningNode(
|
reasoning_node = ReasoningNode(
|
||||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
||||||
@ -109,68 +133,72 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
"schema": self.schema,
|
"schema": self.schema,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Define the graph variation configurations
|
||||||
|
# (html_mode, reasoning, reattempt)
|
||||||
|
graph_variation_config = {
|
||||||
|
(False, True, False): {
|
||||||
|
"nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node],
|
||||||
|
"edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node)]
|
||||||
|
},
|
||||||
|
(True, True, False): {
|
||||||
|
"nodes": [fetch_node, reasoning_node, generate_answer_node],
|
||||||
|
"edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node)]
|
||||||
|
},
|
||||||
|
(True, False, False): {
|
||||||
|
"nodes": [fetch_node, generate_answer_node],
|
||||||
|
"edges": [(fetch_node, generate_answer_node)]
|
||||||
|
},
|
||||||
|
(False, False, False): {
|
||||||
|
"nodes": [fetch_node, parse_node, generate_answer_node],
|
||||||
|
"edges": [(fetch_node, parse_node), (parse_node, generate_answer_node)]
|
||||||
|
},
|
||||||
|
(False, True, True): {
|
||||||
|
"nodes": [fetch_node, parse_node, reasoning_node, generate_answer_node, cond_node, regen_node],
|
||||||
|
"edges": [(fetch_node, parse_node), (parse_node, reasoning_node), (reasoning_node, generate_answer_node),
|
||||||
|
(generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)]
|
||||||
|
},
|
||||||
|
(True, True, True): {
|
||||||
|
"nodes": [fetch_node, reasoning_node, generate_answer_node, cond_node, regen_node],
|
||||||
|
"edges": [(fetch_node, reasoning_node), (reasoning_node, generate_answer_node),
|
||||||
|
(generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)]
|
||||||
|
},
|
||||||
|
(True, False, True): {
|
||||||
|
"nodes": [fetch_node, generate_answer_node, cond_node, regen_node],
|
||||||
|
"edges": [(fetch_node, generate_answer_node), (generate_answer_node, cond_node),
|
||||||
|
(cond_node, regen_node), (cond_node, None)]
|
||||||
|
},
|
||||||
|
(False, False, True): {
|
||||||
|
"nodes": [fetch_node, parse_node, generate_answer_node, cond_node, regen_node],
|
||||||
|
"edges": [(fetch_node, parse_node), (parse_node, generate_answer_node),
|
||||||
|
(generate_answer_node, cond_node), (cond_node, regen_node), (cond_node, None)]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if self.config.get("html_mode") is False and self.config.get("reasoning") is True:
|
# Get the current conditions
|
||||||
|
html_mode = self.config.get("html_mode", False)
|
||||||
|
reasoning = self.config.get("reasoning", False)
|
||||||
|
reattempt = self.config.get("reattempt", False)
|
||||||
|
|
||||||
|
# Retrieve the appropriate graph configuration
|
||||||
|
config = graph_variation_config.get((html_mode, reasoning, reattempt))
|
||||||
|
|
||||||
|
if config:
|
||||||
return BaseGraph(
|
return BaseGraph(
|
||||||
nodes=[
|
nodes=config["nodes"],
|
||||||
fetch_node,
|
edges=config["edges"],
|
||||||
parse_node,
|
|
||||||
reasoning_node,
|
|
||||||
generate_answer_node,
|
|
||||||
],
|
|
||||||
edges=[
|
|
||||||
(fetch_node, parse_node),
|
|
||||||
(parse_node, reasoning_node),
|
|
||||||
(reasoning_node, generate_answer_node)
|
|
||||||
],
|
|
||||||
entry_point=fetch_node,
|
|
||||||
graph_name=self.__class__.__name__
|
|
||||||
)
|
|
||||||
|
|
||||||
elif self.config.get("html_mode") is True and self.config.get("reasoning") is True:
|
|
||||||
|
|
||||||
return BaseGraph(
|
|
||||||
nodes=[
|
|
||||||
fetch_node,
|
|
||||||
reasoning_node,
|
|
||||||
generate_answer_node,
|
|
||||||
],
|
|
||||||
edges=[
|
|
||||||
(fetch_node, reasoning_node),
|
|
||||||
(reasoning_node, generate_answer_node)
|
|
||||||
],
|
|
||||||
entry_point=fetch_node,
|
|
||||||
graph_name=self.__class__.__name__
|
|
||||||
)
|
|
||||||
|
|
||||||
elif self.config.get("html_mode") is True and self.config.get("reasoning") is False:
|
|
||||||
return BaseGraph(
|
|
||||||
nodes=[
|
|
||||||
fetch_node,
|
|
||||||
generate_answer_node,
|
|
||||||
],
|
|
||||||
edges=[
|
|
||||||
(fetch_node, generate_answer_node)
|
|
||||||
],
|
|
||||||
entry_point=fetch_node,
|
entry_point=fetch_node,
|
||||||
graph_name=self.__class__.__name__
|
graph_name=self.__class__.__name__
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Default return if no conditions match
|
||||||
return BaseGraph(
|
return BaseGraph(
|
||||||
nodes=[
|
nodes=[fetch_node, parse_node, generate_answer_node],
|
||||||
fetch_node,
|
edges=[(fetch_node, parse_node), (parse_node, generate_answer_node)],
|
||||||
parse_node,
|
entry_point=fetch_node,
|
||||||
generate_answer_node,
|
graph_name=self.__class__.__name__
|
||||||
],
|
)
|
||||||
edges=[
|
|
||||||
(fetch_node, parse_node),
|
|
||||||
(parse_node, generate_answer_node)
|
|
||||||
],
|
|
||||||
entry_point=fetch_node,
|
|
||||||
graph_name=self.__class__.__name__
|
|
||||||
)
|
|
||||||
|
|
||||||
def run(self) -> str:
|
def run(self) -> str:
|
||||||
"""
|
"""
|
||||||
Executes the scraping process and returns the answer to the prompt.
|
Executes the scraping process and returns the answer to the prompt.
|
||||||
|
|||||||
@ -5,7 +5,7 @@ __init__.py for the prompts folder
|
|||||||
from .generate_answer_node_prompts import (TEMPLATE_CHUNKS,
|
from .generate_answer_node_prompts import (TEMPLATE_CHUNKS,
|
||||||
TEMPLATE_NO_CHUNKS,
|
TEMPLATE_NO_CHUNKS,
|
||||||
TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD,
|
TEMPLATE_MERGE, TEMPLATE_CHUNKS_MD,
|
||||||
TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD)
|
TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD, REGEN_ADDITIONAL_INFO)
|
||||||
from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV,
|
from .generate_answer_node_csv_prompts import (TEMPLATE_CHUKS_CSV,
|
||||||
TEMPLATE_NO_CHUKS_CSV,
|
TEMPLATE_NO_CHUKS_CSV,
|
||||||
TEMPLATE_MERGE_CSV)
|
TEMPLATE_MERGE_CSV)
|
||||||
|
|||||||
@ -86,3 +86,7 @@ OUTPUT INSTRUCTIONS: {format_instructions}\n
|
|||||||
USER QUESTION: {question}\n
|
USER QUESTION: {question}\n
|
||||||
WEBSITE CONTENT: {context}\n
|
WEBSITE CONTENT: {context}\n
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
REGEN_ADDITIONAL_INFO = """
|
||||||
|
You are a scraper and you have just failed to scrape the requested information from a website. \n
|
||||||
|
I want you to try again and provide the missing informations. \n"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user