From e6c7940a57929c2ed8c9fda1a6e375cc87a2b7f4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 12 Jun 2024 12:29:14 +0200 Subject: [PATCH] feat: add Parse_Node --- scrapegraphai/graphs/pdf_scraper_graph.py | 14 +++++++++++++- scrapegraphai/graphs/smart_scraper_graph.py | 3 ++- scrapegraphai/nodes/parse_node.py | 18 +++++++++++++----- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index ca79df41..c476e629 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -11,6 +11,7 @@ from .abstract_graph import AbstractGraph from ..nodes import ( FetchNode, + ParseNode, RAGNode, GenerateAnswerPDFNode ) @@ -66,6 +67,15 @@ class PDFScraperGraph(AbstractGraph): output=["doc"], ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "parse_html": False, + "chunk_size": self.model_token + } + ) + rag_node = RAGNode( input="user_prompt & (parsed_doc | doc)", output=["relevant_chunks"], @@ -86,11 +96,13 @@ class PDFScraperGraph(AbstractGraph): return BaseGraph( nodes=[ fetch_node, + parse_node, rag_node, generate_answer_node_pdf, ], edges=[ - (fetch_node, rag_node), + (fetch_node, parse_node), + (parse_node, rag_node), (rag_node, generate_answer_node_pdf) ], entry_point=fetch_node diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 85b292c3..35ff3df4 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -3,8 +3,8 @@ SmartScraperGraph Module """ from typing import Optional +import logging from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph @@ -70,6 +70,7 @@ class SmartScraperGraph(AbstractGraph): } ) logging.info("FetchNode configured with headless: %s", self.config.get("headless", True)) + parse_node = ParseNode( input="doc", output=["parsed_doc"], diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 3e77b3e9..5585ae80 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -70,12 +70,20 @@ class ParseNode(BaseNode): docs_transformed = input_data[0] if self.parse_html: docs_transformed = Html2TextTransformer().transform_documents(input_data[0]) - docs_transformed = docs_transformed[0] + docs_transformed = docs_transformed[0] - chunks = chunk(text=docs_transformed.page_content, - chunk_size= self.node_config.get("chunk_size", 4096), - token_counter=lambda x: len(x.split()), - memoize=False) + chunks = chunk(text=docs_transformed.page_content, + chunk_size= self.node_config.get("chunk_size", 4096), + token_counter=lambda x: len(x.split()), + memoize=False) + else: + docs_transformed = docs_transformed[0] + + chunks = chunk(text=docs_transformed, + chunk_size= self.node_config.get("chunk_size", 4096), + token_counter=lambda x: len(x.split()), + memoize=False) + state.update({self.output[0]: chunks}) return state