From a6e1813ddd36cc8d7c915e6ea0525835d64d10a2 Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Tue, 14 May 2024 16:51:10 +0200 Subject: [PATCH] fix(fetch_node): bug in handling local files --- examples/openai/custom_graph_openai copy.py | 113 ------------------- examples/openai/omni_scraper_openai.py | 2 +- scrapegraphai/graphs/csv_scraper_graph.py | 4 +- scrapegraphai/graphs/deep_scraper_graph.py | 2 +- scrapegraphai/graphs/json_scraper_graph.py | 4 +- scrapegraphai/graphs/pdf_scraper_graph.py | 4 +- scrapegraphai/graphs/script_creator_graph.py | 2 +- scrapegraphai/graphs/smart_scraper_graph.py | 2 +- scrapegraphai/graphs/speech_graph.py | 2 +- scrapegraphai/graphs/xml_scraper_graph.py | 4 +- scrapegraphai/nodes/fetch_node.py | 30 +++-- 11 files changed, 34 insertions(+), 135 deletions(-) delete mode 100644 examples/openai/custom_graph_openai copy.py diff --git a/examples/openai/custom_graph_openai copy.py b/examples/openai/custom_graph_openai copy.py deleted file mode 100644 index c42bbb5b..00000000 --- a/examples/openai/custom_graph_openai copy.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Example of custom graph using existing nodes -""" - -import os -from dotenv import load_dotenv - -from langchain_openai import OpenAIEmbeddings -from scrapegraphai.models import OpenAI, OpenAIImageToText -from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchNode, ParseNode, ImageToTextNode, RAGNode, GenerateAnswerOmniNode -load_dotenv() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -openai_key = os.getenv("OPENAI_APIKEY") - -graph_config = { - "llm": { - "api_key": openai_key, - "model": "gpt-4o", - "temperature": 0, - "streaming": False - }, -} - -# ************************************************ -# Define the graph nodes -# ************************************************ - -llm_model = OpenAI(graph_config["llm"]) -iit_model = OpenAIImageToText(graph_config["llm"]) -embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key) - -# define the nodes for the graph - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc", "link_urls", "img_urls"], - node_config={ - "verbose": True, - "headless": True, - } -) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_config={ - "chunk_size": 4096, - "verbose": True, - } -) -image_to_text_node = ImageToTextNode( - input="img_urls", - output=["img_desc"], - node_config={ - "llm_model": iit_model, - "max_images": 4, - } -) -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": llm_model, - "embedder_model": embedder, - "verbose": True, - } -) -generate_answer_omni_node = GenerateAnswerOmniNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc", - output=["answer"], - node_config={ - "llm_model": llm_model, - "verbose": True, - } -) - -# ************************************************ -# Create the graph by defining the connections -# ************************************************ - -graph = BaseGraph( - nodes=[ - fetch_node, - parse_node, - image_to_text_node, - rag_node, - generate_answer_omni_node, - ], - edges=[ - (fetch_node, parse_node), - (parse_node, image_to_text_node), - (image_to_text_node, rag_node), - (rag_node, generate_answer_omni_node) - ], - entry_point=fetch_node -) - -# ************************************************ -# Execute the graph -# ************************************************ - -result, execution_info = graph.execute({ - "user_prompt": "List me all the projects with their titles and image links and descriptions.", - "url": "https://perinim.github.io/projects/" -}) - -# get the answer from the result -result = result.get("answer", "No answer found.") -print(result) diff --git a/examples/openai/omni_scraper_openai.py b/examples/openai/omni_scraper_openai.py index 1d1d86ba..8847fbbc 100644 --- a/examples/openai/omni_scraper_openai.py +++ b/examples/openai/omni_scraper_openai.py @@ -19,7 +19,7 @@ openai_key = os.getenv("OPENAI_APIKEY") graph_config = { "llm": { "api_key": openai_key, - "model": "gpt-4o", + "model": "gpt-4-turbo", }, "verbose": True, "headless": True, diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index 178a9c47..59d74e65 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -30,8 +30,8 @@ class CSVScraperGraph(AbstractGraph): Creates the graph of nodes representing the workflow for web scraping. """ fetch_node = FetchNode( - input="csv", - output=["doc"], + input="csv | csv_dir", + output=["doc", "link_urls", "img_urls"], ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index 4d6d4d4b..4b4e672b 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -61,7 +61,7 @@ class DeepScraperGraph(AbstractGraph): """ fetch_node = FetchNode( input="url | local_dir", - output=["doc"] + output=["doc", "link_urls", "img_urls"] ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index dc341eae..9a272a03 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -54,8 +54,8 @@ class JSONScraperGraph(AbstractGraph): """ fetch_node = FetchNode( - input="json", - output=["doc"], + input="json | json_dir", + output=["doc", "link_urls", "img_urls"], ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 4eb42b37..58a54ab0 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -56,8 +56,8 @@ class PDFScraperGraph(AbstractGraph): """ fetch_node = FetchNode( - input='pdf', - output=["doc"], + input='pdf | pdf_dir', + output=["doc", "link_urls", "img_urls"], ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index eafe4057..773ab2b0 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -59,7 +59,7 @@ class ScriptCreatorGraph(AbstractGraph): fetch_node = FetchNode( input="url | local_dir", - output=["doc"], + output=["doc", "link_urls", "img_urls"], ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index cef674a3..4093e49f 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -57,7 +57,7 @@ class SmartScraperGraph(AbstractGraph): """ fetch_node = FetchNode( input="url | local_dir", - output=["doc"], + output=["doc", "link_urls", "img_urls"], node_config={ "loader_kwargs": self.config.get("loader_kwargs", {}), } diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 3ca2b703..80c09537 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -56,7 +56,7 @@ class SpeechGraph(AbstractGraph): fetch_node = FetchNode( input="url | local_dir", - output=["doc"] + output=["doc", "link_urls", "img_urls"] ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index b487f6ae..90d8dc55 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -56,8 +56,8 @@ class XMLScraperGraph(AbstractGraph): """ fetch_node = FetchNode( - input="xml", - output=["doc"] + input="xml | xml_dir", + output=["doc", "link_urls", "img_urls"] ) parse_node = ParseNode( input="doc", diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 51d366f4..6528f098 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -83,37 +83,49 @@ class FetchNode(BaseNode): source = input_data[0] if ( - self.input == "json_dir" - or self.input == "xml_dir" - or self.input == "csv_dir" + input_keys[0] == "json_dir" + or input_keys[0] == "xml_dir" + or input_keys[0] == "csv_dir" ): compressed_document = [ Document(page_content=source, metadata={"source": "local_dir"}) ] - # if it is a local directory - + state.update({self.output[0]: compressed_document}) + return state + # handling for pdf - elif self.input == "pdf": + elif input_keys[0] == "pdf": loader = PyPDFLoader(source) compressed_document = loader.load() + state.update({self.output[0]: compressed_document}) + return state - elif self.input == "csv": + elif input_keys[0] == "csv": compressed_document = [ Document( page_content=str(pd.read_csv(source)), metadata={"source": "csv"} ) ] - elif self.input == "json": + state.update({self.output[0]: compressed_document}) + return state + + elif input_keys[0] == "json": f = open(source) compressed_document = [ Document(page_content=str(json.load(f)), metadata={"source": "json"}) ] - elif self.input == "xml": + state.update({self.output[0]: compressed_document}) + return state + + elif input_keys[0] == "xml": with open(source, "r", encoding="utf-8") as f: data = f.read() compressed_document = [ Document(page_content=data, metadata={"source": "xml"}) ] + state.update({self.output[0]: compressed_document}) + return state + elif self.input == "pdf_dir": pass