fix: load CSV and XML sources in FetchNode and switch the CSV/XML scraper graphs to the "csv"/"xml" input keys

This commit is contained in:
VinciGit00 2024-05-09 20:46:46 +02:00
parent 84e8d12793
commit 324e977b85
3 changed files with 22 additions and 10 deletions

View File

@ -30,7 +30,7 @@ class CSVScraperGraph(AbstractGraph):
Creates the graph of nodes representing the workflow for web scraping. Creates the graph of nodes representing the workflow for web scraping.
""" """
fetch_node = FetchNode( fetch_node = FetchNode(
input="csv_dir", input="csv",
output=["doc"], output=["doc"],
) )
parse_node = ParseNode( parse_node = ParseNode(
@ -78,4 +78,4 @@ class CSVScraperGraph(AbstractGraph):
inputs = {"user_prompt": self.prompt, self.input_key: self.source} inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs) self.final_state, self.execution_info = self.graph.execute(inputs)
return self.final_state.get("answer", "No answer found.") return self.final_state.get("answer", "No answer found.")

View File

@ -56,7 +56,7 @@ class XMLScraperGraph(AbstractGraph):
""" """
fetch_node = FetchNode( fetch_node = FetchNode(
input="xml_dir", input="xml",
output=["doc"] output=["doc"]
) )
parse_node = ParseNode( parse_node = ParseNode(
@ -108,4 +108,4 @@ class XMLScraperGraph(AbstractGraph):
inputs = {"user_prompt": self.prompt, self.input_key: self.source} inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs) self.final_state, self.execution_info = self.graph.execute(inputs)
return self.final_state.get("answer", "No answer found.") return self.final_state.get("answer", "No answer found.")

View File

@ -1,7 +1,7 @@
""" """
FetchNode Module FetchNode Module
""" """
import pandas as pd
from typing import List, Optional from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document from langchain_core.documents import Document
@ -22,7 +22,7 @@ class FetchNode(BaseNode):
Attributes: Attributes:
headless (bool): A flag indicating whether the browser should run in headless mode. headless (bool): A flag indicating whether the browser should run in headless mode.
verbose (bool): A flag indicating whether to print verbose output during execution. verbose (bool): A flag indicating whether to print verbose output during execution.
Args: Args:
input (str): Boolean expression defining the input keys needed from the state. input (str): Boolean expression defining the input keys needed from the state.
output (List[str]): List of output keys to be updated in the state. output (List[str]): List of output keys to be updated in the state.
@ -30,11 +30,13 @@ class FetchNode(BaseNode):
node_name (str): The unique identifier name for the node, defaulting to "Fetch". node_name (str): The unique identifier name for the node, defaulting to "Fetch".
""" """
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"): def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
super().__init__(node_name, "node", input, output, 1) super().__init__(node_name, "node", input, output, 1)
self.headless = True if node_config is None else node_config.get("headless", True) self.headless = True if node_config is None else node_config.get(
self.verbose = False if node_config is None else node_config.get("verbose", False) "headless", True)
self.verbose = False if node_config is None else node_config.get(
"verbose", False)
def execute(self, state): def execute(self, state):
""" """
@ -72,6 +74,16 @@ class FetchNode(BaseNode):
loader = PyPDFLoader(source) loader = PyPDFLoader(source)
compressed_document = loader.load() compressed_document = loader.load()
elif self.input == "csv":
compressed_document = [Document(page_content=pd.read_csv(source), metadata={
"source": "xml"
})]
elif self.input == "xml":
with open(source, 'r', encoding='utf-8') as f:
data = f.read()
compressed_document = [Document(page_content=data, metadata={
"source": "xml"
})]
elif self.input == "pdf_dir": elif self.input == "pdf_dir":
pass pass
@ -82,7 +94,7 @@ class FetchNode(BaseNode):
else: else:
if self.node_config is not None and self.node_config.get("endpoint") is not None: if self.node_config is not None and self.node_config.get("endpoint") is not None:
loader = AsyncChromiumLoader( loader = AsyncChromiumLoader(
[source], [source],
proxies={"http": self.node_config["endpoint"]}, proxies={"http": self.node_config["endpoint"]},