mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
fix: fixed bugs for csv and xml
This commit is contained in:
parent
84e8d12793
commit
324e977b85
@ -30,7 +30,7 @@ class CSVScraperGraph(AbstractGraph):
|
|||||||
Creates the graph of nodes representing the workflow for web scraping.
|
Creates the graph of nodes representing the workflow for web scraping.
|
||||||
"""
|
"""
|
||||||
fetch_node = FetchNode(
|
fetch_node = FetchNode(
|
||||||
input="csv_dir",
|
input="csv",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
@ -78,4 +78,4 @@ class CSVScraperGraph(AbstractGraph):
|
|||||||
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
||||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||||
|
|
||||||
return self.final_state.get("answer", "No answer found.")
|
return self.final_state.get("answer", "No answer found.")
|
||||||
|
|||||||
@ -56,7 +56,7 @@ class XMLScraperGraph(AbstractGraph):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
fetch_node = FetchNode(
|
fetch_node = FetchNode(
|
||||||
input="xml_dir",
|
input="xml",
|
||||||
output=["doc"]
|
output=["doc"]
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
@ -108,4 +108,4 @@ class XMLScraperGraph(AbstractGraph):
|
|||||||
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
||||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||||
|
|
||||||
return self.final_state.get("answer", "No answer found.")
|
return self.final_state.get("answer", "No answer found.")
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
FetchNode Module
|
FetchNode Module
|
||||||
"""
|
"""
|
||||||
|
import pandas as pd
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -22,7 +22,7 @@ class FetchNode(BaseNode):
|
|||||||
Attributes:
|
Attributes:
|
||||||
headless (bool): A flag indicating whether the browser should run in headless mode.
|
headless (bool): A flag indicating whether the browser should run in headless mode.
|
||||||
verbose (bool): A flag indicating whether to print verbose output during execution.
|
verbose (bool): A flag indicating whether to print verbose output during execution.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
input (str): Boolean expression defining the input keys needed from the state.
|
input (str): Boolean expression defining the input keys needed from the state.
|
||||||
output (List[str]): List of output keys to be updated in the state.
|
output (List[str]): List of output keys to be updated in the state.
|
||||||
@ -30,11 +30,13 @@ class FetchNode(BaseNode):
|
|||||||
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
|
node_name (str): The unique identifier name for the node, defaulting to "Fetch".
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
|
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
|
||||||
super().__init__(node_name, "node", input, output, 1)
|
super().__init__(node_name, "node", input, output, 1)
|
||||||
|
|
||||||
self.headless = True if node_config is None else node_config.get("headless", True)
|
self.headless = True if node_config is None else node_config.get(
|
||||||
self.verbose = False if node_config is None else node_config.get("verbose", False)
|
"headless", True)
|
||||||
|
self.verbose = False if node_config is None else node_config.get(
|
||||||
|
"verbose", False)
|
||||||
|
|
||||||
def execute(self, state):
|
def execute(self, state):
|
||||||
"""
|
"""
|
||||||
@ -72,6 +74,16 @@ class FetchNode(BaseNode):
|
|||||||
loader = PyPDFLoader(source)
|
loader = PyPDFLoader(source)
|
||||||
compressed_document = loader.load()
|
compressed_document = loader.load()
|
||||||
|
|
||||||
|
elif self.input == "csv":
|
||||||
|
compressed_document = [Document(page_content=pd.read_csv(source), metadata={
|
||||||
|
"source": "xml"
|
||||||
|
})]
|
||||||
|
elif self.input == "xml":
|
||||||
|
with open(source, 'r', encoding='utf-8') as f:
|
||||||
|
data = f.read()
|
||||||
|
compressed_document = [Document(page_content=data, metadata={
|
||||||
|
"source": "xml"
|
||||||
|
})]
|
||||||
elif self.input == "pdf_dir":
|
elif self.input == "pdf_dir":
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -82,7 +94,7 @@ class FetchNode(BaseNode):
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
||||||
|
|
||||||
loader = AsyncChromiumLoader(
|
loader = AsyncChromiumLoader(
|
||||||
[source],
|
[source],
|
||||||
proxies={"http": self.node_config["endpoint"]},
|
proxies={"http": self.node_config["endpoint"]},
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user