fix: refactoring of fetch_node

2026-07-01 21:00:48 +08:00 · 2024-08-07 11:56:10 +02:00 · 2024-08-07 11:56:10 +02:00 · 29ad140fa3
commit 29ad140fa3
parent 82e63213ae
5 changed files with 228 additions and 75 deletions
--- a/examples/local_models/package-lock.json
+++ b/examples/local_models/package-lock.json
@ -0,0 +1,6 @@
+{
+  "name": "local_models",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {}
+}
--- a/examples/local_models/package.json
+++ b/examples/local_models/package.json
@ -0,0 +1 @@
+{}
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@ -6,6 +6,8 @@
 #   features: []
 #   all-features: false
 #   with-sources: false
+#   generate-hashes: false
+#   universal: false

 -e file:.
 aiofiles==24.1.0
@ -110,6 +112,7 @@ filelock==3.15.4
    # via huggingface-hub
    # via torch
    # via transformers
+    # via triton
 fireworks-ai==0.14.0
    # via langchain-fireworks
 fonttools==4.53.1
@ -185,6 +188,7 @@ graphviz==0.20.3
    # via scrapegraphai
 greenlet==3.0.3
    # via playwright
+    # via sqlalchemy
 groq==0.9.0
    # via langchain-groq
 grpc-google-iam-v1==0.13.1
@ -353,6 +357,34 @@ numpy==1.26.4
    # via shapely
    # via streamlit
    # via transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via nvidia-cudnn-cu12
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-nccl-cu12==2.19.3
+    # via torch
+nvidia-nvjitlink-cu12==12.6.20
+    # via nvidia-cusolver-cu12
+    # via nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 openai==1.37.0
    # via burr
    # via langchain-fireworks
@ -593,6 +625,8 @@ tqdm==4.66.4
 transformers==4.43.3
    # via langchain-huggingface
    # via sentence-transformers
+triton==2.2.0
+    # via torch
 typer==0.12.3
    # via fastapi-cli
 typing-extensions==4.12.2
@ -635,6 +669,8 @@ uvicorn==0.30.3
    # via fastapi
 uvloop==0.19.0
    # via uvicorn
+watchdog==4.0.1
+    # via streamlit
 watchfiles==0.22.0
    # via uvicorn
 websockets==12.0
--- a/requirements.lock
+++ b/requirements.lock
@ -6,6 +6,8 @@
 #   features: []
 #   all-features: false
 #   with-sources: false
+#   generate-hashes: false
+#   universal: false

 -e file:.
 aiohttp==3.9.5
@ -67,6 +69,7 @@ filelock==3.15.4
    # via huggingface-hub
    # via torch
    # via transformers
+    # via triton
 fireworks-ai==0.14.0
    # via langchain-fireworks
 free-proxy==1.1.1
@ -133,6 +136,7 @@ graphviz==0.20.3
    # via scrapegraphai
 greenlet==3.0.3
    # via playwright
+    # via sqlalchemy
 groq==0.9.0
    # via langchain-groq
 grpc-google-iam-v1==0.13.1
@ -258,6 +262,34 @@ numpy==1.26.4
    # via sentence-transformers
    # via shapely
    # via transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via nvidia-cudnn-cu12
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via nvidia-cusolver-cu12
+    # via torch
+nvidia-nccl-cu12==2.19.3
+    # via torch
+nvidia-nvjitlink-cu12==12.6.20
+    # via nvidia-cusolver-cu12
+    # via nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 openai==1.37.0
    # via langchain-fireworks
    # via langchain-openai
@ -408,6 +440,8 @@ tqdm==4.66.4
 transformers==4.43.3
    # via langchain-huggingface
    # via sentence-transformers
+triton==2.2.0
+    # via torch
 typing-extensions==4.12.2
    # via anthropic
    # via anyio
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -102,81 +102,150 @@ class FetchNode(BaseNode):
        input_data = [state[key] for key in input_keys]

        source = input_data[0]
-        if (
-            input_keys[0] == "json_dir"
-            or input_keys[0] == "xml_dir"
-            or input_keys[0] == "csv_dir"
-            or input_keys[0] == "pdf_dir"
-            or input_keys[0] == "md_dir"
-        ):
-            compressed_document = [
-                source
-            ]
-
-            state.update({self.output[0]: compressed_document})
-            return state
-        # handling pdf
-        elif input_keys[0] == "pdf":
-            loader = PyPDFLoader(source)
-            compressed_document = loader.load()
-            state.update({self.output[0]: compressed_document})
-            return state
-
-        elif input_keys[0] == "csv":
-            compressed_document = [
-                Document(
-                    page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
-                )
-            ]
-            state.update({self.output[0]: compressed_document})
-            return state
-        elif input_keys[0] == "json":
-            f = open(source, encoding="utf-8")
-            compressed_document = [
-                Document(page_content=str(json.load(f)), metadata={"source": "json"})
-            ]
-            state.update({self.output[0]: compressed_document})
-            return state
-
-        elif input_keys[0] == "xml":
-            with open(source, "r", encoding="utf-8") as f:
-                data = f.read()
-            compressed_document = [
-                Document(page_content=data, metadata={"source": "xml"})
-            ]
-            state.update({self.output[0]: compressed_document})
-            return state
-        elif input_keys[0] == "md":
-            with open(source, "r", encoding="utf-8") as f:
-                data = f.read()
-            compressed_document = [
-                Document(page_content=data, metadata={"source": "md"})
-            ]
-            state.update({self.output[0]: compressed_document})
-            return state
-
+        input_type = input_keys[0]
+        
+        handlers = {
+            "json_dir": self.handle_directory,
+            "xml_dir": self.handle_directory,
+            "csv_dir": self.handle_directory,
+            "pdf_dir": self.handle_directory,
+            "md_dir": self.handle_directory,
+            "pdf": self.handle_file,
+            "csv": self.handle_file,
+            "json": self.handle_file,
+            "xml": self.handle_file,
+            "md": self.handle_file,
+        }
+        
+        if input_type in handlers:
+            return handlers[input_type](state, input_type, source)
        elif self.input == "pdf_dir":
            pass
-
        elif not source.startswith("http"):
-            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
-            if not source.strip():
-                raise ValueError("No HTML body content found in the local source.")
+            return self.handle_local_source(state, source)
+        else:
+            return self.handle_web_source(state, source)
+    
+    
+    def handle_directory(self, state, input_type, source):
+        """
+        Handles a directory input by wrapping the source in a one-element list and updating the state.

+        Parameters:
+        state (dict): The current state of the graph.
+        input_type (str): The type of input being processed.
+        source (str): The directory source, stored unchanged as the only list element.
+
+        Returns:
+        dict: The updated state with the wrapped source stored under the node's first output key.
+        """
+        
+        compressed_document = [
+            source
+        ]
+        state.update({self.output[0]: compressed_document})
+        return state
+
+    def handle_file(self, state, input_type, source):
+        """
+        Loads a file based on its input type and stores the resulting documents in the state.
+
+        Parameters:
+        state (dict): The current state of the graph.
+        input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md").
+        source (str): The path to the source file.
+
+        Returns:
+        dict: The updated state with the compressed document.
+
+        The function supports the following input types:
+        - "pdf": Uses PyPDFLoader to load the content of a PDF file.
+        - "csv": Reads the content of a CSV file using pandas and converts it to a string.
+        - "json": Loads the content of a JSON file.
+        - "xml": Reads the content of an XML file as a string.
+        - "md": Reads the content of a Markdown file as a string.
+        """
+        
+        compressed_document = self.load_file_content(source, input_type)
+        
+        return self.update_state(state, compressed_document)
+        
+    def load_file_content(self, source, input_type):
+        """
+        Loads the content of a file based on its input type.
+
+        Parameters:
+        source (str): The path to the source file.
+        input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md").
+
+        Returns:
+        list: Document objects with the loaded content and metadata, or None (implicitly) if input_type is unsupported.
+        """
+        
+        if input_type == "pdf":
+            loader = PyPDFLoader(source)
+            return loader.load()
+        elif input_type == "csv":
+            return [Document(page_content=str(pd.read_csv(source)), metadata={"source": "csv"})]
+        elif input_type == "json":
+            with open(source, encoding="utf-8") as f:
+                return [Document(page_content=str(json.load(f)), metadata={"source": "json"})]
+        elif input_type == "xml" or input_type == "md":
+            with open(source, "r", encoding="utf-8") as f:
+                data = f.read()
+            return [Document(page_content=data, metadata={"source": input_type})]
+    
+    def handle_local_source(self, state, source):
+        """
+        Handles the local source by taking the given HTML content, optionally converting it to Markdown,
+        and updating the state.
+
+        Parameters:
+        state (dict): The current state of the graph.
+        source (str): The HTML content from the local source.
+
+        Returns:
+        dict: The updated state with the processed content.
+
+        Raises:
+        ValueError: If the source is empty or contains only whitespace.
+        """
+    
+        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+        if not source.strip():
+            raise ValueError("No HTML body content found in the local source.")
+        
+        parsed_content = source
+
+        if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
+            parsed_content = convert_to_md(source)
+        else:
            parsed_content = source

-            if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
+        compressed_document = [
+            Document(page_content=parsed_content, metadata={"source": "local_dir"})
+        ]
+        
+        return self.update_state(state, compressed_document)
+    
+    def handle_web_source(self, state, source):
+        """
+        Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown,
+        and updating the state.

-                parsed_content = convert_to_md(source)
-            else:
-                parsed_content = source
+        Parameters:
+        state (dict): The current state of the graph.
+        source (str): The URL of the web source to fetch HTML content from.

-            compressed_document = [
-                Document(page_content=parsed_content, metadata={"source": "local_dir"})
-            ]
+        Returns:
+        dict: The updated state with the processed content.

-        elif self.use_soup:
-            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+        Raises:
+        ValueError: If the fetched HTML content is empty or contains only whitespace.
+        """
+        
+        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+        if self.use_soup:
            response = requests.get(source)
            if response.status_code == 200:
                if not response.text.strip():
@ -194,9 +263,7 @@ class FetchNode(BaseNode):
                self.logger.warning(
                    f"Failed to retrieve contents from the webpage at url: {source}"
                )
-
        else:
-            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
            loader_kwargs = {}

            if self.node_config is not None:
@ -219,15 +286,24 @@ class FetchNode(BaseNode):
            if  isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
                parsed_content = convert_to_md(document[0].page_content, input_data[0])

-
            compressed_document = [
                Document(page_content=parsed_content, metadata={"source": "html file"})
            ]
+        
+        return self.update_state(state, compressed_document)
+        
+    def update_state(self, state, compressed_document):
+        """
+        Updates the state with the output data from the node.

-        state.update(
-            {
-                self.output[0]: compressed_document,
-            }
-        )
+        Args:
+            state (dict): The current state of the graph.
+            compressed_document (List[Document]): The compressed document content fetched
+                                                    by the node.

-        return state
+        Returns:
+            dict: The updated state with the output data.
+        """
+        
+        state.update({self.output[0]: compressed_document,})
+        return state