fix: refactoring of fetch_node

2026-07-04 21:00:36 +08:00 · 2024-08-07 11:56:10 +02:00 · 2024-08-07 11:56:10 +02:00 · 29ad140fa3
commit 29ad140fa3
parent 82e63213ae
5 changed files with 228 additions and 75 deletions
--- a/examples/local_models/package-lock.json
+++ b/examples/local_models/package-lock.json
@ -0,0 +1,6 @@
 {
  "name": "local_models",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {}
 }
--- a/examples/local_models/package.json
+++ b/examples/local_models/package.json
@ -0,0 +1 @@
 {}
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@ -6,6 +6,8 @@
 #   features: []
 #   all-features: false
 #   with-sources: false
 #   generate-hashes: false
 #   universal: false
 -e file:.
 aiofiles==24.1.0
@ -110,6 +112,7 @@ filelock==3.15.4
    # via huggingface-hub
    # via torch
    # via transformers
    # via triton
 fireworks-ai==0.14.0
    # via langchain-fireworks
 fonttools==4.53.1
@ -185,6 +188,7 @@ graphviz==0.20.3
    # via scrapegraphai
 greenlet==3.0.3
    # via playwright
    # via sqlalchemy
 groq==0.9.0
    # via langchain-groq
 grpc-google-iam-v1==0.13.1
@ -353,6 +357,34 @@ numpy==1.26.4
    # via shapely
    # via streamlit
    # via transformers
 nvidia-cublas-cu12==12.1.3.1
    # via nvidia-cudnn-cu12
    # via nvidia-cusolver-cu12
    # via torch
 nvidia-cuda-cupti-cu12==12.1.105
    # via torch
 nvidia-cuda-nvrtc-cu12==12.1.105
    # via torch
 nvidia-cuda-runtime-cu12==12.1.105
    # via torch
 nvidia-cudnn-cu12==8.9.2.26
    # via torch
 nvidia-cufft-cu12==11.0.2.54
    # via torch
 nvidia-curand-cu12==10.3.2.106
    # via torch
 nvidia-cusolver-cu12==11.4.5.107
    # via torch
 nvidia-cusparse-cu12==12.1.0.106
    # via nvidia-cusolver-cu12
    # via torch
 nvidia-nccl-cu12==2.19.3
    # via torch
 nvidia-nvjitlink-cu12==12.6.20
    # via nvidia-cusolver-cu12
    # via nvidia-cusparse-cu12
 nvidia-nvtx-cu12==12.1.105
    # via torch
 openai==1.37.0
    # via burr
    # via langchain-fireworks
@ -593,6 +625,8 @@ tqdm==4.66.4
 transformers==4.43.3
    # via langchain-huggingface
    # via sentence-transformers
 triton==2.2.0
    # via torch
 typer==0.12.3
    # via fastapi-cli
 typing-extensions==4.12.2
@ -635,6 +669,8 @@ uvicorn==0.30.3
    # via fastapi
 uvloop==0.19.0
    # via uvicorn
 watchdog==4.0.1
    # via streamlit
 watchfiles==0.22.0
    # via uvicorn
 websockets==12.0
--- a/requirements.lock
+++ b/requirements.lock
@ -6,6 +6,8 @@
 #   features: []
 #   all-features: false
 #   with-sources: false
 #   generate-hashes: false
 #   universal: false
 -e file:.
 aiohttp==3.9.5
@ -67,6 +69,7 @@ filelock==3.15.4
    # via huggingface-hub
    # via torch
    # via transformers
    # via triton
 fireworks-ai==0.14.0
    # via langchain-fireworks
 free-proxy==1.1.1
@ -133,6 +136,7 @@ graphviz==0.20.3
    # via scrapegraphai
 greenlet==3.0.3
    # via playwright
    # via sqlalchemy
 groq==0.9.0
    # via langchain-groq
 grpc-google-iam-v1==0.13.1
@ -258,6 +262,34 @@ numpy==1.26.4
    # via sentence-transformers
    # via shapely
    # via transformers
 nvidia-cublas-cu12==12.1.3.1
    # via nvidia-cudnn-cu12
    # via nvidia-cusolver-cu12
    # via torch
 nvidia-cuda-cupti-cu12==12.1.105
    # via torch
 nvidia-cuda-nvrtc-cu12==12.1.105
    # via torch
 nvidia-cuda-runtime-cu12==12.1.105
    # via torch
 nvidia-cudnn-cu12==8.9.2.26
    # via torch
 nvidia-cufft-cu12==11.0.2.54
    # via torch
 nvidia-curand-cu12==10.3.2.106
    # via torch
 nvidia-cusolver-cu12==11.4.5.107
    # via torch
 nvidia-cusparse-cu12==12.1.0.106
    # via nvidia-cusolver-cu12
    # via torch
 nvidia-nccl-cu12==2.19.3
    # via torch
 nvidia-nvjitlink-cu12==12.6.20
    # via nvidia-cusolver-cu12
    # via nvidia-cusparse-cu12
 nvidia-nvtx-cu12==12.1.105
    # via torch
 openai==1.37.0
    # via langchain-fireworks
    # via langchain-openai
@ -408,6 +440,8 @@ tqdm==4.66.4
 transformers==4.43.3
    # via langchain-huggingface
    # via sentence-transformers
 triton==2.2.0
    # via torch
 typing-extensions==4.12.2
    # via anthropic
    # via anyio
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -102,81 +102,150 @@ class FetchNode(BaseNode):
        input_data = [state[key] for key in input_keys]
        source = input_data[0]
-        if (
+        input_type = input_keys[0]
            input_keys[0] == "json_dir"
            or input_keys[0] == "xml_dir"
            or input_keys[0] == "csv_dir"
            or input_keys[0] == "pdf_dir"
            or input_keys[0] == "md_dir"
        ):
            compressed_document = [
                source
            ]
-            state.update({self.output[0]: compressed_document})
+        handlers = {
-            return state
+            "json_dir": self.handle_directory,
-        # handling pdf
+            "xml_dir": self.handle_directory,
-        elif input_keys[0] == "pdf":
+            "csv_dir": self.handle_directory,
-            loader = PyPDFLoader(source)
+            "pdf_dir": self.handle_directory,
-            compressed_document = loader.load()
+            "md_dir": self.handle_directory,
-            state.update({self.output[0]: compressed_document})
+            "pdf": self.handle_file,
-            return state
+            "csv": self.handle_file,
-
+            "json": self.handle_file,
-        elif input_keys[0] == "csv":
+            "xml": self.handle_file,
-            compressed_document = [
+            "md": self.handle_file,
-                Document(
+        }
                    page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
                )
            ]
            state.update({self.output[0]: compressed_document})
            return state
        elif input_keys[0] == "json":
            f = open(source, encoding="utf-8")
            compressed_document = [
                Document(page_content=str(json.load(f)), metadata={"source": "json"})
            ]
            state.update({self.output[0]: compressed_document})
            return state
        elif input_keys[0] == "xml":
            with open(source, "r", encoding="utf-8") as f:
                data = f.read()
            compressed_document = [
                Document(page_content=data, metadata={"source": "xml"})
            ]
            state.update({self.output[0]: compressed_document})
            return state
        elif input_keys[0] == "md":
            with open(source, "r", encoding="utf-8") as f:
                data = f.read()
            compressed_document = [
                Document(page_content=data, metadata={"source": "md"})
            ]
            state.update({self.output[0]: compressed_document})
            return state
        if input_type in handlers:
            return handlers[input_type](state, input_type, source)
        elif self.input == "pdf_dir":
            pass
        elif not source.startswith("http"):
-            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+            return self.handle_local_source(state, source)
-            if not source.strip():
+        else:
-                raise ValueError("No HTML body content found in the local source.")
+            return self.handle_web_source(state, source)
    def handle_directory(self, state, input_type, source):
        """
        Records a directory-type source in the state without reading it.
        The source value is wrapped, unchanged, in a single-element list and
        stored under the node's first output key.
        Parameters:
        state (dict): The current state of the graph; updated in place.
        input_type (str): The type of input being processed. It is accepted so
            that all handlers share one signature, but it is not used here.
        source (str): The source value to store.
        Returns:
        dict: The same state object, now holding the wrapped source.
        """
        output_key = self.output[0]
        state.update({output_key: [source]})
        return state
    def handle_file(self, state, input_type, source):
        """
        Loads a single file and stores the resulting documents in the state.
        The actual reading is delegated to `load_file_content`, and the result
        is written to the state through `update_state`.
        Parameters:
        state (dict): The current state of the graph.
        input_type (str): The type of the input file (e.g., "pdf", "csv", "json", "xml", "md").
        source (str): The path to the source file.
        Returns:
        dict: The updated state with the loaded document(s).
        """
        return self.update_state(state, self.load_file_content(source, input_type))
    def load_file_content(self, source, input_type):
        """
        Loads the content of a file based on its input type.
        Parameters:
        source (str): The path to the source file.
        input_type (str): The type of the input file ("pdf", "csv", "json", "xml" or "md").
        Returns:
        list: For "pdf", whatever `PyPDFLoader(source).load()` returns; for the
            other types, a single-element list holding a Document whose
            metadata "source" entry is the input type.
        Raises:
        ValueError: If `input_type` is not one of the supported types.
        """
        if input_type == "pdf":
            loader = PyPDFLoader(source)
            return loader.load()
        if input_type == "csv":
            # The DataFrame is stored via its string rendering, not as raw CSV text.
            return [Document(page_content=str(pd.read_csv(source)), metadata={"source": "csv"})]
        if input_type == "json":
            with open(source, encoding="utf-8") as f:
                # The parsed object is stored via str(), i.e. its Python repr.
                return [Document(page_content=str(json.load(f)), metadata={"source": "json"})]
        if input_type in ("xml", "md"):
            with open(source, "r", encoding="utf-8") as f:
                data = f.read()
            return [Document(page_content=data, metadata={"source": input_type})]
        # Fail loudly instead of implicitly returning None, which would otherwise
        # be written into the state as if it were a document list.
        raise ValueError(f"Unsupported input type for file loading: {input_type!r}")
    def handle_local_source(self, state, source):
        """
        Handles the local source by fetching HTML content, optionally converting it to Markdown,
        and updating the state.
        Parameters:
        state (dict): The current state of the graph.
        source (str): The HTML content from the local source.
        Returns:
        dict: The updated state with the processed content.
        Raises:
        ValueError: If the source is empty or contains only whitespace.
        """
        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
        if not source.strip():
            raise ValueError("No HTML body content found in the local source.")
        parsed_content = source
        if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
            parsed_content = convert_to_md(source)
        else:
            parsed_content = source
-            if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
+        compressed_document = [
            Document(page_content=parsed_content, metadata={"source": "local_dir"})
        ]
-                parsed_content = convert_to_md(source)
+        return self.update_state(state, compressed_document)
            else:
                parsed_content = source
-            compressed_document = [
+    def handle_web_source(self, state, source):
-                Document(page_content=parsed_content, metadata={"source": "local_dir"})
+        """
-            ]
+        Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown,
        and updating the state.
-        elif self.use_soup:
+        Parameters:
-            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
+        state (dict): The current state of the graph.
        source (str): The URL of the web source to fetch HTML content from.
        Returns:
        dict: The updated state with the processed content.
        Raises:
        ValueError: If the fetched HTML content is empty or contains only whitespace.
        """
        self.logger.info(f"--- (Fetching HTML from: {source}) ---")
        if self.use_soup:
            response = requests.get(source)
            if response.status_code == 200:
                if not response.text.strip():
@ -194,9 +263,7 @@ class FetchNode(BaseNode):
                self.logger.warning(
                    f"Failed to retrieve contents from the webpage at url: {source}"
                )
        else:
            self.logger.info(f"--- (Fetching HTML from: {source}) ---")
            loader_kwargs = {}
            if self.node_config is not None:
@ -219,15 +286,24 @@ class FetchNode(BaseNode):
            if  isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
                parsed_content = convert_to_md(document[0].page_content, input_data[0])
            compressed_document = [
                Document(page_content=parsed_content, metadata={"source": "html file"})
            ]
-        state.update(
+        return self.update_state(state, compressed_document)
            {
                self.output[0]: compressed_document,
            }
        )
    def update_state(self, state, compressed_document):
        """
        Writes the node's output into the graph state.
        Args:
            state (dict): The current state of the graph; updated in place.
            compressed_document (List[Document]): The document content fetched
                                                    by the node.
        Returns:
            dict: The same state object, with the document stored under the
                  node's first output key.
        """
        output_key = self.output[0]
        state.update({output_key: compressed_document})
        return state