fix: fetch node regex

2026-07-01 21:00:48 +08:00 · 2024-11-21 09:00:45 +01:00 · 2024-11-21 09:00:45 +01:00 · e2af2326f6
commit e2af2326f6
parent 86bf4f2402
3 changed files with 42 additions and 20 deletions
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@ -30,6 +30,8 @@ anyio==4.4.0
 astroid==3.2.4
    # via pylint
 async-timeout==4.0.3
    # via aiohttp
    # via langchain
    # via scrapegraphai
 attrs==24.2.0
    # via aiohttp
@ -78,6 +80,9 @@ distro==1.9.0
    # via openai
 docutils==0.19
    # via sphinx
 exceptiongroup==1.2.2
    # via anyio
    # via pytest
 fastapi==0.112.0
    # via burr
 fastapi-pagination==0.12.26
@ -131,7 +136,6 @@ graphviz==0.20.3
    # via burr
 greenlet==3.0.3
    # via playwright
    # via sqlalchemy
 grpcio==1.65.4
    # via google-api-core
    # via grpcio-status
@ -500,6 +504,9 @@ tokenizers==0.19.1
    # via transformers
 toml==0.10.2
    # via streamlit
 tomli==2.1.0
    # via pylint
    # via pytest
 tomlkit==0.13.0
    # via pylint
 tornado==6.4.1
@ -517,6 +524,8 @@ transformers==4.44.2
    # via scrapegraphai
 typing-extensions==4.12.2
    # via altair
    # via anyio
    # via astroid
    # via fastapi
    # via fastapi-pagination
    # via google-generativeai
@ -531,6 +540,7 @@ typing-extensions==4.12.2
    # via sqlalchemy
    # via streamlit
    # via typing-inspect
    # via uvicorn
 typing-inspect==0.9.0
    # via dataclasses-json
    # via sf-hamilton
--- a/requirements.lock
+++ b/requirements.lock
@ -19,6 +19,8 @@ anyio==4.4.0
    # via httpx
    # via openai
 async-timeout==4.0.3
    # via aiohttp
    # via langchain
    # via scrapegraphai
 attrs==23.2.0
    # via aiohttp
@ -48,6 +50,8 @@ dill==0.3.8
    # via multiprocess
 distro==1.9.0
    # via openai
 exceptiongroup==1.2.2
    # via anyio
 fastembed==0.3.6
    # via scrapegraphai
 filelock==3.15.4
@ -87,7 +91,6 @@ googlesearch-python==1.2.5
    # via scrapegraphai
 greenlet==3.0.3
    # via playwright
    # via sqlalchemy
 grpcio==1.65.1
    # via google-api-core
    # via grpcio-status
@ -368,6 +371,7 @@ tqdm==4.66.4
 transformers==4.44.2
    # via scrapegraphai
 typing-extensions==4.12.2
    # via anyio
    # via google-generativeai
    # via huggingface-hub
    # via langchain-core
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -80,28 +80,30 @@ class FetchNode(BaseNode):
            None if node_config is None else node_config.get("scrape_do", None)
        )
    def is_valid_url(self, source: str) -> bool:
        """
        Validates that the source string is an http(s) URL using a regex.

        The whole string must match: it has to start with ``http://`` or
        ``https://``, be followed by at least two characters of host, and
        contain no whitespace anywhere (including a trailing newline).

        Parameters:
        source (str): The URL string to validate

        Returns:
        bool: Always True when the URL is valid. The method never returns
        False; an invalid URL is reported by raising instead.

        Raises:
        ValueError: If the URL is invalid or source is not a string
        """
        import re

        url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
        # fullmatch (rather than match) is required here: with match(), the
        # trailing "$" also matches just before a final newline, so a value
        # such as "https://example.com\n" would be accepted as valid.
        # Non-string input is rejected up front so callers only ever have to
        # handle the documented ValueError, not a TypeError from the re module.
        if not isinstance(source, str) or re.fullmatch(url_pattern, source) is None:
            raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
        return True
    def execute(self, state):
        """
        Executes the node's logic to fetch HTML content from a specified URL and
        update the state with this content.
        Args:
            state (dict): The current state of the graph. The input keys will be used
                            to fetch the correct data types from the state.
        Returns:
            dict: The updated state with a new output key containing the fetched HTML content.
        Raises:
            KeyError: If the input key is not found in the state, indicating that the
                    necessary information to perform the operation is missing.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")
        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)
        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]
        source = input_data[0]
@ -124,10 +126,16 @@ class FetchNode(BaseNode):
            return handlers[input_type](state, input_type, source)
        elif self.input == "pdf_dir":
            return state
-        elif not source.startswith("http") and not source.startswith("www"):
+        
-            return self.handle_local_source(state, source)
+        # For web sources, validate URL before proceeding
-        else:
+        try:
-            return self.handle_web_source(state, source)
+            if self.is_valid_url(source):
                return self.handle_web_source(state, source)
        except ValueError as e:
            # Re-raise the exception from is_valid_url
            raise
        return self.handle_local_source(state, source)
    def handle_directory(self, state, input_type, source):
        """