fix: fetch node regex
Some checks failed
/ build (push) Has been cancelled

This commit is contained in:
Marco Vinciguerra 2024-11-21 09:00:45 +01:00
parent 86bf4f2402
commit e2af2326f6
3 changed files with 42 additions and 20 deletions

View File

@ -30,6 +30,8 @@ anyio==4.4.0
astroid==3.2.4 astroid==3.2.4
# via pylint # via pylint
async-timeout==4.0.3 async-timeout==4.0.3
# via aiohttp
# via langchain
# via scrapegraphai # via scrapegraphai
attrs==24.2.0 attrs==24.2.0
# via aiohttp # via aiohttp
@ -78,6 +80,9 @@ distro==1.9.0
# via openai # via openai
docutils==0.19 docutils==0.19
# via sphinx # via sphinx
exceptiongroup==1.2.2
# via anyio
# via pytest
fastapi==0.112.0 fastapi==0.112.0
# via burr # via burr
fastapi-pagination==0.12.26 fastapi-pagination==0.12.26
@ -131,7 +136,6 @@ graphviz==0.20.3
# via burr # via burr
greenlet==3.0.3 greenlet==3.0.3
# via playwright # via playwright
# via sqlalchemy
grpcio==1.65.4 grpcio==1.65.4
# via google-api-core # via google-api-core
# via grpcio-status # via grpcio-status
@ -500,6 +504,9 @@ tokenizers==0.19.1
# via transformers # via transformers
toml==0.10.2 toml==0.10.2
# via streamlit # via streamlit
tomli==2.1.0
# via pylint
# via pytest
tomlkit==0.13.0 tomlkit==0.13.0
# via pylint # via pylint
tornado==6.4.1 tornado==6.4.1
@ -517,6 +524,8 @@ transformers==4.44.2
# via scrapegraphai # via scrapegraphai
typing-extensions==4.12.2 typing-extensions==4.12.2
# via altair # via altair
# via anyio
# via astroid
# via fastapi # via fastapi
# via fastapi-pagination # via fastapi-pagination
# via google-generativeai # via google-generativeai
@ -531,6 +540,7 @@ typing-extensions==4.12.2
# via sqlalchemy # via sqlalchemy
# via streamlit # via streamlit
# via typing-inspect # via typing-inspect
# via uvicorn
typing-inspect==0.9.0 typing-inspect==0.9.0
# via dataclasses-json # via dataclasses-json
# via sf-hamilton # via sf-hamilton

View File

@ -19,6 +19,8 @@ anyio==4.4.0
# via httpx # via httpx
# via openai # via openai
async-timeout==4.0.3 async-timeout==4.0.3
# via aiohttp
# via langchain
# via scrapegraphai # via scrapegraphai
attrs==23.2.0 attrs==23.2.0
# via aiohttp # via aiohttp
@ -48,6 +50,8 @@ dill==0.3.8
# via multiprocess # via multiprocess
distro==1.9.0 distro==1.9.0
# via openai # via openai
exceptiongroup==1.2.2
# via anyio
fastembed==0.3.6 fastembed==0.3.6
# via scrapegraphai # via scrapegraphai
filelock==3.15.4 filelock==3.15.4
@ -87,7 +91,6 @@ googlesearch-python==1.2.5
# via scrapegraphai # via scrapegraphai
greenlet==3.0.3 greenlet==3.0.3
# via playwright # via playwright
# via sqlalchemy
grpcio==1.65.1 grpcio==1.65.1
# via google-api-core # via google-api-core
# via grpcio-status # via grpcio-status
@ -368,6 +371,7 @@ tqdm==4.66.4
transformers==4.44.2 transformers==4.44.2
# via scrapegraphai # via scrapegraphai
typing-extensions==4.12.2 typing-extensions==4.12.2
# via anyio
# via google-generativeai # via google-generativeai
# via huggingface-hub # via huggingface-hub
# via langchain-core # via langchain-core

View File

@ -80,28 +80,30 @@ class FetchNode(BaseNode):
None if node_config is None else node_config.get("scrape_do", None) None if node_config is None else node_config.get("scrape_do", None)
) )
def is_valid_url(self, source: str) -> bool:
    """
    Validate that ``source`` is an absolute http(s) URL.

    The entire string must match the pattern: an ``http://`` or
    ``https://`` scheme, a first character after the scheme that is not
    whitespace or one of ``/ $ . ? #``, one further character (anything
    except a newline), and then only non-whitespace characters up to the
    end of the string.

    Parameters:
        source (str): The URL string to validate.

    Returns:
        bool: Always True when the URL is accepted; rejection is signalled
        by the exception, never by returning False.

    Raises:
        ValueError: If ``source`` is not a string or does not match the
        expected URL format.
    """
    import re

    url_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
    # fullmatch rather than match: with re.match the trailing `$` also
    # matches just before a final newline, so "http://host\n" would pass.
    # The isinstance guard turns re's TypeError for non-string input into
    # the documented ValueError.
    if not isinstance(source, str) or re.fullmatch(url_pattern, source) is None:
        raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
    return True
def execute(self, state): def execute(self, state):
""" """
Executes the node's logic to fetch HTML content from a specified URL and Executes the node's logic to fetch HTML content from a specified URL and
update the state with this content. update the state with this content.
Args:
state (dict): The current state of the graph. The input keys will be used
to fetch the correct data types from the state.
Returns:
dict: The updated state with a new output key containing the fetched HTML content.
Raises:
KeyError: If the input key is not found in the state, indicating that the
necessary information to perform the operation is missing.
""" """
self.logger.info(f"--- Executing {self.node_name} Node ---") self.logger.info(f"--- Executing {self.node_name} Node ---")
# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state) input_keys = self.get_input_keys(state)
# Fetching data from the state based on the input keys
input_data = [state[key] for key in input_keys] input_data = [state[key] for key in input_keys]
source = input_data[0] source = input_data[0]
@ -124,10 +126,16 @@ class FetchNode(BaseNode):
return handlers[input_type](state, input_type, source) return handlers[input_type](state, input_type, source)
elif self.input == "pdf_dir": elif self.input == "pdf_dir":
return state return state
elif not source.startswith("http") and not source.startswith("www"):
return self.handle_local_source(state, source) # For web sources, validate URL before proceeding
else: try:
return self.handle_web_source(state, source) if self.is_valid_url(source):
return self.handle_web_source(state, source)
except ValueError as e:
# Re-raise the exception from is_valid_url
raise
return self.handle_local_source(state, source)
def handle_directory(self, state, input_type, source): def handle_directory(self, state, input_type, source):
""" """