mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
This commit is contained in:
parent
86bf4f2402
commit
e2af2326f6
@ -30,6 +30,8 @@ anyio==4.4.0
|
|||||||
astroid==3.2.4
|
astroid==3.2.4
|
||||||
# via pylint
|
# via pylint
|
||||||
async-timeout==4.0.3
|
async-timeout==4.0.3
|
||||||
|
# via aiohttp
|
||||||
|
# via langchain
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
attrs==24.2.0
|
attrs==24.2.0
|
||||||
# via aiohttp
|
# via aiohttp
|
||||||
@ -78,6 +80,9 @@ distro==1.9.0
|
|||||||
# via openai
|
# via openai
|
||||||
docutils==0.19
|
docutils==0.19
|
||||||
# via sphinx
|
# via sphinx
|
||||||
|
exceptiongroup==1.2.2
|
||||||
|
# via anyio
|
||||||
|
# via pytest
|
||||||
fastapi==0.112.0
|
fastapi==0.112.0
|
||||||
# via burr
|
# via burr
|
||||||
fastapi-pagination==0.12.26
|
fastapi-pagination==0.12.26
|
||||||
@ -131,7 +136,6 @@ graphviz==0.20.3
|
|||||||
# via burr
|
# via burr
|
||||||
greenlet==3.0.3
|
greenlet==3.0.3
|
||||||
# via playwright
|
# via playwright
|
||||||
# via sqlalchemy
|
|
||||||
grpcio==1.65.4
|
grpcio==1.65.4
|
||||||
# via google-api-core
|
# via google-api-core
|
||||||
# via grpcio-status
|
# via grpcio-status
|
||||||
@ -500,6 +504,9 @@ tokenizers==0.19.1
|
|||||||
# via transformers
|
# via transformers
|
||||||
toml==0.10.2
|
toml==0.10.2
|
||||||
# via streamlit
|
# via streamlit
|
||||||
|
tomli==2.1.0
|
||||||
|
# via pylint
|
||||||
|
# via pytest
|
||||||
tomlkit==0.13.0
|
tomlkit==0.13.0
|
||||||
# via pylint
|
# via pylint
|
||||||
tornado==6.4.1
|
tornado==6.4.1
|
||||||
@ -517,6 +524,8 @@ transformers==4.44.2
|
|||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
typing-extensions==4.12.2
|
typing-extensions==4.12.2
|
||||||
# via altair
|
# via altair
|
||||||
|
# via anyio
|
||||||
|
# via astroid
|
||||||
# via fastapi
|
# via fastapi
|
||||||
# via fastapi-pagination
|
# via fastapi-pagination
|
||||||
# via google-generativeai
|
# via google-generativeai
|
||||||
@ -531,6 +540,7 @@ typing-extensions==4.12.2
|
|||||||
# via sqlalchemy
|
# via sqlalchemy
|
||||||
# via streamlit
|
# via streamlit
|
||||||
# via typing-inspect
|
# via typing-inspect
|
||||||
|
# via uvicorn
|
||||||
typing-inspect==0.9.0
|
typing-inspect==0.9.0
|
||||||
# via dataclasses-json
|
# via dataclasses-json
|
||||||
# via sf-hamilton
|
# via sf-hamilton
|
||||||
|
|||||||
@ -19,6 +19,8 @@ anyio==4.4.0
|
|||||||
# via httpx
|
# via httpx
|
||||||
# via openai
|
# via openai
|
||||||
async-timeout==4.0.3
|
async-timeout==4.0.3
|
||||||
|
# via aiohttp
|
||||||
|
# via langchain
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
attrs==23.2.0
|
attrs==23.2.0
|
||||||
# via aiohttp
|
# via aiohttp
|
||||||
@ -48,6 +50,8 @@ dill==0.3.8
|
|||||||
# via multiprocess
|
# via multiprocess
|
||||||
distro==1.9.0
|
distro==1.9.0
|
||||||
# via openai
|
# via openai
|
||||||
|
exceptiongroup==1.2.2
|
||||||
|
# via anyio
|
||||||
fastembed==0.3.6
|
fastembed==0.3.6
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
filelock==3.15.4
|
filelock==3.15.4
|
||||||
@ -87,7 +91,6 @@ googlesearch-python==1.2.5
|
|||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
greenlet==3.0.3
|
greenlet==3.0.3
|
||||||
# via playwright
|
# via playwright
|
||||||
# via sqlalchemy
|
|
||||||
grpcio==1.65.1
|
grpcio==1.65.1
|
||||||
# via google-api-core
|
# via google-api-core
|
||||||
# via grpcio-status
|
# via grpcio-status
|
||||||
@ -368,6 +371,7 @@ tqdm==4.66.4
|
|||||||
transformers==4.44.2
|
transformers==4.44.2
|
||||||
# via scrapegraphai
|
# via scrapegraphai
|
||||||
typing-extensions==4.12.2
|
typing-extensions==4.12.2
|
||||||
|
# via anyio
|
||||||
# via google-generativeai
|
# via google-generativeai
|
||||||
# via huggingface-hub
|
# via huggingface-hub
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
|
|||||||
@ -80,28 +80,30 @@ class FetchNode(BaseNode):
|
|||||||
None if node_config is None else node_config.get("scrape_do", None)
|
None if node_config is None else node_config.get("scrape_do", None)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def is_valid_url(self, source: str) -> bool:
    """
    Validates if the source string is a valid http(s) URL using regex.

    The whole string must match: an ``http://`` or ``https://`` scheme
    followed by at least two non-whitespace characters, the first of which
    is not one of ``/ $ . ? #``.

    Parameters:
        source (str): The URL string to validate

    Returns:
        bool: Always True when the URL is accepted. The method never
        returns False; rejection is signalled by the exception below.

    Raises:
        ValueError: If the URL is invalid
    """
    import re

    # Two deliberate tightenings over a plain `re.match(r'^...$')`:
    #   * fullmatch: "$" also matches just before a trailing newline, which
    #     would let "http://host\n" through.
    #   * "\S" instead of ".": "." matches spaces/tabs, which would let
    #     "http://a b" through although the rest of the pattern forbids
    #     whitespace.
    url_pattern = r'https?://[^\s/$.?#]\S+'
    if re.fullmatch(url_pattern, source) is None:
        raise ValueError(f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain.")
    return True
|
||||||
|
|
||||||
def execute(self, state):
|
def execute(self, state):
|
||||||
"""
|
"""
|
||||||
Executes the node's logic to fetch HTML content from a specified URL and
|
Executes the node's logic to fetch HTML content from a specified URL and
|
||||||
update the state with this content.
|
update the state with this content.
|
||||||
|
|
||||||
Args:
|
|
||||||
state (dict): The current state of the graph. The input keys will be used
|
|
||||||
to fetch the correct data types from the state.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: The updated state with a new output key containing the fetched HTML content.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
KeyError: If the input key is not found in the state, indicating that the
|
|
||||||
necessary information to perform the operation is missing.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.logger.info(f"--- Executing {self.node_name} Node ---")
|
self.logger.info(f"--- Executing {self.node_name} Node ---")
|
||||||
|
|
||||||
# Interpret input keys based on the provided input expression
|
|
||||||
input_keys = self.get_input_keys(state)
|
input_keys = self.get_input_keys(state)
|
||||||
# Fetching data from the state based on the input keys
|
|
||||||
input_data = [state[key] for key in input_keys]
|
input_data = [state[key] for key in input_keys]
|
||||||
|
|
||||||
source = input_data[0]
|
source = input_data[0]
|
||||||
@ -124,10 +126,16 @@ class FetchNode(BaseNode):
|
|||||||
return handlers[input_type](state, input_type, source)
|
return handlers[input_type](state, input_type, source)
|
||||||
elif self.input == "pdf_dir":
|
elif self.input == "pdf_dir":
|
||||||
return state
|
return state
|
||||||
elif not source.startswith("http") and not source.startswith("www"):
|
|
||||||
return self.handle_local_source(state, source)
|
# For web sources, validate URL before proceeding
|
||||||
else:
|
try:
|
||||||
return self.handle_web_source(state, source)
|
if self.is_valid_url(source):
|
||||||
|
return self.handle_web_source(state, source)
|
||||||
|
except ValueError as e:
|
||||||
|
# Re-raise the exception from is_valid_url
|
||||||
|
raise
|
||||||
|
|
||||||
|
return self.handle_local_source(state, source)
|
||||||
|
|
||||||
def handle_directory(self, state, input_type, source):
|
def handle_directory(self, state, input_type, source):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user