mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
fix: Parse Node scraping link and img urls allowing OmniScraper to work
This commit is contained in:
parent
13efd4e3a4
commit
66a3b6d6a3
@ -65,16 +65,17 @@ class OmniScraperGraph(AbstractGraph):
|
||||
"""
|
||||
fetch_node = FetchNode(
|
||||
input="url | local_dir",
|
||||
output=["doc", "link_urls", "img_urls"],
|
||||
output=["doc"],
|
||||
node_config={
|
||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||
}
|
||||
)
|
||||
parse_node = ParseNode(
|
||||
input="doc",
|
||||
output=["parsed_doc"],
|
||||
input="doc & (url | local_dir)",
|
||||
output=["parsed_doc", "link_urls", "img_urls"],
|
||||
node_config={
|
||||
"chunk_size": self.model_token,
|
||||
"parse_urls": True,
|
||||
"llm_model": self.llm_model
|
||||
}
|
||||
)
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
"""
|
||||
ImageToTextNode Module
|
||||
"""
|
||||
import traceback
|
||||
from typing import List, Optional
|
||||
from ..utils.logging import get_logger
|
||||
from .base_node import BaseNode
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
class ImageToTextNode(BaseNode):
|
||||
"""
|
||||
@ -58,16 +60,25 @@ class ImageToTextNode(BaseNode):
|
||||
if isinstance(urls, str):
|
||||
urls = [urls]
|
||||
elif len(urls) == 0:
|
||||
return state
|
||||
return state.update({self.output[0]: []})
|
||||
|
||||
# Skip the image-to-text conversion
|
||||
if self.max_images < 1:
|
||||
return state
|
||||
|
||||
return state.update({self.output[0]: []})
|
||||
|
||||
img_desc = []
|
||||
for url in urls[: self.max_images]:
|
||||
try:
|
||||
text_answer = self.llm_model.run(url)
|
||||
message = HumanMessage(
|
||||
content=[
|
||||
{"type": "text", "text": "Describe the provided image."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": url},
|
||||
},
|
||||
]
|
||||
)
|
||||
text_answer = self.llm_model.invoke([message]).content
|
||||
except Exception as e:
|
||||
text_answer = f"Error: incompatible image format or model failure."
|
||||
img_desc.append(text_answer)
|
||||
|
||||
@ -1,11 +1,15 @@
|
||||
"""
|
||||
ParseNode Module
|
||||
"""
|
||||
from typing import List, Optional
|
||||
from typing import Tuple, List, Optional
|
||||
from urllib.parse import urljoin
|
||||
from semchunk import chunk
|
||||
from langchain_community.document_transformers import Html2TextTransformer
|
||||
from langchain_core.documents import Document
|
||||
from .base_node import BaseNode
|
||||
from ..helpers import default_filters
|
||||
|
||||
import re
|
||||
|
||||
class ParseNode(BaseNode):
|
||||
"""
|
||||
@ -41,6 +45,66 @@ class ParseNode(BaseNode):
|
||||
True if node_config is None else node_config.get("parse_html", True)
|
||||
)
|
||||
self.llm_model = node_config['llm_model']
|
||||
self.parse_urls = (
|
||||
False if node_config is None else node_config.get("parse_urls", False)
|
||||
)
|
||||
|
||||
def _clean_urls(self, urls: List[str]) -> List[str]:
|
||||
"""
|
||||
Cleans the URLs extracted from the text.
|
||||
|
||||
Args:
|
||||
urls (List[str]): The list of URLs to clean.
|
||||
|
||||
Returns:
|
||||
List[str]: The cleaned URLs.
|
||||
"""
|
||||
cleaned_urls = []
|
||||
for url in urls:
|
||||
# Remove any leading 'thumbnail](' or similar patterns
|
||||
url = re.sub(r'.*?\]\(', '', url)
|
||||
|
||||
# Remove any trailing parentheses or brackets
|
||||
url = url.rstrip(').')
|
||||
|
||||
cleaned_urls.append(url)
|
||||
|
||||
return cleaned_urls
|
||||
|
||||
def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
    """
    Extracts link and image URLs from the given text.

    Args:
        text (str): The text to extract URLs from.
        source (str): The location the text was loaded from. When it starts
            with "http", every extracted URL is resolved against it with
            urljoin; otherwise only URLs that already start with "http"
            are kept.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
    """
    # Return empty lists if the URLs are not to be parsed
    if not self.parse_urls:
        return [], []

    # A tuple lets str.endswith test every extension in a single call.
    image_extensions = tuple(default_filters.filter_dict["img_exts"])

    # Build the regex alternation from the extensions without their dots,
    # escaping each one so a regex metacharacter cannot corrupt the pattern.
    image_extension_seq = '|'.join(
        re.escape(ext.replace('.', '')) for ext in image_extensions
    )

    # Matches absolute http(s) URLs, or any token ending in an image extension
    url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')

    # Find all URLs in the string
    all_urls = self._clean_urls(url_pattern.findall(text))

    if isinstance(source, str) and source.startswith("http"):
        # Resolve every URL against the source URL
        all_urls = [urljoin(source, url) for url in all_urls]
    else:
        # Without an http source (local path or missing source), incomplete
        # URLs cannot be resolved, so they are dropped.
        all_urls = [url for url in all_urls if url.startswith("http")]

    # Single pass: a URL is an image exactly when it ends with an image
    # extension, otherwise it is a link. Order and duplicates are preserved.
    links: List[str] = []
    images: List[str] = []
    for url in all_urls:
        if url.endswith(image_extensions):
            images.append(url)
        else:
            links.append(url)

    return links, images
|
||||
|
||||
def execute(self, state: dict) -> dict:
|
||||
"""
|
||||
@ -63,7 +127,9 @@ class ParseNode(BaseNode):
|
||||
input_keys = self.get_input_keys(state)
|
||||
|
||||
input_data = [state[key] for key in input_keys]
|
||||
|
||||
docs_transformed = input_data[0]
|
||||
source = input_data[1] if self.parse_urls else None
|
||||
|
||||
def count_tokens(text):
|
||||
from ..utils import token_count
|
||||
@ -73,12 +139,17 @@ class ParseNode(BaseNode):
|
||||
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
|
||||
|
||||
chunks = chunk(text=docs_transformed.page_content,
|
||||
chunk_size=self.node_config.get("chunk_size", 4096)-250,
|
||||
token_counter=count_tokens,
|
||||
memoize=False)
|
||||
else:
|
||||
docs_transformed = docs_transformed[0]
|
||||
|
||||
link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
|
||||
|
||||
chunk_size = self.node_config.get("chunk_size", 4096)
|
||||
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
|
||||
|
||||
@ -94,4 +165,8 @@ class ParseNode(BaseNode):
|
||||
memoize=False)
|
||||
|
||||
state.update({self.output[0]: chunks})
|
||||
if self.parse_urls:
|
||||
state.update({self.output[1]: link_urls})
|
||||
state.update({self.output[2]: img_urls})
|
||||
|
||||
return state
|
||||
|
||||
Loading…
Reference in New Issue
Block a user