diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index f31dd1b7..1e990400 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -22,4 +22,4 @@ from .generate_answer_omni_node import GenerateAnswerOmniNode from .merge_generated_scripts import MergeGeneratedScriptsNode from .fetch_screen_node import FetchScreenNode from .generate_answer_from_image_node import GenerateAnswerFromImageNode -from .concat_answers_node import ConcatAnswersNode \ No newline at end of file +from .concat_answers_node import ConcatAnswersNode diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 579d34ea..3e8ed5ac 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -1,14 +1,11 @@ """ ParseNode Module """ -from typing import Tuple, List, Optional -from urllib.parse import urljoin -import re +from typing import List, Optional from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer from langchain_core.documents import Document from .base_node import BaseNode -from ..helpers import default_filters class ParseNode(BaseNode): """ @@ -43,60 +40,6 @@ class ParseNode(BaseNode): self.parse_html = ( True if node_config is None else node_config.get("parse_html", True) ) - self.llm_model = node_config['llm_model'] - self.parse_urls = ( - False if node_config is None else node_config.get("parse_urls", False) - ) - - def _clean_urls(self, urls: List[str]) -> List[str]: - """ - Cleans the URLs extracted from the text. - - Args: - urls (List[str]): The list of URLs to clean. - - Returns: - List[str]: The cleaned URLs. - """ - cleaned_urls = [] - for url in urls: - url = re.sub(r'.*?\]\(', '', url) - - url = url.rstrip(').') - - cleaned_urls.append(url) - - return cleaned_urls - - def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]: - """ - Extracts URLs from the given text. - - Args: - text (str): The text to extract URLs from. - - Returns: - Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs. - """ - if not self.parse_urls: - return [], [] - - image_extensions = default_filters.filter_dict["img_exts"] - image_extension_seq = '|'.join(image_extensions).replace('.','') - url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))') - - all_urls = url_pattern.findall(text) - all_urls = self._clean_urls(all_urls) - - if not source.startswith("http"): - all_urls = [url for url in all_urls if url.startswith("http")] - else: - all_urls = [urljoin(source, url) for url in all_urls] - - images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)] - links = [url for url in all_urls if url not in images] - - return links, images def execute(self, state: dict) -> dict: """ @@ -119,46 +62,33 @@ class ParseNode(BaseNode): input_keys = self.get_input_keys(state) input_data = [state[key] for key in input_keys] - docs_transformed = input_data[0] - source = input_data[1] if self.parse_urls else None - - def count_tokens(text): - from ..utils import token_count - return token_count(text, self.llm_model.model_name) if self.parse_html: - docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0]) + docs_transformed = Html2TextTransformer().transform_documents(input_data[0]) docs_transformed = docs_transformed[0] - link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source) - chunks = chunk(text=docs_transformed.page_content, chunk_size=self.node_config.get("chunk_size", 4096)-250, - token_counter=count_tokens, + token_counter=lambda text: len(text.split()), memoize=False) else: docs_transformed = docs_transformed[0] - link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source) - chunk_size = self.node_config.get("chunk_size", 4096) chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) if isinstance(docs_transformed, Document): chunks = chunk(text=docs_transformed.page_content, chunk_size=chunk_size, - token_counter=count_tokens, + token_counter=lambda text: len(text.split()), memoize=False) else: chunks = chunk(text=docs_transformed, chunk_size=chunk_size, - token_counter=count_tokens, + token_counter=lambda text: len(text.split()), memoize=False) state.update({self.output[0]: chunks}) - if self.parse_urls: - state.update({self.output[1]: link_urls}) - state.update({self.output[2]: img_urls}) return state