diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 39a0b55f..f1260aa5 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -72,7 +72,7 @@ class FetchNode(BaseNode): # if it is a local directory if not source.startswith("http"): - document = [Document(page_content=remover(source), metadata={ + compressedDocument = [Document(page_content=remover(source), metadata={ "source": "local_dir" })] @@ -80,5 +80,7 @@ class FetchNode(BaseNode): else: loader = AsyncHtmlLoader(source) document = loader.load() - state.update({self.output[0]: document}) + compressedDocument = [Document(page_content=remover(str(document)))] + + state.update({self.output[0]: compressedDocument}) return state diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py index 5b4ff83e..75aa2e5d 100644 --- a/scrapegraphai/utils/remover.py +++ b/scrapegraphai/utils/remover.py @@ -24,16 +24,24 @@ def remover(html_content: str) -> str: title_tag = soup.find('title') title = title_tag.get_text() if title_tag else "" - # Script and Style Tag Removal + # Script and Style Tag Removal for tag in soup.find_all(['script', 'style']): tag.extract() # Body Extraction (if it exists) body_content = soup.find('body') if body_content: + # Remove some attributes from tags + """ tagsToRemove = ['style', 'rel', 'width', + 'height', 'target', 'media', + 'onerror', 'onload', 'onclick'] + for tag in body_content.find_all(): + for attr in tagsToRemove: + if tag.has_attr(attr): + del tag.attrs[attr] """ + # Minify the HTML within the body tag minimized_body = minify(str(body_content)) - return "Title: " + title + ", Body: " + minimized_body + return "Title: " + title + ", Body: " + minimized_body else: - return "Title: " + title + ", Body: No body content found" - + return "Title: " + title + ", Body: No body content found" diff --git a/tests/Readme.md b/tests/Readme.md index 2c9dbe1d..1e2a9bf1 100644 --- a/tests/Readme.md +++ b/tests/Readme.md @@ -1,3 +1,3 @@ # Test section -Regarding the tests for the folder graphs and nodes it was created a specific repo as a example +Regarding the tests for the folder graphs and nodes it was created a specific repo as a example ([link of the repo](https://github.com/VinciGit00/Scrapegrah-ai-website-for-tests)). The test website is hosted [here](https://scrapegrah-ai-website-for-tests.onrender.com). \ No newline at end of file