Merge pull request #68 from andrearoota/optimize-remover

feat: apply remove to the document before updating the state
This commit is contained in:
Marco Vinciguerra 2024-04-17 11:34:13 +02:00 committed by GitHub
commit f17f129378
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 17 additions and 7 deletions

View File

@ -72,7 +72,7 @@ class FetchNode(BaseNode):
# if it is a local directory
if not source.startswith("http"):
document = [Document(page_content=remover(source), metadata={
compressedDocument = [Document(page_content=remover(source), metadata={
"source": "local_dir"
})]
@ -80,5 +80,7 @@ class FetchNode(BaseNode):
else:
loader = AsyncHtmlLoader(source)
document = loader.load()
state.update({self.output[0]: document})
compressedDocument = [Document(page_content=remover(str(document)))]
state.update({self.output[0]: compressedDocument})
return state

View File

@ -24,16 +24,24 @@ def remover(html_content: str) -> str:
title_tag = soup.find('title')
title = title_tag.get_text() if title_tag else ""
# Script and Style Tag Removal
# Script and Style Tag Removal
for tag in soup.find_all(['script', 'style']):
tag.extract()
# Body Extraction (if it exists)
body_content = soup.find('body')
if body_content:
# Remove some attributes from tags
""" tagsToRemove = ['style', 'rel', 'width',
'height', 'target', 'media',
'onerror', 'onload', 'onclick']
for tag in body_content.find_all():
for attr in tagsToRemove:
if tag.has_attr(attr):
del tag.attrs[attr] """
# Minify the HTML within the body tag
minimized_body = minify(str(body_content))
return "Title: " + title + ", Body: " + minimized_body
return "Title: " + title + ", Body: " + minimized_body
else:
return "Title: " + title + ", Body: No body content found"
return "Title: " + title + ", Body: No body content found"

View File

@ -1,3 +1,3 @@
# Test section
Regarding the tests for the graphs and nodes folders, a specific repo was created as an example
Regarding the tests for the graphs and nodes folders, a specific repo was created as an example
([link of the repo](https://github.com/VinciGit00/Scrapegrah-ai-website-for-tests)). The test website is hosted [here](https://scrapegrah-ai-website-for-tests.onrender.com).