mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge pull request #68 from andrearoota/optimize-remover
feat: apply remove to the document before updating the state
This commit is contained in:
commit
f17f129378
@ -72,7 +72,7 @@ class FetchNode(BaseNode):
|
||||
|
||||
# if it is a local directory
|
||||
if not source.startswith("http"):
|
||||
document = [Document(page_content=remover(source), metadata={
|
||||
compressedDocument = [Document(page_content=remover(source), metadata={
|
||||
"source": "local_dir"
|
||||
})]
|
||||
|
||||
@ -80,5 +80,7 @@ class FetchNode(BaseNode):
|
||||
else:
|
||||
loader = AsyncHtmlLoader(source)
|
||||
document = loader.load()
|
||||
state.update({self.output[0]: document})
|
||||
compressedDocument = [Document(page_content=remover(str(document)))]
|
||||
|
||||
state.update({self.output[0]: compressedDocument})
|
||||
return state
|
||||
|
||||
@ -24,16 +24,24 @@ def remover(html_content: str) -> str:
|
||||
title_tag = soup.find('title')
|
||||
title = title_tag.get_text() if title_tag else ""
|
||||
|
||||
# Script and Style Tag Removal
|
||||
# Script and Style Tag Removal
|
||||
for tag in soup.find_all(['script', 'style']):
|
||||
tag.extract()
|
||||
|
||||
# Body Extraction (if it exists)
|
||||
body_content = soup.find('body')
|
||||
if body_content:
|
||||
# Remove some attributes from tags
|
||||
""" tagsToRemove = ['style', 'rel', 'width',
|
||||
'height', 'target', 'media',
|
||||
'onerror', 'onload', 'onclick']
|
||||
for tag in body_content.find_all():
|
||||
for attr in tagsToRemove:
|
||||
if tag.has_attr(attr):
|
||||
del tag.attrs[attr] """
|
||||
|
||||
# Minify the HTML within the body tag
|
||||
minimized_body = minify(str(body_content))
|
||||
return "Title: " + title + ", Body: " + minimized_body
|
||||
return "Title: " + title + ", Body: " + minimized_body
|
||||
else:
|
||||
return "Title: " + title + ", Body: No body content found"
|
||||
|
||||
return "Title: " + title + ", Body: No body content found"
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# Test section
|
||||
For the tests of the graphs and nodes folders, a dedicated repo was created as an example
|
||||
For the tests of the graphs and nodes folders, a dedicated repo was created as an example
|
||||
([link to the repo](https://github.com/VinciGit00/Scrapegrah-ai-website-for-tests)). The test website is hosted [here](https://scrapegrah-ai-website-for-tests.onrender.com).
|
||||
Loading…
Reference in New Issue
Block a user