mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
Merge pull request #203 from mayurdb/fetchNodeFix
fix: Augment the information getting fetched from a webpage
This commit is contained in:
commit
4e62689eaa
@ -1,5 +1,3 @@
|
||||
## [0.10.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.5...v0.10.0-beta.6) (2024-05-09)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
@ -8,8 +6,10 @@
|
||||
## [0.10.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.4...v0.10.0-beta.5) (2024-05-09)
|
||||
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
|
||||
* fixed bugs for csv and xml ([324e977](https://github.com/VinciGit00/Scrapegraph-ai/commit/324e977b853ecaa55bac4bf86e7cd927f7f43d0d))
|
||||
|
||||
## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)
|
||||
|
||||
@ -44,9 +44,12 @@ Local models
|
||||
|
||||
Remember to have `Ollama <https://ollama.com/>`_ installed on your machine.
|
||||
Remember to pull the models you need for the LLM and for the embeddings, for example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ollama pull llama3
|
||||
ollama pull nomic-embed-text
|
||||
ollama pull mistral
|
||||
|
||||
After that, you can run the following code using only your machine's resources:
|
||||
|
||||
|
||||
@ -8,7 +8,9 @@ from langchain_community.document_loaders import AsyncChromiumLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from .base_node import BaseNode
|
||||
from ..utils.remover import remover
|
||||
from ..utils.cleanup_html import cleanup_html
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class FetchNode(BaseNode):
|
||||
@ -34,6 +36,7 @@ class FetchNode(BaseNode):
|
||||
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
|
||||
super().__init__(node_name, "node", input, output, 1)
|
||||
|
||||
|
||||
self.headless = True if node_config is None else node_config.get(
|
||||
"headless", True)
|
||||
self.verbose = False if node_config is None else node_config.get(
|
||||
@ -94,10 +97,22 @@ class FetchNode(BaseNode):
|
||||
pass
|
||||
|
||||
elif not source.startswith("http"):
|
||||
compressed_document = [Document(page_content=remover(source), metadata={
|
||||
compressed_document = [Document(page_content=cleanup_html(source), metadata={
|
||||
"source": "local_dir"
|
||||
})]
|
||||
|
||||
elif self.useSoup:
|
||||
response = requests.get(source)
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
links = soup.find_all('a')
|
||||
link_urls = []
|
||||
for link in links:
|
||||
if 'href' in link.attrs:
|
||||
link_urls.append(link['href'])
|
||||
compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
|
||||
else:
|
||||
print(f"Failed to retrieve contents from the webpage at url: {url}")
|
||||
else:
|
||||
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
||||
|
||||
@ -114,7 +129,7 @@ class FetchNode(BaseNode):
|
||||
|
||||
document = loader.load()
|
||||
compressed_document = [
|
||||
Document(page_content=remover(str(document[0].page_content)))]
|
||||
Document(page_content=cleanup_html(str(document[0].page_content)))]
|
||||
|
||||
state.update({self.output[0]: compressed_document})
|
||||
return state
|
||||
|
||||
@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
||||
from minify_html import minify
|
||||
|
||||
|
||||
def remover(html_content: str) -> str:
|
||||
def cleanup_html(html_content: str, urls: list = []) -> str:
|
||||
"""
|
||||
Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
|
||||
|
||||
@ -17,7 +17,7 @@ def remover(html_content: str) -> str:
|
||||
|
||||
Example:
|
||||
>>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
|
||||
>>> remover(html_content)
|
||||
>>> cleanup_html(html_content)
|
||||
'Title: Example, Body: <body><p>Hello World!</p></body>'
|
||||
|
||||
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
|
||||
@ -35,9 +35,12 @@ def remover(html_content: str) -> str:
|
||||
|
||||
# Body Extraction (if it exists)
|
||||
body_content = soup.find('body')
|
||||
urls_content = ""
|
||||
if urls:
|
||||
urls_content = f", URLs in page: {urls}"
|
||||
if body_content:
|
||||
# Minify the HTML within the body tag
|
||||
minimized_body = minify(str(body_content))
|
||||
return "Title: " + title + ", Body: " + minimized_body
|
||||
return "Title: " + title + ", Body: " + minimized_body + urls_content
|
||||
|
||||
return "Title: " + title + ", Body: No body content found"
|
||||
return "Title: " + title + ", Body: No body content found" + urls_content
|
||||
Loading…
Reference in New Issue
Block a user