Merge pull request #203 from mayurdb/fetchNodeFix

fix: Augment the information getting fetched from a webpage
2026-06-23 21:00:30 +08:00 · 2024-05-10 11:14:01 +02:00 · 2024-05-10 11:14:01 +02:00 · 4e62689eaa
commit 4e62689eaa
parent 460d292af2 99adc9799f
4 changed files with 30 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,3 @@
-## [0.10.0-beta.6](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.5...v0.10.0-beta.6) (2024-05-09)
-

 ### Bug Fixes

@ -8,8 +6,10 @@
 ## [0.10.0-beta.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.4...v0.10.0-beta.5) (2024-05-09)


+
 ### Bug Fixes

+
 * fixed bugs for csv and xml ([324e977](https://github.com/VinciGit00/Scrapegraph-ai/commit/324e977b853ecaa55bac4bf86e7cd927f7f43d0d))

 ## [0.10.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.10.0-beta.3...v0.10.0-beta.4) (2024-05-09)
--- a/docs/source/getting_started/examples.rst
+++ b/docs/source/getting_started/examples.rst
@ -44,9 +44,12 @@ Local models

 Remember to have installed in your pc ollama `ollama <https://ollama.com/>`
 Remember to pull the right model for LLM and for the embeddings, like:
+
 .. code-block:: bash

   ollama pull llama3
+   ollama pull nomic-embed-text
+   ollama pull mistral

 After that, you can run the following code, using only your machine resources brum brum brum:

--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@ -8,7 +8,9 @@ from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from langchain_community.document_loaders import PyPDFLoader
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup


 class FetchNode(BaseNode):
@ -34,6 +36,7 @@ class FetchNode(BaseNode):
    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
        super().__init__(node_name, "node", input, output, 1)

+
        self.headless = True if node_config is None else node_config.get(
            "headless", True)
        self.verbose = False if node_config is None else node_config.get(
@ -94,10 +97,22 @@ class FetchNode(BaseNode):
            pass

        elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                "source": "local_dir"
            })]

+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {url}")
        else:
            if self.node_config is not None and self.node_config.get("endpoint") is not None:

@ -114,7 +129,7 @@ class FetchNode(BaseNode):

            document = loader.load()
            compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]

        state.update({self.output[0]: compressed_document})
        return state
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 from minify_html import minify


-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, urls: list = []) -> str:
    """
    Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.

@ -17,7 +17,7 @@ def remover(html_content: str) -> str:

    Example:
        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-        >>> remover(html_content)
+        >>> cleanup_html(html_content)
        'Title: Example, Body: <body><p>Hello World!</p></body>'

    This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@ -35,9 +35,12 @@ def remover(html_content: str) -> str:

    # Body Extraction (if it exists)
    body_content = soup.find('body')
+    urls_content = ""
+    if urls:
+        urls_content = f", URLs in page: {urls}"
    if body_content:
        # Minify the HTML within the body tag
        minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + urls_content

-    return "Title: " + title + ", Body: No body content found"
+    return "Title: " + title + ", Body: No body content found" + urls_content