Merge branch 'pre/beta' into add-openai-supported-model-gpt-4o-mini

2026-07-04 21:00:36 +08:00 · 2024-07-19 10:08:41 +02:00 · 2024-07-19 10:08:41 +02:00 · 3a9d556e1b
commit 3a9d556e1b
parent 4f079cfc74 a3d0aacff5
38 changed files with 778 additions and 277 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,47 +1,5 @@
-## [1.9.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0...v1.9.1) (2024-07-12)


-### Bug Fixes
-
-* solve a burr integration ([881290b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/881290b5066b39c505532656671fbf65f8fc312c))
-
-## [1.9.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.8.0...v1.9.0) (2024-07-09)
-
-
-### Features
-
-* add fireworks integration ([df0e310](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df0e3108299071b849d7e055bd11d72764d24f08))
-* add integration for infos ([3bf5f57](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3bf5f570a8f8e1b037a7ad3c9f583261a1536421))
-* add integrations for markdown files ([2804434](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2804434a9ee12c52ae8956a88b1778a4dd3ec32f))
-* add vertexai integration ([119514b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/119514bdfc2a16dfb8918b0c34ae7cc43a01384c))
-* improve md prompt recognition ([5fe694b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe694b6b4545a5091d16110318b992acfca4f58))
-
-
-### Bug Fixes
-
-* add test ([3a537ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3a537eec6fef1743924a9aa5cef0ba2f8d44bf11))
-* fix pyproject.toml ([7570bf8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7570bf8294e49bc54ec9e296aaadb763873390ca))
-
-
-### chore
-
-* **Docker:** fix port number ([afeb81f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/afeb81f77a884799192d79dcac85666190fb1c9d))
-* **CI:** fix pylint workflow ([583c321](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/583c32106e827f50235d8fc69511652fd4b07a35))
-* **rye:** rebuild lockfiles ([27c2dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/27c2dd23517a7e4b14fafd00320a8b81f73145dc))
-
-
-### Docs
-
-* **roadmap:** fix urls ([14faba4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/14faba4f00dd9f947f8dc5e0b51be49ea684179f))
-* **roadmap:** next steps ([3e644f4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e644f498f05eb505fbd4e94b144c81567569aaa))
-
-
-### CI
-
-* **release:** 1.8.1-beta.1 [skip ci] ([8f9f96f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f9f96f7e7ff41d2fff5bbbf18bf4fc85d4f98b3))
-* **release:** 1.9.0-beta.1 [skip ci] ([146432d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/146432d476f775510441b062935adc47190141e2))
-* **release:** 1.9.0-beta.2 [skip ci] ([5cb5fbf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5cb5fbf5503eec9b34a6691eb993716cc9a821d6))
-
 ## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-07-05)


--- a/examples/anthropic/search_link_graph_haiku.py
+++ b/examples/anthropic/search_link_graph_haiku.py
@ -0,0 +1,57 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+llm_model_instance = AzureChatOpenAI(
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
+)
+
+embedder_model_instance = AzureOpenAIEmbeddings(
+    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
+    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
+)
+
+# ************************************************
+# Build the graph configuration from the model instances
+# ************************************************
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/azure/search_link_graph_azure.py
+++ b/examples/azure/search_link_graph_azure.py
@ -0,0 +1,52 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+     "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "headless": False
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/bedrock/search_link_graph_bedrock.py
+++ b/examples/bedrock/search_link_graph_bedrock.py
@ -0,0 +1,45 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "client": "client_name",
+        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        "temperature": 0.0
+    },
+    "embeddings": {
+        "model": "bedrock/cohere.embed-multilingual-v3"
+    }
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/deepseek/search_link_graph_deepseek.py
+++ b/examples/deepseek/search_link_graph_deepseek.py
@ -0,0 +1,52 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+deepseek_key = os.getenv("DEEPSEEK_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "deepseek-chat",
+        "openai_api_key": deepseek_key,
+        "openai_api_base": 'https://api.deepseek.com/v1',
+    },
+     "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "verbose": True,
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/ernie/search_graph_ernie.py
+++ b/examples/ernie/search_graph_ernie.py
@ -12,15 +12,18 @@ load_dotenv()
 # Define the configuration for the graph
 # ************************************************

-openai_key = os.getenv("OPENAI_APIKEY")
-
 graph_config = {
    "llm": {
-        "api_key": openai_key,
-        "model": "gpt-3.5-turbo",
-    },
-    "max_results": 2,
-    "verbose": True,
+            "model": "ernie-bot-turbo",
+            "ernie_client_id": "<ernie_client_id>",
+            "ernie_client_secret": "<ernie_client_secret>",
+            "temperature": 0.1
+        },
+        "embeddings": {
+            "model": "ollama/nomic-embed-text",
+            "temperature": 0,
+            "base_url": "http://localhost:11434"},
+    "library": "beautifulsoup"
 }

 # ************************************************
--- a/examples/ernie/search_link_graph_ernie.py
+++ b/examples/ernie/search_link_graph_ernie.py
@ -0,0 +1,46 @@
+"""
+Example of Search Graph
+"""
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+            "model": "ernie-bot-turbo",
+            "ernie_client_id": "<ernie_client_id>",
+            "ernie_client_secret": "<ernie_client_secret>",
+            "temperature": 0.1
+        },
+        "embeddings": {
+            "model": "ollama/nomic-embed-text",
+            "temperature": 0,
+            "base_url": "http://localhost:11434"},
+    "library": "beautifulsoup"
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/fireworks/search_link_graph_fireworks.py
+++ b/examples/fireworks/search_link_graph_fireworks.py
@ -0,0 +1,52 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": fireworks_api_key,
+        "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
+    },
+     "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "max_results": 2,
+    "verbose": True,
+    "headless": False,
+}
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/gemini/search_link_graph_gemini.py
+++ b/examples/gemini/search_link_graph_gemini.py
@ -0,0 +1,44 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "gemini-pro",
+    },
+}
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/groq/search_link_graph_groq.py
+++ b/examples/groq/search_link_graph_groq.py
@ -0,0 +1,52 @@
+"""
+Example of Search Graph
+"""
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+load_dotenv()
+
+groq_key = os.getenv("GROQ_APIKEY")
+
+graph_config = {
+    "llm": {
+        "model": "groq/gemma-7b-it",
+        "api_key": groq_key,
+        "temperature": 0
+    },
+     "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "headless": False
+}
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/groq/smart_scraper_groq.py
+++ b/examples/groq/smart_scraper_groq.py
@ -9,7 +9,6 @@ from scrapegraphai.utils import prettify_exec_info

 load_dotenv()

-
 # ************************************************
 # Define the configuration for the graph
 # ************************************************
--- a/examples/huggingfacehub/search_link_graph_huggingfacehub.py
+++ b/examples/huggingfacehub/search_link_graph_huggingfacehub.py
@ -0,0 +1,54 @@
+"""
+Example of Search Graph
+"""
+import os
+from scrapegraphai.graphs import SearchGraph
+from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+# ************************************************
+
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+
+# ************************************************
+# Create the SearchGraph instance and run it
+# ************************************************
+
+search_graph = SearchGraph(
+    prompt="List me the best escursions near Trento",
+    config=graph_config
+)
+
+result = search_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = search_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+# Save to json and csv
+convert_to_csv(result, "result")
+convert_to_json(result, "result")
--- a/examples/local_models/search_link_graph_ollama.py
+++ b/examples/local_models/search_link_graph_ollama.py
@ -0,0 +1,43 @@
+""" 
+Basic example of a scraping pipeline using SearchLinkGraph
+"""
+from scrapegraphai.graphs import SearchLinkGraph
+from scrapegraphai.utils import prettify_exec_info
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/llama3",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "base_url": "http://localhost:11434", # set ollama URL arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "verbose": True,
+    "headless": False
+}
+
+# ************************************************
+# Create the SearchLinkGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SearchLinkGraph(
+    source="https://sport.sky.it/nba?gr=www",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
--- a/examples/openai/search_link_graph_openai.py
+++ b/examples/openai/search_link_graph_openai.py
@ -0,0 +1,36 @@
+""" 
+Basic example of a scraping pipeline using SearchLinkGraph
+"""
+from scrapegraphai.graphs import SearchLinkGraph
+from scrapegraphai.utils import prettify_exec_info
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "api_key": "s",
+        "model": "gpt-3.5-turbo",
+    },
+    "verbose": True,
+    "headless": False,
+}
+
+# ************************************************
+# Create the SearchLinkGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = SearchLinkGraph(
+    source="https://sport.sky.it/nba?gr=www",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
--- a/pyproject.toml
+++ b/pyproject.toml
@ -2,7 +2,9 @@
 name = "scrapegraphai"


-version = "1.9.1"
+
+version = "1.9.0b6"
+


 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
@ -24,7 +26,7 @@ dependencies = [
    "beautifulsoup4==4.12.3",
    "pandas==2.2.2",
    "python-dotenv==1.0.1",
-    "tiktoken==0.6.0",
+    "tiktoken==0.7",
    "tqdm==4.66.4",
    "graphviz==0.20.3",
    "minify-html==0.15.0",
@ -34,7 +36,6 @@ dependencies = [
    "undetected-playwright==0.3.0",
    "semchunk==1.0.1",
    "html2text==2024.2.26",
-    "trafilatura==1.10.0",
    "langchain-fireworks==0.1.3"
 ]

--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@ -41,7 +41,6 @@ attrs==23.2.0
    # via jsonschema
    # via referencing
 babel==2.15.0
-    # via courlan
    # via sphinx
 beautifulsoup4==4.12.3
    # via furo
@ -63,11 +62,8 @@ certifi==2024.2.2
    # via httpcore
    # via httpx
    # via requests
-    # via trafilatura
 charset-normalizer==3.3.2
-    # via htmldate
    # via requests
-    # via trafilatura
 click==8.1.7
    # via burr
    # via streamlit
@ -75,15 +71,11 @@ click==8.1.7
    # via uvicorn
 contourpy==1.2.1
    # via matplotlib
-courlan==1.2.0
-    # via trafilatura
 cycler==0.12.1
    # via matplotlib
 dataclasses-json==0.6.6
    # via langchain
    # via langchain-community
-dateparser==1.2.0
-    # via htmldate
 defusedxml==0.7.1
    # via langchain-anthropic
 dill==0.3.8
@ -204,8 +196,6 @@ h11==0.14.0
    # via uvicorn
 html2text==2024.2.26
    # via scrapegraphai
-htmldate==1.8.1
-    # via trafilatura
 httpcore==1.0.5
    # via httpx
 httplib2==0.22.0
@ -259,8 +249,6 @@ jsonschema==4.22.0
    # via altair
 jsonschema-specifications==2023.12.1
    # via jsonschema
-justext==3.0.1
-    # via trafilatura
 kiwisolver==1.4.5
    # via matplotlib
 langchain==0.1.15
@ -302,12 +290,6 @@ loguru==0.7.2
    # via burr
 lxml==5.2.2
    # via free-proxy
-    # via htmldate
-    # via justext
-    # via lxml-html-clean
-    # via trafilatura
-lxml-html-clean==0.1.1
-    # via lxml
 markdown-it-py==3.0.0
    # via rich
 markupsafe==2.1.5
@ -430,9 +412,7 @@ pytest==8.0.0
 pytest-mock==3.14.0
 python-dateutil==2.9.0.post0
    # via botocore
-    # via dateparser
    # via google-cloud-bigquery
-    # via htmldate
    # via matplotlib
    # via pandas
 python-dotenv==1.0.1
@ -441,7 +421,6 @@ python-dotenv==1.0.1
 python-multipart==0.0.9
    # via fastapi
 pytz==2024.1
-    # via dateparser
    # via pandas
 pyyaml==6.0.1
    # via huggingface-hub
@ -453,7 +432,6 @@ referencing==0.35.1
    # via jsonschema
    # via jsonschema-specifications
 regex==2024.5.15
-    # via dateparser
    # via tiktoken
 requests==2.32.2
    # via burr
@ -531,11 +509,9 @@ tenacity==8.3.0
    # via langchain-community
    # via langchain-core
    # via streamlit
-tiktoken==0.6.0
+tiktoken==0.7.0
    # via langchain-openai
    # via scrapegraphai
-tld==0.13
-    # via courlan
 tokenizers==0.19.1
    # via anthropic
 toml==0.10.2
@ -555,8 +531,6 @@ tqdm==4.66.4
    # via openai
    # via scrapegraphai
    # via semchunk
-trafilatura==1.10.0
-    # via scrapegraphai
 typer==0.12.3
    # via fastapi-cli
 typing-extensions==4.12.0
@ -586,8 +560,6 @@ typing-inspect==0.9.0
    # via sf-hamilton
 tzdata==2024.1
    # via pandas
-tzlocal==5.2
-    # via dateparser
 ujson==5.10.0
    # via fastapi
 undetected-playwright==0.3.0
@ -596,10 +568,7 @@ uritemplate==4.1.1
    # via google-api-python-client
 urllib3==1.26.18
    # via botocore
-    # via courlan
-    # via htmldate
    # via requests
-    # via trafilatura
 uvicorn==0.29.0
    # via burr
    # via fastapi
--- a/requirements.lock
+++ b/requirements.lock
@ -28,8 +28,6 @@ async-timeout==4.0.3
    # via langchain
 attrs==23.2.0
    # via aiohttp
-babel==2.15.0
-    # via courlan
 beautifulsoup4==4.12.3
    # via google
    # via scrapegraphai
@ -44,18 +42,11 @@ certifi==2024.2.2
    # via httpcore
    # via httpx
    # via requests
-    # via trafilatura
 charset-normalizer==3.3.2
-    # via htmldate
    # via requests
-    # via trafilatura
-courlan==1.2.0
-    # via trafilatura
 dataclasses-json==0.6.6
    # via langchain
    # via langchain-community
-dateparser==1.2.0
-    # via htmldate
 defusedxml==0.7.1
    # via langchain-anthropic
 distro==1.9.0
@ -150,8 +141,6 @@ h11==0.14.0
    # via httpcore
 html2text==2024.2.26
    # via scrapegraphai
-htmldate==1.8.1
-    # via trafilatura
 httpcore==1.0.5
    # via httpx
 httplib2==0.22.0
@ -181,8 +170,6 @@ jsonpatch==1.33
    # via langchain-core
 jsonpointer==2.4
    # via jsonpatch
-justext==3.0.1
-    # via trafilatura
 langchain==0.1.15
    # via scrapegraphai
 langchain-anthropic==0.1.11
@ -220,12 +207,6 @@ langsmith==0.1.63
    # via langchain-core
 lxml==5.2.2
    # via free-proxy
-    # via htmldate
-    # via justext
-    # via lxml-html-clean
-    # via trafilatura
-lxml-html-clean==0.1.1
-    # via lxml
 marshmallow==3.21.2
    # via dataclasses-json
 minify-html==0.15.0
@ -298,14 +279,11 @@ pyparsing==3.1.2
    # via httplib2
 python-dateutil==2.9.0.post0
    # via botocore
-    # via dateparser
    # via google-cloud-bigquery
-    # via htmldate
    # via pandas
 python-dotenv==1.0.1
    # via scrapegraphai
 pytz==2024.1
-    # via dateparser
    # via pandas
 pyyaml==6.0.1
    # via huggingface-hub
@ -313,7 +291,6 @@ pyyaml==6.0.1
    # via langchain-community
    # via langchain-core
 regex==2024.5.15
-    # via dateparser
    # via tiktoken
 requests==2.32.2
    # via free-proxy
@ -351,11 +328,9 @@ tenacity==8.3.0
    # via langchain
    # via langchain-community
    # via langchain-core
-tiktoken==0.6.0
+tiktoken==0.7.0
    # via langchain-openai
    # via scrapegraphai
-tld==0.13
-    # via courlan
 tokenizers==0.19.1
    # via anthropic
 tqdm==4.66.4
@ -364,8 +339,6 @@ tqdm==4.66.4
    # via openai
    # via scrapegraphai
    # via semchunk
-trafilatura==1.10.0
-    # via scrapegraphai
 typing-extensions==4.12.0
    # via anthropic
    # via anyio
@ -382,17 +355,12 @@ typing-inspect==0.9.0
    # via dataclasses-json
 tzdata==2024.1
    # via pandas
-tzlocal==5.2
-    # via dateparser
 undetected-playwright==0.3.0
    # via scrapegraphai
 uritemplate==4.1.1
    # via google-api-python-client
 urllib3==1.26.18
    # via botocore
-    # via courlan
-    # via htmldate
    # via requests
-    # via trafilatura
 yarl==1.9.4
    # via aiohttp
--- a/scrapegraphai/graphs/init.py
+++ b/scrapegraphai/graphs/init.py
@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph
 from .script_creator_multi_graph import ScriptCreatorMultiGraph
 from .markdown_scraper_graph import MDScraperGraph
 from .markdown_scraper_multi_graph import MDScraperMultiGraph
+from .search_link_graph import SearchLinkGraph
--- a/scrapegraphai/graphs/csv_scraper_graph.py
+++ b/scrapegraphai/graphs/csv_scraper_graph.py
@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph

 from ..nodes import (
    FetchNode,
-    RAGNode,
    GenerateAnswerCSVNode
 )

@ -37,14 +36,7 @@ class CSVScraperGraph(AbstractGraph):
            input="csv | csv_dir",
            output=["doc"],
        )
-        rag_node = RAGNode(
-            input="user_prompt & doc",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model,
-            }
-        )
+     
        generate_answer_node = GenerateAnswerCSVNode(
            input="user_prompt & (relevant_chunks | doc)",
            output=["answer"],
@ -58,12 +50,10 @@ class CSVScraperGraph(AbstractGraph):
        return BaseGraph(
            nodes=[
                fetch_node,
-                rag_node,
                generate_answer_node,
            ],
            edges=[
-                (fetch_node, rag_node),
-                (rag_node, generate_answer_node)
+                (fetch_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
--- a/scrapegraphai/graphs/json_scraper_graph.py
+++ b/scrapegraphai/graphs/json_scraper_graph.py
@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph

 from ..nodes import (
    FetchNode,
-    RAGNode,
    GenerateAnswerNode
 )

@ -62,14 +61,7 @@ class JSONScraperGraph(AbstractGraph):
            input="json | json_dir",
            output=["doc", "link_urls", "img_urls"],
        )
-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
-            }
-        )
+     
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
@ -83,12 +75,10 @@ class JSONScraperGraph(AbstractGraph):
        return BaseGraph(
            nodes=[
                fetch_node,
-                rag_node,
                generate_answer_node,
            ],
            edges=[
-                (fetch_node, rag_node),
-                (rag_node, generate_answer_node)
+                (fetch_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
--- a/scrapegraphai/graphs/markdown_scraper_graph.py
+++ b/scrapegraphai/graphs/markdown_scraper_graph.py
@ -3,7 +3,7 @@ import logging
 from pydantic import BaseModel
 from .base_graph import BaseGraph
 from .abstract_graph import AbstractGraph
-from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
+from ..nodes import FetchNode, ParseNode, GenerateAnswerNode

 class MDScraperGraph(AbstractGraph):
    """
@ -63,14 +63,6 @@ class MDScraperGraph(AbstractGraph):
                "chunk_size": self.model_token
            }
        )
-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
-            }
-        )
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
@ -86,13 +78,11 @@ class MDScraperGraph(AbstractGraph):
            nodes=[
                fetch_node,
                parse_node,
-                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
-                (parse_node, rag_node),
-                (rag_node, generate_answer_node)
+                (parse_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
--- a/scrapegraphai/graphs/omni_scraper_graph.py
+++ b/scrapegraphai/graphs/omni_scraper_graph.py
@ -12,7 +12,6 @@ from ..nodes import (
    FetchNode,
    ParseNode,
    ImageToTextNode,
-    RAGNode,
    GenerateAnswerOmniNode
 )

@ -89,14 +88,7 @@ class OmniScraperGraph(AbstractGraph):
                "max_images": self.max_images
            }
        )
-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
-            }
-        )
+      
        generate_answer_omni_node = GenerateAnswerOmniNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
            output=["answer"],
@ -112,14 +104,12 @@ class OmniScraperGraph(AbstractGraph):
                fetch_node,
                parse_node,
                image_to_text_node,
-                rag_node,
                generate_answer_omni_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, image_to_text_node),
-                (image_to_text_node, rag_node),
-                (rag_node, generate_answer_omni_node)
+                (image_to_text_node, generate_answer_omni_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
@ -136,4 +126,4 @@ class OmniScraperGraph(AbstractGraph):
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

-        return self.final_state.get("answer", "No answer found.")
+        return self.final_state.get("answer", "No answer found.")
--- a/scrapegraphai/graphs/pdf_scraper_graph.py
+++ b/scrapegraphai/graphs/pdf_scraper_graph.py
@ -12,7 +12,6 @@ from .abstract_graph import AbstractGraph
 from ..nodes import (
    FetchNode,
    ParseNode,
-    RAGNode,
    GenerateAnswerPDFNode
 )

@ -76,14 +75,6 @@ class PDFScraperGraph(AbstractGraph):
            }
        )

-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
-            }
-        )
        generate_answer_node_pdf = GenerateAnswerPDFNode(
            input="user_prompt & (relevant_chunks | doc)",
            output=["answer"],
@ -98,13 +89,11 @@ class PDFScraperGraph(AbstractGraph):
            nodes=[
                fetch_node,
                parse_node,
-                rag_node,
                generate_answer_node_pdf,
            ],
            edges=[
                (fetch_node, parse_node),
-                (parse_node, rag_node),
-                (rag_node, generate_answer_node_pdf)
+                (parse_node, generate_answer_node_pdf)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
--- a/scrapegraphai/graphs/search_link_graph.py
+++ b/scrapegraphai/graphs/search_link_graph.py
@ -0,0 +1,104 @@
+""" SearchLinkGraph Module """
+from typing import Optional
+import logging
+from pydantic import BaseModel
+from .base_graph import BaseGraph
+from .abstract_graph import AbstractGraph
+
+
+from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
+
+class SearchLinkGraph(AbstractGraph): 
+    """ 
+    SearchLinkGraph is a scraping pipeline that fetches a web page (or a local source) and extracts the links found in its content.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel): The schema for the graph output.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client, 
+        configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+
+    Args:
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        schema (BaseModel, optional): The schema for the graph output. Defaults to None.
+
+    Example:
+        >>> search_link_graph = SearchLinkGraph(
+        ...     source="https://en.wikipedia.org/wiki/Chioggia",
+        ...     config={"llm": {"model": "gpt-3.5-turbo"}},
+        ...     schema=None,
+        ... )
+        >>> result = search_link_graph.run()
+    """
+
+    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
+        super().__init__("", config, source, schema)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the web scraping workflow.
+        """
+
+        fetch_node = FetchNode(
+            input="url| local_dir",
+            output=["doc", "link_urls", "img_urls"],
+            node_config={
+                "llm_model": self.llm_model,
+                "force": self.config.get("force", False),
+                "cut": self.config.get("cut", True),
+                "loader_kwargs": self.config.get("loader_kwargs", {}),
+            }
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "chunk_size": self.model_token
+            }
+        )
+        search_link_node = SearchLinkNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "llm_model": self.llm_model,
+                "chunk_size": self.model_token
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                search_link_node
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, search_link_node)
+            ],
+            entry_point=fetch_node,
+            graph_name=self.__class__.__name__
+        )
+
+    def run(self) -> str:
+        """
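+        Executes the link-search process and returns the links that were found.
+
+        Returns:
+            The value stored under "parsed_doc" in the final state (the list of extracted links), or the string "No answer found." if that key is missing.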
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("parsed_doc", "No answer found.")
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@ -11,7 +11,6 @@ from .abstract_graph import AbstractGraph
 from ..nodes import (
    FetchNode,
    ParseNode,
-    RAGNode,
    GenerateAnswerNode
 )

@ -78,14 +77,7 @@ class SmartScraperGraph(AbstractGraph):
                "chunk_size": self.model_token
            }
        )
-        rag_node = RAGNode(
-            input="user_prompt & (parsed_doc | doc)",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
-            }
-        )
+
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
@ -100,13 +92,11 @@ class SmartScraperGraph(AbstractGraph):
            nodes=[
                fetch_node,
                parse_node,
-                rag_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_node, parse_node),
-                (parse_node, rag_node),
-                (rag_node, generate_answer_node)
+                (parse_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
--- a/scrapegraphai/graphs/xml_scraper_graph.py
+++ b/scrapegraphai/graphs/xml_scraper_graph.py
@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph

 from ..nodes import (
    FetchNode,
-    RAGNode,
    GenerateAnswerNode
 )

@ -64,14 +63,7 @@ class XMLScraperGraph(AbstractGraph):
            input="xml | xml_dir",
            output=["doc", "link_urls", "img_urls"]
        )
-        rag_node = RAGNode(
-            input="user_prompt & doc",
-            output=["relevant_chunks"],
-            node_config={
-                "llm_model": self.llm_model,
-                "embedder_model": self.embedder_model
-            }
-        )
+     
        generate_answer_node = GenerateAnswerNode(
            input="user_prompt & (relevant_chunks | doc)",
            output=["answer"],
@ -85,12 +77,10 @@ class XMLScraperGraph(AbstractGraph):
        return BaseGraph(
            nodes=[
                fetch_node,
-                rag_node,
                generate_answer_node,
            ],
            edges=[
-                (fetch_node, rag_node),
-                (rag_node, generate_answer_node)
+                (fetch_node, generate_answer_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
--- a/scrapegraphai/models/bedrock.py
+++ b/scrapegraphai/models/bedrock.py
@ -1,5 +1,5 @@
 """ 
-bedrock configuration wrapper
+Bedrock Module
 """
 from langchain_aws import ChatBedrock

--- a/scrapegraphai/models/ernie.py
+++ b/scrapegraphai/models/ernie.py
@ -1,5 +1,5 @@
 """ 
-Ollama Module
+Ernie Module
 """
 from langchain_community.chat_models import ErnieBotChat

--- a/scrapegraphai/models/oneapi.py
+++ b/scrapegraphai/models/oneapi.py
@ -1,5 +1,5 @@
 """ 
-OpenAI Module
+OneAPI Module
 """
 from langchain_openai import ChatOpenAI

--- a/scrapegraphai/nodes/generate_answer_csv_node.py
+++ b/scrapegraphai/nodes/generate_answer_csv_node.py
@ -125,7 +125,7 @@ class GenerateAnswerCSVNode(BaseNode):
                    template=template_no_chunks_csv_prompt,
                    input_variables=["question"],
                    partial_variables={
-                        "context": chunk.page_content,
+                        "context": chunk,
                        "format_instructions": format_instructions,
                    },
                )
@ -137,7 +137,7 @@ class GenerateAnswerCSVNode(BaseNode):
                    template=template_chunks_csv_prompt,
                    input_variables=["question"],
                    partial_variables={
-                        "context": chunk.page_content,
+                        "context": chunk,
                        "chunk_id": i + 1,
                        "format_instructions": format_instructions,
                    },
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@ -115,7 +115,7 @@ class GenerateAnswerNode(BaseNode):
                prompt = PromptTemplate(
                    template=template_no_chunks_prompt,
                    input_variables=["question"],
-                    partial_variables={"context": chunk.page_content,
+                    partial_variables={"context": chunk,
                                       "format_instructions": format_instructions})
                chain =  prompt | self.llm_model | output_parser
                answer = chain.invoke({"question": user_prompt})
@ -124,7 +124,7 @@ class GenerateAnswerNode(BaseNode):
                prompt = PromptTemplate(
                    template=template_chunks_prompt,
                    input_variables=["question"],
-                    partial_variables={"context": chunk.page_content,
+                    partial_variables={"context": chunk,
                                        "chunk_id": i + 1,
                                        "format_instructions": format_instructions})
            # Dynamically name the chains based on their index
--- a/scrapegraphai/nodes/generate_answer_omni_node.py
+++ b/scrapegraphai/nodes/generate_answer_omni_node.py
@ -110,7 +110,7 @@ class GenerateAnswerOmniNode(BaseNode):
                    template=template_no_chunk_omni_prompt,
                    input_variables=["question"],
                    partial_variables={
-                        "context": chunk.page_content,
+                        "context": chunk,
                        "format_instructions": format_instructions,
                        "img_desc": imag_desc,
                    },
@ -123,7 +123,7 @@ class GenerateAnswerOmniNode(BaseNode):
                    template=template_chunks_omni_prompt,
                    input_variables=["question"],
                    partial_variables={
-                        "context": chunk.page_content,
+                        "context": chunk,
                        "chunk_id": i + 1,
                        "format_instructions": format_instructions,
                    },
--- a/scrapegraphai/nodes/generate_answer_pdf_node.py
+++ b/scrapegraphai/nodes/generate_answer_pdf_node.py
@ -124,7 +124,7 @@ class GenerateAnswerPDFNode(BaseNode):
                    template=template_no_chunks_pdf_prompt,
                    input_variables=["question"],
                    partial_variables={
-                        "context":chunk.page_content,
+                        "context":chunk,
                        "format_instructions": format_instructions,
                    },
                )
--- a/scrapegraphai/nodes/search_link_node.py
+++ b/scrapegraphai/nodes/search_link_node.py
@ -4,6 +4,7 @@ SearchLinkNode Module

 # Imports from standard library
 from typing import List, Optional
+import re
 from tqdm import tqdm

 # Imports from Langchain
@ -20,7 +21,7 @@ from .base_node import BaseNode
 class SearchLinkNode(BaseNode):
    """
    A node that can filter out the relevant links in the webpage content for the user prompt.
-    Node expects the aleready scrapped links on the webpage and hence it is expected
+    Node expects the already scraped links on the webpage and hence it is expected
    that this node be used after the FetchNode.

    Attributes:
@ -67,39 +68,10 @@ class SearchLinkNode(BaseNode):

        self.logger.info(f"--- Executing {self.node_name} Node ---")

-        # Interpret input keys based on the provided input expression
-        input_keys = self.get_input_keys(state)

-        user_prompt = state[input_keys[0]]
-        parsed_content_chunks = state[input_keys[1]]
+        parsed_content_chunks = state.get("doc")
        output_parser = JsonOutputParser()

-        prompt_relevant_links = """
-            You are a website scraper and you have just scraped the following content from a website.
-            Content: {content}
-            
-            You are now tasked with identifying all hyper links within the content that are potentially
-            relevant to the user task: {user_prompt}
-            
-            Assume relevance broadly, including any links that might be related or potentially useful 
-            in relation to the task.
-
-            Sort it in order of importance, the first one should be the most important one, the last one
-            the least important
-            
-            Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
-            whether the content at the link is directly relevant.
-
-            Output only a list of relevant links in the format:
-            [
-                "link1",
-                "link2",
-                "link3",
-                .
-                .
-                .
-            ]
-            """
        relevant_links = []

        for i, chunk in enumerate(
@ -109,15 +81,47 @@ class SearchLinkNode(BaseNode):
                disable=not self.verbose,
            )
        ):
-            merge_prompt = PromptTemplate(
-                template=prompt_relevant_links,
-                input_variables=["content", "user_prompt"],
-            )
-            merge_chain = merge_prompt | self.llm_model | output_parser
-            # merge_chain = merge_prompt | self.llm_model
-            answer = merge_chain.invoke(
-                {"content": chunk.page_content, "user_prompt": user_prompt}
-            )
-            relevant_links += answer
+            try:
+                # Primary approach: Regular expression to extract links
+                links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
+
+                relevant_links += links
+            except Exception as e:
+                # Fallback approach: Using the LLM to extract links
+                self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
+                prompt_relevant_links = """
+                    You are a website scraper and you have just scraped the following content from a website.
+                    Content: {content}
+                    
+                    Assume relevance broadly, including any links that might be related or potentially useful 
+                    in relation to the task.
+
+                    Sort it in order of importance, the first one should be the most important one, the last one
+                    the least important
+                    
+                    Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain 
+                    whether the content at the link is directly relevant.
+
+                    Output only a list of relevant links in the format:
+                    [
+                        "link1",
+                        "link2",
+                        "link3",
+                        .
+                        .
+                        .
+                    ]
+                    """
+                
+                merge_prompt = PromptTemplate(
+                    template=prompt_relevant_links,
+                    input_variables=["content", "user_prompt"],
+                )
+                merge_chain = merge_prompt | self.llm_model | output_parser
+                answer = merge_chain.invoke(
+                    {"content": chunk.page_content}
+                )
+                relevant_links += answer
+
        state.update({self.output[0]: relevant_links})
        return state
--- a/scrapegraphai/utils/convert_to_md.py
+++ b/scrapegraphai/utils/convert_to_md.py
@ -2,8 +2,6 @@
 convert_to_md modul
 """
 import html2text
-from trafilatura import extract
-

 def convert_to_md(html):
    """ Convert HTML to Markdown.
@ -20,6 +18,6 @@ def convert_to_md(html):
    'This is a paragraph.\n\n# This is a heading.'

    Note: All the styles and links are ignored during the conversion. """
-
-    return extract(filecontent=html,include_images=True,
-                       include_links=True, include_tables=True, output_format="markdown")
+    h = html2text.HTML2Text()
+    h.ignore_links = False
+    return h.handle(html)
--- a/scrapegraphai/utils/research_web.py
+++ b/scrapegraphai/utils/research_web.py
@ -1,6 +1,3 @@
-"""
-research web module
-"""
 import re
 from typing import List
 from langchain_community.tools import DuckDuckGoSearchResults
@ -8,41 +5,39 @@ from googlesearch import search as google_search
 import requests
 from bs4 import BeautifulSoup

-def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
+def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]:
    """
    Searches the web for a given query using specified search engine options.

    Args:
        query (str): The search query to find on the internet.
-        search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'.
+        search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
+        port (int, optional): The port of the SearXNG instance, which is queried at http://localhost:<port>; only used when search_engine is 'SearXNG'. Default is 8080.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
-        ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
+        ValueError: If the search engine specified is not supported.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
-
-    This function allows switching between Google, DuckDuckGo, and Bing to perform 
-    internet searches, returning a list of result URLs.
    """
-
+    
    if search_engine.lower() == "google":
        res = []
        for url in google_search(query, stop=max_results):
            res.append(url)
        return res
-
+    
    elif search_engine.lower() == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        links = re.findall(r'https?://[^\s,\]]+', res)
        return links
-
+    
    elif search_engine.lower() == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
-
+    
        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return search_results
-
-    raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
+    
+    elif search_engine.lower() == "searxng":
+        url = f"http://localhost:{port}"
+        params = {"q": query, "format": "json"}
+    
+        # Send the GET request to the server
+        response = requests.get(url, params=params)
+    
+        # Parse the response and limit to the specified max_results
+        data = response.json()
+        limited_results = data["results"][:max_results]
+        return limited_results
+    
+    else:
+        raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
--- a/tests/graphs/search_link_ollama.py
+++ b/tests/graphs/search_link_ollama.py
@ -0,0 +1,26 @@
+from scrapegraphai.graphs import SearchLinkGraph
+from scrapegraphai.utils import prettify_exec_info
+
+def test_smart_scraper_pipeline():
+    graph_config = {
+        "llm": {
+            "model": "ollama/llama3",
+            "temperature": 0,
+            "format": "json",
+        },
+        "embeddings": {
+            "model": "ollama/nomic-embed-text",
+            "temperature": 0,
+        },
+        "verbose": True,
+        "headless": False
+    }
+
+    smart_scraper_graph = SearchLinkGraph(
+        source="https://sport.sky.it/nba?gr=www",
+        config=graph_config
+    )
+
+    result = smart_scraper_graph.run()
+
+    assert result is not None
--- a/tests/utils/convert_to_md_test.py
+++ b/tests/utils/convert_to_md_test.py
@ -7,7 +7,7 @@ def test_basic_html_to_md():

 def test_html_with_links_and_images():
    html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
-    assert convert_to_md(html) is  None
+    assert convert_to_md(html) is not None

 def test_html_with_tables():
    html = '''
@ -17,11 +17,11 @@ def test_html_with_tables():
        <tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
    </table>
    '''
-    assert convert_to_md(html) is  None
+    assert convert_to_md(html) is not None

 def test_empty_html():
    html = ""
-    assert convert_to_md(html) is None
+    assert convert_to_md(html) is not None

 def test_complex_html_structure():
    html = '''