mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-04 21:00:36 +08:00
Merge branch 'pre/beta' into add-openai-supported-model-gpt-4o-mini
This commit is contained in:
commit
3a9d556e1b
42
CHANGELOG.md
42
CHANGELOG.md
@ -1,47 +1,5 @@
|
||||
## [1.9.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0...v1.9.1) (2024-07-12)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* solve a burr integration ([881290b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/881290b5066b39c505532656671fbf65f8fc312c))
|
||||
|
||||
## [1.9.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.8.0...v1.9.0) (2024-07-09)
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* add fireworks integration ([df0e310](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df0e3108299071b849d7e055bd11d72764d24f08))
|
||||
* add integration for infos ([3bf5f57](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3bf5f570a8f8e1b037a7ad3c9f583261a1536421))
|
||||
* add integrations for markdown files ([2804434](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2804434a9ee12c52ae8956a88b1778a4dd3ec32f))
|
||||
* add vertexai integration ([119514b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/119514bdfc2a16dfb8918b0c34ae7cc43a01384c))
|
||||
* improve md prompt recognition ([5fe694b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe694b6b4545a5091d16110318b992acfca4f58))
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add test ([3a537ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3a537eec6fef1743924a9aa5cef0ba2f8d44bf11))
|
||||
* fix pyproject.toml ([7570bf8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7570bf8294e49bc54ec9e296aaadb763873390ca))
|
||||
|
||||
|
||||
### chore
|
||||
|
||||
* **Docker:** fix port number ([afeb81f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/afeb81f77a884799192d79dcac85666190fb1c9d))
|
||||
* **CI:** fix pylint workflow ([583c321](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/583c32106e827f50235d8fc69511652fd4b07a35))
|
||||
* **rye:** rebuild lockfiles ([27c2dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/27c2dd23517a7e4b14fafd00320a8b81f73145dc))
|
||||
|
||||
|
||||
### Docs
|
||||
|
||||
* **roadmap:** fix urls ([14faba4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/14faba4f00dd9f947f8dc5e0b51be49ea684179f))
|
||||
* **roadmap:** next steps ([3e644f4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e644f498f05eb505fbd4e94b144c81567569aaa))
|
||||
|
||||
|
||||
### CI
|
||||
|
||||
* **release:** 1.8.1-beta.1 [skip ci] ([8f9f96f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f9f96f7e7ff41d2fff5bbbf18bf4fc85d4f98b3))
|
||||
* **release:** 1.9.0-beta.1 [skip ci] ([146432d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/146432d476f775510441b062935adc47190141e2))
|
||||
* **release:** 1.9.0-beta.2 [skip ci] ([5cb5fbf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5cb5fbf5503eec9b34a6691eb993716cc9a821d6))
|
||||
|
||||
## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-07-05)
|
||||
|
||||
|
||||
|
||||
57
examples/anthropic/search_link_graph_haiku.py
Normal file
57
examples/anthropic/search_link_graph_haiku.py
Normal file
@ -0,0 +1,57 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
from langchain_openai import AzureChatOpenAI
|
||||
from langchain_openai import AzureOpenAIEmbeddings
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
load_dotenv()
|
||||
|
||||
llm_model_instance = AzureChatOpenAI(
|
||||
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
|
||||
azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"]
|
||||
)
|
||||
|
||||
embedder_model_instance = AzureOpenAIEmbeddings(
|
||||
azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
|
||||
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
|
||||
)
|
||||
|
||||
# ************************************************
|
||||
# Build the graph configuration from the model instances
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {"model_instance": llm_model_instance},
|
||||
"embeddings": {"model_instance": embedder_model_instance}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
52
examples/azure/search_link_graph_azure.py
Normal file
52
examples/azure/search_link_graph_azure.py
Normal file
@ -0,0 +1,52 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
load_dotenv()
|
||||
|
||||
groq_key = os.getenv("GROQ_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "groq/gemma-7b-it",
|
||||
"api_key": groq_key,
|
||||
"temperature": 0
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"headless": False
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
45
examples/bedrock/search_link_graph_bedrock.py
Normal file
45
examples/bedrock/search_link_graph_bedrock.py
Normal file
@ -0,0 +1,45 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"client": "client_name",
|
||||
"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
|
||||
"temperature": 0.0
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "bedrock/cohere.embed-multilingual-v3"
|
||||
}
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
52
examples/deepseek/search_link_graph_deepseek.py
Normal file
52
examples/deepseek/search_link_graph_deepseek.py
Normal file
@ -0,0 +1,52 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
load_dotenv()
|
||||
|
||||
deepseek_key = os.getenv("DEEPSEEK_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "deepseek-chat",
|
||||
"openai_api_key": deepseek_key,
|
||||
"openai_api_base": 'https://api.deepseek.com/v1',
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"verbose": True,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
@ -12,15 +12,18 @@ load_dotenv()
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
openai_key = os.getenv("OPENAI_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": openai_key,
|
||||
"model": "gpt-3.5-turbo",
|
||||
},
|
||||
"max_results": 2,
|
||||
"verbose": True,
|
||||
"model": "ernie-bot-turbo",
|
||||
"ernie_client_id": "<ernie_client_id>",
|
||||
"ernie_client_secret": "<ernie_client_secret>",
|
||||
"temperature": 0.1
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434"},
|
||||
"library": "beautifulsoup"
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
|
||||
46
examples/ernie/search_link_graph_ernie.py
Normal file
46
examples/ernie/search_link_graph_ernie.py
Normal file
@ -0,0 +1,46 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ernie-bot-turbo",
|
||||
"ernie_client_id": "<ernie_client_id>",
|
||||
"ernie_client_secret": "<ernie_client_secret>",
|
||||
"temperature": 0.1
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
"base_url": "http://localhost:11434"},
|
||||
"library": "beautifulsoup"
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
52
examples/fireworks/search_link_graph_fireworks.py
Normal file
52
examples/fireworks/search_link_graph_fireworks.py
Normal file
@ -0,0 +1,52 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
load_dotenv()
|
||||
|
||||
fireworks_api_key = os.getenv("FIREWORKS_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": fireworks_api_key,
|
||||
"model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct"
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"max_results": 2,
|
||||
"verbose": True,
|
||||
"headless": False,
|
||||
}
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
44
examples/gemini/search_link_graph_gemini.py
Normal file
44
examples/gemini/search_link_graph_gemini.py
Normal file
@ -0,0 +1,44 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
load_dotenv()
|
||||
|
||||
gemini_key = os.getenv("GOOGLE_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": gemini_key,
|
||||
"model": "gemini-pro",
|
||||
},
|
||||
}
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
52
examples/groq/search_link_graph_groq.py
Normal file
52
examples/groq/search_link_graph_groq.py
Normal file
@ -0,0 +1,52 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
load_dotenv()
|
||||
|
||||
groq_key = os.getenv("GROQ_APIKEY")
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "groq/gemma-7b-it",
|
||||
"api_key": groq_key,
|
||||
"temperature": 0
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"headless": False
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
@ -9,7 +9,6 @@ from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
54
examples/huggingfacehub/search_link_graph_huggingfacehub.py
Normal file
54
examples/huggingfacehub/search_link_graph_huggingfacehub.py
Normal file
@ -0,0 +1,54 @@
|
||||
"""
|
||||
Example of Search Graph
|
||||
"""
|
||||
import os
|
||||
from scrapegraphai.graphs import SearchGraph
|
||||
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
|
||||
from langchain_community.llms import HuggingFaceEndpoint
|
||||
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
|
||||
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
# ************************************************
|
||||
|
||||
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
|
||||
|
||||
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
|
||||
|
||||
llm_model_instance = HuggingFaceEndpoint(
|
||||
repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
|
||||
)
|
||||
|
||||
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
|
||||
api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
|
||||
)
|
||||
|
||||
graph_config = {
|
||||
"llm": {"model_instance": llm_model_instance},
|
||||
"embeddings": {"model_instance": embedder_model_instance}
|
||||
}
|
||||
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
search_graph = SearchGraph(
|
||||
prompt="List me the best escursions near Trento",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = search_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = search_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
|
||||
# Save to json and csv
|
||||
convert_to_csv(result, "result")
|
||||
convert_to_json(result, "result")
|
||||
43
examples/local_models/search_link_graph_ollama.py
Normal file
43
examples/local_models/search_link_graph_ollama.py
Normal file
@ -0,0 +1,43 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SearchLinkGraph
|
||||
"""
|
||||
from scrapegraphai.graphs import SearchLinkGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"model": "ollama/llama3",
|
||||
"temperature": 0,
|
||||
"format": "json", # Ollama needs the format to be specified explicitly
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"embeddings": {
|
||||
"model": "ollama/nomic-embed-text",
|
||||
"temperature": 0,
|
||||
# "base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||
},
|
||||
"verbose": True,
|
||||
"headless": False
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchLinkGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SearchLinkGraph(
|
||||
source="https://sport.sky.it/nba?gr=www",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
36
examples/openai/search_link_graph_openai.py
Normal file
36
examples/openai/search_link_graph_openai.py
Normal file
@ -0,0 +1,36 @@
|
||||
"""
|
||||
Basic example of scraping pipeline using SearchLinkGraph
|
||||
"""
|
||||
from scrapegraphai.graphs import SearchLinkGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
# ************************************************
|
||||
# Define the configuration for the graph
|
||||
# ************************************************
|
||||
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": "s",
|
||||
"model": "gpt-3.5-turbo",
|
||||
},
|
||||
"verbose": True,
|
||||
"headless": False,
|
||||
}
|
||||
|
||||
# ************************************************
|
||||
# Create the SearchLinkGraph instance and run it
|
||||
# ************************************************
|
||||
|
||||
smart_scraper_graph = SearchLinkGraph(
|
||||
source="https://sport.sky.it/nba?gr=www",
|
||||
config=graph_config
|
||||
)
|
||||
|
||||
result = smart_scraper_graph.run()
|
||||
print(result)
|
||||
|
||||
# ************************************************
|
||||
# Get graph execution info
|
||||
# ************************************************
|
||||
|
||||
graph_exec_info = smart_scraper_graph.get_execution_info()
|
||||
print(prettify_exec_info(graph_exec_info))
|
||||
@ -2,7 +2,9 @@
|
||||
name = "scrapegraphai"
|
||||
|
||||
|
||||
version = "1.9.1"
|
||||
|
||||
version = "1.9.0b6"
|
||||
|
||||
|
||||
|
||||
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
|
||||
@ -24,7 +26,7 @@ dependencies = [
|
||||
"beautifulsoup4==4.12.3",
|
||||
"pandas==2.2.2",
|
||||
"python-dotenv==1.0.1",
|
||||
"tiktoken==0.6.0",
|
||||
"tiktoken==0.7",
|
||||
"tqdm==4.66.4",
|
||||
"graphviz==0.20.3",
|
||||
"minify-html==0.15.0",
|
||||
@ -34,7 +36,6 @@ dependencies = [
|
||||
"undetected-playwright==0.3.0",
|
||||
"semchunk==1.0.1",
|
||||
"html2text==2024.2.26",
|
||||
"trafilatura==1.10.0",
|
||||
"langchain-fireworks==0.1.3"
|
||||
]
|
||||
|
||||
|
||||
@ -41,7 +41,6 @@ attrs==23.2.0
|
||||
# via jsonschema
|
||||
# via referencing
|
||||
babel==2.15.0
|
||||
# via courlan
|
||||
# via sphinx
|
||||
beautifulsoup4==4.12.3
|
||||
# via furo
|
||||
@ -63,11 +62,8 @@ certifi==2024.2.2
|
||||
# via httpcore
|
||||
# via httpx
|
||||
# via requests
|
||||
# via trafilatura
|
||||
charset-normalizer==3.3.2
|
||||
# via htmldate
|
||||
# via requests
|
||||
# via trafilatura
|
||||
click==8.1.7
|
||||
# via burr
|
||||
# via streamlit
|
||||
@ -75,15 +71,11 @@ click==8.1.7
|
||||
# via uvicorn
|
||||
contourpy==1.2.1
|
||||
# via matplotlib
|
||||
courlan==1.2.0
|
||||
# via trafilatura
|
||||
cycler==0.12.1
|
||||
# via matplotlib
|
||||
dataclasses-json==0.6.6
|
||||
# via langchain
|
||||
# via langchain-community
|
||||
dateparser==1.2.0
|
||||
# via htmldate
|
||||
defusedxml==0.7.1
|
||||
# via langchain-anthropic
|
||||
dill==0.3.8
|
||||
@ -204,8 +196,6 @@ h11==0.14.0
|
||||
# via uvicorn
|
||||
html2text==2024.2.26
|
||||
# via scrapegraphai
|
||||
htmldate==1.8.1
|
||||
# via trafilatura
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httplib2==0.22.0
|
||||
@ -259,8 +249,6 @@ jsonschema==4.22.0
|
||||
# via altair
|
||||
jsonschema-specifications==2023.12.1
|
||||
# via jsonschema
|
||||
justext==3.0.1
|
||||
# via trafilatura
|
||||
kiwisolver==1.4.5
|
||||
# via matplotlib
|
||||
langchain==0.1.15
|
||||
@ -302,12 +290,6 @@ loguru==0.7.2
|
||||
# via burr
|
||||
lxml==5.2.2
|
||||
# via free-proxy
|
||||
# via htmldate
|
||||
# via justext
|
||||
# via lxml-html-clean
|
||||
# via trafilatura
|
||||
lxml-html-clean==0.1.1
|
||||
# via lxml
|
||||
markdown-it-py==3.0.0
|
||||
# via rich
|
||||
markupsafe==2.1.5
|
||||
@ -430,9 +412,7 @@ pytest==8.0.0
|
||||
pytest-mock==3.14.0
|
||||
python-dateutil==2.9.0.post0
|
||||
# via botocore
|
||||
# via dateparser
|
||||
# via google-cloud-bigquery
|
||||
# via htmldate
|
||||
# via matplotlib
|
||||
# via pandas
|
||||
python-dotenv==1.0.1
|
||||
@ -441,7 +421,6 @@ python-dotenv==1.0.1
|
||||
python-multipart==0.0.9
|
||||
# via fastapi
|
||||
pytz==2024.1
|
||||
# via dateparser
|
||||
# via pandas
|
||||
pyyaml==6.0.1
|
||||
# via huggingface-hub
|
||||
@ -453,7 +432,6 @@ referencing==0.35.1
|
||||
# via jsonschema
|
||||
# via jsonschema-specifications
|
||||
regex==2024.5.15
|
||||
# via dateparser
|
||||
# via tiktoken
|
||||
requests==2.32.2
|
||||
# via burr
|
||||
@ -531,11 +509,9 @@ tenacity==8.3.0
|
||||
# via langchain-community
|
||||
# via langchain-core
|
||||
# via streamlit
|
||||
tiktoken==0.6.0
|
||||
tiktoken==0.7.0
|
||||
# via langchain-openai
|
||||
# via scrapegraphai
|
||||
tld==0.13
|
||||
# via courlan
|
||||
tokenizers==0.19.1
|
||||
# via anthropic
|
||||
toml==0.10.2
|
||||
@ -555,8 +531,6 @@ tqdm==4.66.4
|
||||
# via openai
|
||||
# via scrapegraphai
|
||||
# via semchunk
|
||||
trafilatura==1.10.0
|
||||
# via scrapegraphai
|
||||
typer==0.12.3
|
||||
# via fastapi-cli
|
||||
typing-extensions==4.12.0
|
||||
@ -586,8 +560,6 @@ typing-inspect==0.9.0
|
||||
# via sf-hamilton
|
||||
tzdata==2024.1
|
||||
# via pandas
|
||||
tzlocal==5.2
|
||||
# via dateparser
|
||||
ujson==5.10.0
|
||||
# via fastapi
|
||||
undetected-playwright==0.3.0
|
||||
@ -596,10 +568,7 @@ uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==1.26.18
|
||||
# via botocore
|
||||
# via courlan
|
||||
# via htmldate
|
||||
# via requests
|
||||
# via trafilatura
|
||||
uvicorn==0.29.0
|
||||
# via burr
|
||||
# via fastapi
|
||||
|
||||
@ -28,8 +28,6 @@ async-timeout==4.0.3
|
||||
# via langchain
|
||||
attrs==23.2.0
|
||||
# via aiohttp
|
||||
babel==2.15.0
|
||||
# via courlan
|
||||
beautifulsoup4==4.12.3
|
||||
# via google
|
||||
# via scrapegraphai
|
||||
@ -44,18 +42,11 @@ certifi==2024.2.2
|
||||
# via httpcore
|
||||
# via httpx
|
||||
# via requests
|
||||
# via trafilatura
|
||||
charset-normalizer==3.3.2
|
||||
# via htmldate
|
||||
# via requests
|
||||
# via trafilatura
|
||||
courlan==1.2.0
|
||||
# via trafilatura
|
||||
dataclasses-json==0.6.6
|
||||
# via langchain
|
||||
# via langchain-community
|
||||
dateparser==1.2.0
|
||||
# via htmldate
|
||||
defusedxml==0.7.1
|
||||
# via langchain-anthropic
|
||||
distro==1.9.0
|
||||
@ -150,8 +141,6 @@ h11==0.14.0
|
||||
# via httpcore
|
||||
html2text==2024.2.26
|
||||
# via scrapegraphai
|
||||
htmldate==1.8.1
|
||||
# via trafilatura
|
||||
httpcore==1.0.5
|
||||
# via httpx
|
||||
httplib2==0.22.0
|
||||
@ -181,8 +170,6 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
justext==3.0.1
|
||||
# via trafilatura
|
||||
langchain==0.1.15
|
||||
# via scrapegraphai
|
||||
langchain-anthropic==0.1.11
|
||||
@ -220,12 +207,6 @@ langsmith==0.1.63
|
||||
# via langchain-core
|
||||
lxml==5.2.2
|
||||
# via free-proxy
|
||||
# via htmldate
|
||||
# via justext
|
||||
# via lxml-html-clean
|
||||
# via trafilatura
|
||||
lxml-html-clean==0.1.1
|
||||
# via lxml
|
||||
marshmallow==3.21.2
|
||||
# via dataclasses-json
|
||||
minify-html==0.15.0
|
||||
@ -298,14 +279,11 @@ pyparsing==3.1.2
|
||||
# via httplib2
|
||||
python-dateutil==2.9.0.post0
|
||||
# via botocore
|
||||
# via dateparser
|
||||
# via google-cloud-bigquery
|
||||
# via htmldate
|
||||
# via pandas
|
||||
python-dotenv==1.0.1
|
||||
# via scrapegraphai
|
||||
pytz==2024.1
|
||||
# via dateparser
|
||||
# via pandas
|
||||
pyyaml==6.0.1
|
||||
# via huggingface-hub
|
||||
@ -313,7 +291,6 @@ pyyaml==6.0.1
|
||||
# via langchain-community
|
||||
# via langchain-core
|
||||
regex==2024.5.15
|
||||
# via dateparser
|
||||
# via tiktoken
|
||||
requests==2.32.2
|
||||
# via free-proxy
|
||||
@ -351,11 +328,9 @@ tenacity==8.3.0
|
||||
# via langchain
|
||||
# via langchain-community
|
||||
# via langchain-core
|
||||
tiktoken==0.6.0
|
||||
tiktoken==0.7.0
|
||||
# via langchain-openai
|
||||
# via scrapegraphai
|
||||
tld==0.13
|
||||
# via courlan
|
||||
tokenizers==0.19.1
|
||||
# via anthropic
|
||||
tqdm==4.66.4
|
||||
@ -364,8 +339,6 @@ tqdm==4.66.4
|
||||
# via openai
|
||||
# via scrapegraphai
|
||||
# via semchunk
|
||||
trafilatura==1.10.0
|
||||
# via scrapegraphai
|
||||
typing-extensions==4.12.0
|
||||
# via anthropic
|
||||
# via anyio
|
||||
@ -382,17 +355,12 @@ typing-inspect==0.9.0
|
||||
# via dataclasses-json
|
||||
tzdata==2024.1
|
||||
# via pandas
|
||||
tzlocal==5.2
|
||||
# via dateparser
|
||||
undetected-playwright==0.3.0
|
||||
# via scrapegraphai
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==1.26.18
|
||||
# via botocore
|
||||
# via courlan
|
||||
# via htmldate
|
||||
# via requests
|
||||
# via trafilatura
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
|
||||
@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph
|
||||
from .script_creator_multi_graph import ScriptCreatorMultiGraph
|
||||
from .markdown_scraper_graph import MDScraperGraph
|
||||
from .markdown_scraper_multi_graph import MDScraperMultiGraph
|
||||
from .search_link_graph import SearchLinkGraph
|
||||
|
||||
@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
RAGNode,
|
||||
GenerateAnswerCSVNode
|
||||
)
|
||||
|
||||
@ -37,14 +36,7 @@ class CSVScraperGraph(AbstractGraph):
|
||||
input="csv | csv_dir",
|
||||
output=["doc"],
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & doc",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model,
|
||||
}
|
||||
)
|
||||
|
||||
generate_answer_node = GenerateAnswerCSVNode(
|
||||
input="user_prompt & (relevant_chunks | doc)",
|
||||
output=["answer"],
|
||||
@ -58,12 +50,10 @@ class CSVScraperGraph(AbstractGraph):
|
||||
return BaseGraph(
|
||||
nodes=[
|
||||
fetch_node,
|
||||
rag_node,
|
||||
generate_answer_node,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, rag_node),
|
||||
(rag_node, generate_answer_node)
|
||||
(fetch_node, generate_answer_node)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
|
||||
@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
RAGNode,
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
@ -62,14 +61,7 @@ class JSONScraperGraph(AbstractGraph):
|
||||
input="json | json_dir",
|
||||
output=["doc", "link_urls", "img_urls"],
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
|
||||
generate_answer_node = GenerateAnswerNode(
|
||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
||||
output=["answer"],
|
||||
@ -83,12 +75,10 @@ class JSONScraperGraph(AbstractGraph):
|
||||
return BaseGraph(
|
||||
nodes=[
|
||||
fetch_node,
|
||||
rag_node,
|
||||
generate_answer_node,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, rag_node),
|
||||
(rag_node, generate_answer_node)
|
||||
(fetch_node, generate_answer_node)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
|
||||
@ -3,7 +3,7 @@ import logging
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
|
||||
from ..nodes import FetchNode, ParseNode, GenerateAnswerNode
|
||||
|
||||
class MDScraperGraph(AbstractGraph):
|
||||
"""
|
||||
@ -63,14 +63,6 @@ class MDScraperGraph(AbstractGraph):
|
||||
"chunk_size": self.model_token
|
||||
}
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
generate_answer_node = GenerateAnswerNode(
|
||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
||||
output=["answer"],
|
||||
@ -86,13 +78,11 @@ class MDScraperGraph(AbstractGraph):
|
||||
nodes=[
|
||||
fetch_node,
|
||||
parse_node,
|
||||
rag_node,
|
||||
generate_answer_node,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, parse_node),
|
||||
(parse_node, rag_node),
|
||||
(rag_node, generate_answer_node)
|
||||
(parse_node, generate_answer_node)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
|
||||
@ -12,7 +12,6 @@ from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
ImageToTextNode,
|
||||
RAGNode,
|
||||
GenerateAnswerOmniNode
|
||||
)
|
||||
|
||||
@ -89,14 +88,7 @@ class OmniScraperGraph(AbstractGraph):
|
||||
"max_images": self.max_images
|
||||
}
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
|
||||
generate_answer_omni_node = GenerateAnswerOmniNode(
|
||||
input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
|
||||
output=["answer"],
|
||||
@ -112,14 +104,12 @@ class OmniScraperGraph(AbstractGraph):
|
||||
fetch_node,
|
||||
parse_node,
|
||||
image_to_text_node,
|
||||
rag_node,
|
||||
generate_answer_omni_node,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, parse_node),
|
||||
(parse_node, image_to_text_node),
|
||||
(image_to_text_node, rag_node),
|
||||
(rag_node, generate_answer_omni_node)
|
||||
(image_to_text_node, generate_answer_omni_node)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
@ -136,4 +126,4 @@ class OmniScraperGraph(AbstractGraph):
|
||||
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
|
||||
self.final_state, self.execution_info = self.graph.execute(inputs)
|
||||
|
||||
return self.final_state.get("answer", "No answer found.")
|
||||
return self.final_state.get("answer", "No answer found.")
|
||||
|
||||
@ -12,7 +12,6 @@ from .abstract_graph import AbstractGraph
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
RAGNode,
|
||||
GenerateAnswerPDFNode
|
||||
)
|
||||
|
||||
@ -76,14 +75,6 @@ class PDFScraperGraph(AbstractGraph):
|
||||
}
|
||||
)
|
||||
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
generate_answer_node_pdf = GenerateAnswerPDFNode(
|
||||
input="user_prompt & (relevant_chunks | doc)",
|
||||
output=["answer"],
|
||||
@ -98,13 +89,11 @@ class PDFScraperGraph(AbstractGraph):
|
||||
nodes=[
|
||||
fetch_node,
|
||||
parse_node,
|
||||
rag_node,
|
||||
generate_answer_node_pdf,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, parse_node),
|
||||
(parse_node, rag_node),
|
||||
(rag_node, generate_answer_node_pdf)
|
||||
(parse_node, generate_answer_node_pdf)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
|
||||
104
scrapegraphai/graphs/search_link_graph.py
Normal file
104
scrapegraphai/graphs/search_link_graph.py
Normal file
@ -0,0 +1,104 @@
|
||||
""" SearchLinkGraph Module """
|
||||
from typing import Optional
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
from .base_graph import BaseGraph
|
||||
from .abstract_graph import AbstractGraph
|
||||
|
||||
|
||||
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
|
||||
|
||||
class SearchLinkGraph(AbstractGraph):
    """
    SearchLinkGraph is a pipeline that fetches a source, parses it and then
    runs a SearchLinkNode over the fetched document.

    Unlike the prompt-driven scraper graphs, this graph takes no user prompt:
    the constructor always forwards an empty string as the prompt to
    AbstractGraph.

    Attributes:
        prompt (str): Always the empty string for this graph.
        source (str): The source passed to the constructor.
        config (dict): Configuration parameters for the graph. The optional
            keys "force", "cut" and "loader_kwargs" are forwarded to the
            fetch node.
        input_key (str): "url" when the source starts with "http",
            otherwise "local_dir".
        llm_model: Language model client handed to the fetch and search-link
            nodes (presumably built by AbstractGraph from ``config`` -
            confirm there).

    Args:
        source (str): The source of the graph (a URL or a local directory).
        config (dict): Configuration parameters for the graph.
        schema (BaseModel, optional): The schema for the graph output. Defaults to None.

    Example:
        >>> search_link_graph = SearchLinkGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_link_graph.run()
    """

    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
        # This graph has no user prompt, so an empty string is passed in the
        # prompt position of the base-class constructor.
        super().__init__("", config, source, schema)

        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes: fetch -> parse -> search link.

        Returns:
            BaseGraph: A graph instance with the fetch node as entry point.
        """

        fetch_node = FetchNode(
            input="url| local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        # NOTE(review): this node declares the same output key ("parsed_doc")
        # as parse_node and takes "doc" as input, so the parse node's result
        # is presumably overwritten by this node - confirm that is intended.
        search_link_node = SearchLinkNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                search_link_node
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, search_link_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the graph and returns what it stored under "parsed_doc".

        The final state and the execution info returned by the graph are kept
        on the instance as ``final_state`` and ``execution_info``.

        Returns:
            The value of the "parsed_doc" entry of the final state, or the
            string "No answer found." when that entry is missing.
            NOTE(review): the annotation says ``str``, but the search-link
            node appears to store a list of links under this key - confirm.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No answer found.")
|
||||
@ -11,7 +11,6 @@ from .abstract_graph import AbstractGraph
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
ParseNode,
|
||||
RAGNode,
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
@ -78,14 +77,7 @@ class SmartScraperGraph(AbstractGraph):
|
||||
"chunk_size": self.model_token
|
||||
}
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & (parsed_doc | doc)",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
|
||||
generate_answer_node = GenerateAnswerNode(
|
||||
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
|
||||
output=["answer"],
|
||||
@ -100,13 +92,11 @@ class SmartScraperGraph(AbstractGraph):
|
||||
nodes=[
|
||||
fetch_node,
|
||||
parse_node,
|
||||
rag_node,
|
||||
generate_answer_node,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, parse_node),
|
||||
(parse_node, rag_node),
|
||||
(rag_node, generate_answer_node)
|
||||
(parse_node, generate_answer_node)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
|
||||
@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph
|
||||
|
||||
from ..nodes import (
|
||||
FetchNode,
|
||||
RAGNode,
|
||||
GenerateAnswerNode
|
||||
)
|
||||
|
||||
@ -64,14 +63,7 @@ class XMLScraperGraph(AbstractGraph):
|
||||
input="xml | xml_dir",
|
||||
output=["doc", "link_urls", "img_urls"]
|
||||
)
|
||||
rag_node = RAGNode(
|
||||
input="user_prompt & doc",
|
||||
output=["relevant_chunks"],
|
||||
node_config={
|
||||
"llm_model": self.llm_model,
|
||||
"embedder_model": self.embedder_model
|
||||
}
|
||||
)
|
||||
|
||||
generate_answer_node = GenerateAnswerNode(
|
||||
input="user_prompt & (relevant_chunks | doc)",
|
||||
output=["answer"],
|
||||
@ -85,12 +77,10 @@ class XMLScraperGraph(AbstractGraph):
|
||||
return BaseGraph(
|
||||
nodes=[
|
||||
fetch_node,
|
||||
rag_node,
|
||||
generate_answer_node,
|
||||
],
|
||||
edges=[
|
||||
(fetch_node, rag_node),
|
||||
(rag_node, generate_answer_node)
|
||||
(fetch_node, generate_answer_node)
|
||||
],
|
||||
entry_point=fetch_node,
|
||||
graph_name=self.__class__.__name__
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
bedrock configuration wrapper
|
||||
Bedrock Module
|
||||
"""
|
||||
from langchain_aws import ChatBedrock
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Ollama Module
|
||||
Ernie Module
|
||||
"""
|
||||
from langchain_community.chat_models import ErnieBotChat
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
"""
|
||||
OpenAI Module
|
||||
OneAPI Module
|
||||
"""
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
|
||||
@ -125,7 +125,7 @@ class GenerateAnswerCSVNode(BaseNode):
|
||||
template=template_no_chunks_csv_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={
|
||||
"context": chunk.page_content,
|
||||
"context": chunk,
|
||||
"format_instructions": format_instructions,
|
||||
},
|
||||
)
|
||||
@ -137,7 +137,7 @@ class GenerateAnswerCSVNode(BaseNode):
|
||||
template=template_chunks_csv_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={
|
||||
"context": chunk.page_content,
|
||||
"context": chunk,
|
||||
"chunk_id": i + 1,
|
||||
"format_instructions": format_instructions,
|
||||
},
|
||||
|
||||
@ -115,7 +115,7 @@ class GenerateAnswerNode(BaseNode):
|
||||
prompt = PromptTemplate(
|
||||
template=template_no_chunks_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={"context": chunk.page_content,
|
||||
partial_variables={"context": chunk,
|
||||
"format_instructions": format_instructions})
|
||||
chain = prompt | self.llm_model | output_parser
|
||||
answer = chain.invoke({"question": user_prompt})
|
||||
@ -124,7 +124,7 @@ class GenerateAnswerNode(BaseNode):
|
||||
prompt = PromptTemplate(
|
||||
template=template_chunks_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={"context": chunk.page_content,
|
||||
partial_variables={"context": chunk,
|
||||
"chunk_id": i + 1,
|
||||
"format_instructions": format_instructions})
|
||||
# Dynamically name the chains based on their index
|
||||
|
||||
@ -110,7 +110,7 @@ class GenerateAnswerOmniNode(BaseNode):
|
||||
template=template_no_chunk_omni_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={
|
||||
"context": chunk.page_content,
|
||||
"context": chunk,
|
||||
"format_instructions": format_instructions,
|
||||
"img_desc": imag_desc,
|
||||
},
|
||||
@ -123,7 +123,7 @@ class GenerateAnswerOmniNode(BaseNode):
|
||||
template=template_chunks_omni_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={
|
||||
"context": chunk.page_content,
|
||||
"context": chunk,
|
||||
"chunk_id": i + 1,
|
||||
"format_instructions": format_instructions,
|
||||
},
|
||||
|
||||
@ -124,7 +124,7 @@ class GenerateAnswerPDFNode(BaseNode):
|
||||
template=template_no_chunks_pdf_prompt,
|
||||
input_variables=["question"],
|
||||
partial_variables={
|
||||
"context":chunk.page_content,
|
||||
"context":chunk,
|
||||
"format_instructions": format_instructions,
|
||||
},
|
||||
)
|
||||
|
||||
@ -4,6 +4,7 @@ SearchLinkNode Module
|
||||
|
||||
# Imports from standard library
|
||||
from typing import List, Optional
|
||||
import re
|
||||
from tqdm import tqdm
|
||||
|
||||
# Imports from Langchain
|
||||
@ -20,7 +21,7 @@ from .base_node import BaseNode
|
||||
class SearchLinkNode(BaseNode):
|
||||
"""
|
||||
A node that can filter out the relevant links in the webpage content for the user prompt.
|
||||
Node expects the aleready scrapped links on the webpage and hence it is expected
|
||||
Node expects the already scrapped links on the webpage and hence it is expected
|
||||
that this node be used after the FetchNode.
|
||||
|
||||
Attributes:
|
||||
@ -67,39 +68,10 @@ class SearchLinkNode(BaseNode):
|
||||
|
||||
self.logger.info(f"--- Executing {self.node_name} Node ---")
|
||||
|
||||
# Interpret input keys based on the provided input expression
|
||||
input_keys = self.get_input_keys(state)
|
||||
|
||||
user_prompt = state[input_keys[0]]
|
||||
parsed_content_chunks = state[input_keys[1]]
|
||||
parsed_content_chunks = state.get("doc")
|
||||
output_parser = JsonOutputParser()
|
||||
|
||||
prompt_relevant_links = """
|
||||
You are a website scraper and you have just scraped the following content from a website.
|
||||
Content: {content}
|
||||
|
||||
You are now tasked with identifying all hyper links within the content that are potentially
|
||||
relevant to the user task: {user_prompt}
|
||||
|
||||
Assume relevance broadly, including any links that might be related or potentially useful
|
||||
in relation to the task.
|
||||
|
||||
Sort it in order of importance, the first one should be the most important one, the last one
|
||||
the least important
|
||||
|
||||
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
|
||||
whether the content at the link is directly relevant.
|
||||
|
||||
Output only a list of relevant links in the format:
|
||||
[
|
||||
"link1",
|
||||
"link2",
|
||||
"link3",
|
||||
.
|
||||
.
|
||||
.
|
||||
]
|
||||
"""
|
||||
relevant_links = []
|
||||
|
||||
for i, chunk in enumerate(
|
||||
@ -109,15 +81,47 @@ class SearchLinkNode(BaseNode):
|
||||
disable=not self.verbose,
|
||||
)
|
||||
):
|
||||
merge_prompt = PromptTemplate(
|
||||
template=prompt_relevant_links,
|
||||
input_variables=["content", "user_prompt"],
|
||||
)
|
||||
merge_chain = merge_prompt | self.llm_model | output_parser
|
||||
# merge_chain = merge_prompt | self.llm_model
|
||||
answer = merge_chain.invoke(
|
||||
{"content": chunk.page_content, "user_prompt": user_prompt}
|
||||
)
|
||||
relevant_links += answer
|
||||
try:
|
||||
# Primary approach: Regular expression to extract links
|
||||
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
|
||||
|
||||
relevant_links += links
|
||||
except Exception as e:
|
||||
# Fallback approach: Using the LLM to extract links
|
||||
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
|
||||
prompt_relevant_links = """
|
||||
You are a website scraper and you have just scraped the following content from a website.
|
||||
Content: {content}
|
||||
|
||||
Assume relevance broadly, including any links that might be related or potentially useful
|
||||
in relation to the task.
|
||||
|
||||
Sort it in order of importance, the first one should be the most important one, the last one
|
||||
the least important
|
||||
|
||||
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
|
||||
whether the content at the link is directly relevant.
|
||||
|
||||
Output only a list of relevant links in the format:
|
||||
[
|
||||
"link1",
|
||||
"link2",
|
||||
"link3",
|
||||
.
|
||||
.
|
||||
.
|
||||
]
|
||||
"""
|
||||
|
||||
merge_prompt = PromptTemplate(
|
||||
template=prompt_relevant_links,
|
||||
input_variables=["content", "user_prompt"],
|
||||
)
|
||||
merge_chain = merge_prompt | self.llm_model | output_parser
|
||||
answer = merge_chain.invoke(
|
||||
{"content": chunk.page_content}
|
||||
)
|
||||
relevant_links += answer
|
||||
|
||||
state.update({self.output[0]: relevant_links})
|
||||
return state
|
||||
|
||||
@ -2,8 +2,6 @@
|
||||
convert_to_md modul
|
||||
"""
|
||||
import html2text
|
||||
from trafilatura import extract
|
||||
|
||||
|
||||
def convert_to_md(html):
|
||||
""" Convert HTML to Markdown.
|
||||
@ -20,6 +18,6 @@ def convert_to_md(html):
|
||||
'This is a paragraph.\n\n# This is a heading.'
|
||||
|
||||
Note: All the styles and links are ignored during the conversion. """
|
||||
|
||||
return extract(filecontent=html,include_images=True,
|
||||
include_links=True, include_tables=True, output_format="markdown")
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = False
|
||||
return h.handle(html)
|
||||
|
||||
@ -1,6 +1,3 @@
|
||||
"""
|
||||
research web module
|
||||
"""
|
||||
import re
|
||||
from typing import List
|
||||
from langchain_community.tools import DuckDuckGoSearchResults
|
||||
@ -8,41 +5,39 @@ from googlesearch import search as google_search
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
|
||||
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]:
|
||||
"""
|
||||
Searches the web for a given query using specified search engine options.
|
||||
|
||||
Args:
|
||||
query (str): The search query to find on the internet.
|
||||
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'.
|
||||
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
|
||||
max_results (int, optional): The maximum number of search results to return.
|
||||
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of URLs as strings that are the search results.
|
||||
|
||||
Raises:
|
||||
ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
|
||||
ValueError: If the search engine specified is not supported.
|
||||
|
||||
Example:
|
||||
>>> search_on_web("example query", search_engine="Google", max_results=5)
|
||||
['http://example.com', 'http://example.org', ...]
|
||||
|
||||
This function allows switching between Google, DuckDuckGo, and Bing to perform
|
||||
internet searches, returning a list of result URLs.
|
||||
"""
|
||||
|
||||
|
||||
if search_engine.lower() == "google":
|
||||
res = []
|
||||
for url in google_search(query, stop=max_results):
|
||||
res.append(url)
|
||||
return res
|
||||
|
||||
|
||||
elif search_engine.lower() == "duckduckgo":
|
||||
research = DuckDuckGoSearchResults(max_results=max_results)
|
||||
res = research.run(query)
|
||||
links = re.findall(r'https?://[^\s,\]]+', res)
|
||||
return links
|
||||
|
||||
|
||||
elif search_engine.lower() == "bing":
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
|
||||
response = requests.get(search_url, headers=headers)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
|
||||
search_results = []
|
||||
for result in soup.find_all('li', class_='b_algo', limit=max_results):
|
||||
link = result.find('a')['href']
|
||||
search_results.append(link)
|
||||
return search_results
|
||||
|
||||
raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
|
||||
|
||||
elif search_engine.lower() == "searxng":
|
||||
url = f"http://localhost:{port}"
|
||||
params = {"q": query, "format": "json"}
|
||||
|
||||
# Send the GET request to the server
|
||||
response = requests.get(url, params=params)
|
||||
|
||||
# Parse the response and limit to the specified max_results
|
||||
data = response.json()
|
||||
limited_results = data["results"][:max_results]
|
||||
return limited_results
|
||||
|
||||
else:
|
||||
raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
|
||||
|
||||
26
tests/graphs/search_link_ollama.py
Normal file
26
tests/graphs/search_link_ollama.py
Normal file
@ -0,0 +1,26 @@
|
||||
from scrapegraphai.graphs import SearchLinkGraph
|
||||
from scrapegraphai.utils import prettify_exec_info
|
||||
|
||||
def test_smart_scraper_pipeline():
    """Run a SearchLinkGraph end to end and check that it produced a result.

    The graph is configured with the "ollama/llama3" chat model and the
    "ollama/nomic-embed-text" embedding model, and is pointed at a live URL,
    so this presumably needs a reachable Ollama server and network access.
    """
    graph_config = {
        "llm": {
            "model": "ollama/llama3",
            "temperature": 0,
            "format": "json",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "temperature": 0,
        },
        "verbose": True,
        "headless": False
    }

    search_link_graph = SearchLinkGraph(
        source="https://sport.sky.it/nba?gr=www",
        config=graph_config
    )

    result = search_link_graph.run()

    assert result is not None
    # SearchLinkGraph.run() returns this sentinel string (not None) when the
    # final state holds no result, so the None check alone cannot catch that.
    assert result != "No answer found."
|
||||
@ -7,7 +7,7 @@ def test_basic_html_to_md():
|
||||
|
||||
def test_html_with_links_and_images():
|
||||
html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
|
||||
assert convert_to_md(html) is None
|
||||
assert convert_to_md(html) is not None
|
||||
|
||||
def test_html_with_tables():
|
||||
html = '''
|
||||
@ -17,11 +17,11 @@ def test_html_with_tables():
|
||||
<tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
|
||||
</table>
|
||||
'''
|
||||
assert convert_to_md(html) is None
|
||||
assert convert_to_md(html) is not None
|
||||
|
||||
def test_empty_html():
|
||||
html = ""
|
||||
assert convert_to_md(html) is None
|
||||
assert convert_to_md(html) is not None
|
||||
|
||||
def test_complex_html_structure():
|
||||
html = '''
|
||||
|
||||
Loading…
Reference in New Issue
Block a user