Merge branch 'pre/beta' into add-openai-supported-model-gpt-4o-mini

This commit is contained in:
Marco Vinciguerra 2024-07-19 10:08:41 +02:00 committed by GitHub
commit 3a9d556e1b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
38 changed files with 778 additions and 277 deletions

View File

@ -1,47 +1,5 @@
## [1.9.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0...v1.9.1) (2024-07-12)
### Bug Fixes
* solve a burr integration ([881290b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/881290b5066b39c505532656671fbf65f8fc312c))
## [1.9.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.8.0...v1.9.0) (2024-07-09)
### Features
* add fireworks integration ([df0e310](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/df0e3108299071b849d7e055bd11d72764d24f08))
* add integration for infos ([3bf5f57](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3bf5f570a8f8e1b037a7ad3c9f583261a1536421))
* add integrations for markdown files ([2804434](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2804434a9ee12c52ae8956a88b1778a4dd3ec32f))
* add vertexai integration ([119514b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/119514bdfc2a16dfb8918b0c34ae7cc43a01384c))
* improve md prompt recognition ([5fe694b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5fe694b6b4545a5091d16110318b992acfca4f58))
### Bug Fixes
* add test ([3a537ee](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3a537eec6fef1743924a9aa5cef0ba2f8d44bf11))
* fix pyproject.toml ([7570bf8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7570bf8294e49bc54ec9e296aaadb763873390ca))
### chore
* **Docker:** fix port number ([afeb81f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/afeb81f77a884799192d79dcac85666190fb1c9d))
* **CI:** fix pylint workflow ([583c321](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/583c32106e827f50235d8fc69511652fd4b07a35))
* **rye:** rebuild lockfiles ([27c2dd2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/27c2dd23517a7e4b14fafd00320a8b81f73145dc))
### Docs
* **roadmap:** fix urls ([14faba4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/14faba4f00dd9f947f8dc5e0b51be49ea684179f))
* **roadmap:** next steps ([3e644f4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e644f498f05eb505fbd4e94b144c81567569aaa))
### CI
* **release:** 1.8.1-beta.1 [skip ci] ([8f9f96f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8f9f96f7e7ff41d2fff5bbbf18bf4fc85d4f98b3))
* **release:** 1.9.0-beta.1 [skip ci] ([146432d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/146432d476f775510441b062935adc47190141e2))
* **release:** 1.9.0-beta.2 [skip ci] ([5cb5fbf](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5cb5fbf5503eec9b34a6691eb993716cc9a821d6))
## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-07-05)

View File

@ -0,0 +1,57 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
# ************************************************
# Load the environment and build the Azure model clients
# ************************************************

load_dotenv()

# The API version and deployment names are read with os.environ[...],
# so a missing variable raises KeyError right here.
llm_model_instance = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
)

embedder_model_instance = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

# ************************************************
# Define the configuration for the graph
# ************************************************

# Pre-built model instances are handed over instead of model names.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

load_dotenv()

# os.getenv returns None when GROQ_APIKEY is not set.
groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0,
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "headless": False,
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,45 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        # NOTE(review): "client_name" looks like a placeholder to be
        # replaced before running -- confirm the expected value.
        "client": "client_name",
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

load_dotenv()

# os.getenv returns None when DEEPSEEK_APIKEY is not set.
deepseek_key = os.getenv("DEEPSEEK_APIKEY")

graph_config = {
    "llm": {
        "model": "deepseek-chat",
        "openai_api_key": deepseek_key,
        "openai_api_base": "https://api.deepseek.com/v1",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "verbose": True,
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -12,15 +12,18 @@ load_dotenv()
# Define the configuration for the graph
# ************************************************
openai_key = os.getenv("OPENAI_APIKEY")
graph_config = {
"llm": {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
"max_results": 2,
"verbose": True,
"model": "ernie-bot-turbo",
"ernie_client_id": "<ernie_client_id>",
"ernie_client_secret": "<ernie_client_secret>",
"temperature": 0.1
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434"},
"library": "beautifulsoup"
}
# ************************************************

View File

@ -0,0 +1,46 @@
"""
Example of Search Graph
"""
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ernie-bot-turbo",
        # The two angle-bracket values are placeholders: substitute real
        # credentials before running the example.
        "ernie_client_id": "<ernie_client_id>",
        "ernie_client_secret": "<ernie_client_secret>",
        "temperature": 0.1,
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    },
    "library": "beautifulsoup",
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

load_dotenv()

# os.getenv returns None when FIREWORKS_APIKEY is not set.
fireworks_api_key = os.getenv("FIREWORKS_APIKEY")

graph_config = {
    "llm": {
        "api_key": fireworks_api_key,
        "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "max_results": 2,
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,44 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

load_dotenv()

# os.getenv returns None when GOOGLE_APIKEY is not set.
gemini_key = os.getenv("GOOGLE_APIKEY")

graph_config = {
    "llm": {
        "api_key": gemini_key,
        "model": "gemini-pro",
    },
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,52 @@
"""
Example of Search Graph
"""
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

load_dotenv()

# os.getenv returns None when GROQ_APIKEY is not set.
groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0,
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "headless": False,
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -9,7 +9,6 @@ from scrapegraphai.utils import prettify_exec_info
load_dotenv()
# ************************************************
# Define the configuration for the graph
# ************************************************

View File

@ -0,0 +1,54 @@
"""
Example of Search Graph
"""
import os
from scrapegraphai.graphs import SearchGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# ************************************************
# Define the configuration for the graph
# ************************************************

# The token is read straight from the process environment; os.getenv
# returns None when HUGGINGFACEHUB_API_TOKEN is not set.
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# Pre-built model instances are handed over instead of model names.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

# ************************************************
# Create the SearchGraph instance and run it
# ************************************************

search_graph = SearchGraph(
    prompt="List me the best excursions near Trento",
    config=graph_config,
)

result = search_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to csv and json
convert_to_csv(result, "result")
convert_to_json(result, "result")

View File

@ -0,0 +1,43 @@
"""
Basic example of a link-extraction pipeline using SearchLinkGraph
"""
from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SearchLinkGraph instance and run it
# ************************************************

# SearchLinkGraph is built from a source and a config only; no prompt
# is passed.
search_link_graph = SearchLinkGraph(
    source="https://sport.sky.it/nba?gr=www",
    config=graph_config,
)

result = search_link_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_link_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -0,0 +1,36 @@
"""
Basic example of a link-extraction pipeline using SearchLinkGraph
"""
import os

from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info

# ************************************************
# Define the configuration for the graph
# ************************************************

# Read the key from the environment instead of committing a dummy literal;
# os.getenv returns None when OPENAI_APIKEY is not set.
openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SearchLinkGraph instance and run it
# ************************************************

# SearchLinkGraph is built from a source and a config only; no prompt
# is passed.
search_link_graph = SearchLinkGraph(
    source="https://sport.sky.it/nba?gr=www",
    config=graph_config,
)

result = search_link_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = search_link_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

View File

@ -2,7 +2,9 @@
name = "scrapegraphai"
version = "1.9.1"
version = "1.9.0b6"
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
@ -24,7 +26,7 @@ dependencies = [
"beautifulsoup4==4.12.3",
"pandas==2.2.2",
"python-dotenv==1.0.1",
"tiktoken==0.6.0",
"tiktoken==0.7",
"tqdm==4.66.4",
"graphviz==0.20.3",
"minify-html==0.15.0",
@ -34,7 +36,6 @@ dependencies = [
"undetected-playwright==0.3.0",
"semchunk==1.0.1",
"html2text==2024.2.26",
"trafilatura==1.10.0",
"langchain-fireworks==0.1.3"
]

View File

@ -41,7 +41,6 @@ attrs==23.2.0
# via jsonschema
# via referencing
babel==2.15.0
# via courlan
# via sphinx
beautifulsoup4==4.12.3
# via furo
@ -63,11 +62,8 @@ certifi==2024.2.2
# via httpcore
# via httpx
# via requests
# via trafilatura
charset-normalizer==3.3.2
# via htmldate
# via requests
# via trafilatura
click==8.1.7
# via burr
# via streamlit
@ -75,15 +71,11 @@ click==8.1.7
# via uvicorn
contourpy==1.2.1
# via matplotlib
courlan==1.2.0
# via trafilatura
cycler==0.12.1
# via matplotlib
dataclasses-json==0.6.6
# via langchain
# via langchain-community
dateparser==1.2.0
# via htmldate
defusedxml==0.7.1
# via langchain-anthropic
dill==0.3.8
@ -204,8 +196,6 @@ h11==0.14.0
# via uvicorn
html2text==2024.2.26
# via scrapegraphai
htmldate==1.8.1
# via trafilatura
httpcore==1.0.5
# via httpx
httplib2==0.22.0
@ -259,8 +249,6 @@ jsonschema==4.22.0
# via altair
jsonschema-specifications==2023.12.1
# via jsonschema
justext==3.0.1
# via trafilatura
kiwisolver==1.4.5
# via matplotlib
langchain==0.1.15
@ -302,12 +290,6 @@ loguru==0.7.2
# via burr
lxml==5.2.2
# via free-proxy
# via htmldate
# via justext
# via lxml-html-clean
# via trafilatura
lxml-html-clean==0.1.1
# via lxml
markdown-it-py==3.0.0
# via rich
markupsafe==2.1.5
@ -430,9 +412,7 @@ pytest==8.0.0
pytest-mock==3.14.0
python-dateutil==2.9.0.post0
# via botocore
# via dateparser
# via google-cloud-bigquery
# via htmldate
# via matplotlib
# via pandas
python-dotenv==1.0.1
@ -441,7 +421,6 @@ python-dotenv==1.0.1
python-multipart==0.0.9
# via fastapi
pytz==2024.1
# via dateparser
# via pandas
pyyaml==6.0.1
# via huggingface-hub
@ -453,7 +432,6 @@ referencing==0.35.1
# via jsonschema
# via jsonschema-specifications
regex==2024.5.15
# via dateparser
# via tiktoken
requests==2.32.2
# via burr
@ -531,11 +509,9 @@ tenacity==8.3.0
# via langchain-community
# via langchain-core
# via streamlit
tiktoken==0.6.0
tiktoken==0.7.0
# via langchain-openai
# via scrapegraphai
tld==0.13
# via courlan
tokenizers==0.19.1
# via anthropic
toml==0.10.2
@ -555,8 +531,6 @@ tqdm==4.66.4
# via openai
# via scrapegraphai
# via semchunk
trafilatura==1.10.0
# via scrapegraphai
typer==0.12.3
# via fastapi-cli
typing-extensions==4.12.0
@ -586,8 +560,6 @@ typing-inspect==0.9.0
# via sf-hamilton
tzdata==2024.1
# via pandas
tzlocal==5.2
# via dateparser
ujson==5.10.0
# via fastapi
undetected-playwright==0.3.0
@ -596,10 +568,7 @@ uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.18
# via botocore
# via courlan
# via htmldate
# via requests
# via trafilatura
uvicorn==0.29.0
# via burr
# via fastapi

View File

@ -28,8 +28,6 @@ async-timeout==4.0.3
# via langchain
attrs==23.2.0
# via aiohttp
babel==2.15.0
# via courlan
beautifulsoup4==4.12.3
# via google
# via scrapegraphai
@ -44,18 +42,11 @@ certifi==2024.2.2
# via httpcore
# via httpx
# via requests
# via trafilatura
charset-normalizer==3.3.2
# via htmldate
# via requests
# via trafilatura
courlan==1.2.0
# via trafilatura
dataclasses-json==0.6.6
# via langchain
# via langchain-community
dateparser==1.2.0
# via htmldate
defusedxml==0.7.1
# via langchain-anthropic
distro==1.9.0
@ -150,8 +141,6 @@ h11==0.14.0
# via httpcore
html2text==2024.2.26
# via scrapegraphai
htmldate==1.8.1
# via trafilatura
httpcore==1.0.5
# via httpx
httplib2==0.22.0
@ -181,8 +170,6 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
justext==3.0.1
# via trafilatura
langchain==0.1.15
# via scrapegraphai
langchain-anthropic==0.1.11
@ -220,12 +207,6 @@ langsmith==0.1.63
# via langchain-core
lxml==5.2.2
# via free-proxy
# via htmldate
# via justext
# via lxml-html-clean
# via trafilatura
lxml-html-clean==0.1.1
# via lxml
marshmallow==3.21.2
# via dataclasses-json
minify-html==0.15.0
@ -298,14 +279,11 @@ pyparsing==3.1.2
# via httplib2
python-dateutil==2.9.0.post0
# via botocore
# via dateparser
# via google-cloud-bigquery
# via htmldate
# via pandas
python-dotenv==1.0.1
# via scrapegraphai
pytz==2024.1
# via dateparser
# via pandas
pyyaml==6.0.1
# via huggingface-hub
@ -313,7 +291,6 @@ pyyaml==6.0.1
# via langchain-community
# via langchain-core
regex==2024.5.15
# via dateparser
# via tiktoken
requests==2.32.2
# via free-proxy
@ -351,11 +328,9 @@ tenacity==8.3.0
# via langchain
# via langchain-community
# via langchain-core
tiktoken==0.6.0
tiktoken==0.7.0
# via langchain-openai
# via scrapegraphai
tld==0.13
# via courlan
tokenizers==0.19.1
# via anthropic
tqdm==4.66.4
@ -364,8 +339,6 @@ tqdm==4.66.4
# via openai
# via scrapegraphai
# via semchunk
trafilatura==1.10.0
# via scrapegraphai
typing-extensions==4.12.0
# via anthropic
# via anyio
@ -382,17 +355,12 @@ typing-inspect==0.9.0
# via dataclasses-json
tzdata==2024.1
# via pandas
tzlocal==5.2
# via dateparser
undetected-playwright==0.3.0
# via scrapegraphai
uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.18
# via botocore
# via courlan
# via htmldate
# via requests
# via trafilatura
yarl==1.9.4
# via aiohttp

View File

@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph
from .script_creator_multi_graph import ScriptCreatorMultiGraph
from .markdown_scraper_graph import MDScraperGraph
from .markdown_scraper_multi_graph import MDScraperMultiGraph
from .search_link_graph import SearchLinkGraph

View File

@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
RAGNode,
GenerateAnswerCSVNode
)
@ -37,14 +36,7 @@ class CSVScraperGraph(AbstractGraph):
input="csv | csv_dir",
output=["doc"],
)
rag_node = RAGNode(
input="user_prompt & doc",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model,
}
)
generate_answer_node = GenerateAnswerCSVNode(
input="user_prompt & (relevant_chunks | doc)",
output=["answer"],
@ -58,12 +50,10 @@ class CSVScraperGraph(AbstractGraph):
return BaseGraph(
nodes=[
fetch_node,
rag_node,
generate_answer_node,
],
edges=[
(fetch_node, rag_node),
(rag_node, generate_answer_node)
(fetch_node, generate_answer_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__

View File

@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
RAGNode,
GenerateAnswerNode
)
@ -62,14 +61,7 @@ class JSONScraperGraph(AbstractGraph):
input="json | json_dir",
output=["doc", "link_urls", "img_urls"],
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
@ -83,12 +75,10 @@ class JSONScraperGraph(AbstractGraph):
return BaseGraph(
nodes=[
fetch_node,
rag_node,
generate_answer_node,
],
edges=[
(fetch_node, rag_node),
(rag_node, generate_answer_node)
(fetch_node, generate_answer_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__

View File

@ -3,7 +3,7 @@ import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
from ..nodes import FetchNode, ParseNode, GenerateAnswerNode
class MDScraperGraph(AbstractGraph):
"""
@ -63,14 +63,6 @@ class MDScraperGraph(AbstractGraph):
"chunk_size": self.model_token
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
@ -86,13 +78,11 @@ class MDScraperGraph(AbstractGraph):
nodes=[
fetch_node,
parse_node,
rag_node,
generate_answer_node,
],
edges=[
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_answer_node)
(parse_node, generate_answer_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__

View File

@ -12,7 +12,6 @@ from ..nodes import (
FetchNode,
ParseNode,
ImageToTextNode,
RAGNode,
GenerateAnswerOmniNode
)
@ -89,14 +88,7 @@ class OmniScraperGraph(AbstractGraph):
"max_images": self.max_images
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_omni_node = GenerateAnswerOmniNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
output=["answer"],
@ -112,14 +104,12 @@ class OmniScraperGraph(AbstractGraph):
fetch_node,
parse_node,
image_to_text_node,
rag_node,
generate_answer_omni_node,
],
edges=[
(fetch_node, parse_node),
(parse_node, image_to_text_node),
(image_to_text_node, rag_node),
(rag_node, generate_answer_omni_node)
(image_to_text_node, generate_answer_omni_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__
@ -136,4 +126,4 @@ class OmniScraperGraph(AbstractGraph):
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)
return self.final_state.get("answer", "No answer found.")
return self.final_state.get("answer", "No answer found.")

View File

@ -12,7 +12,6 @@ from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerPDFNode
)
@ -76,14 +75,6 @@ class PDFScraperGraph(AbstractGraph):
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_node_pdf = GenerateAnswerPDFNode(
input="user_prompt & (relevant_chunks | doc)",
output=["answer"],
@ -98,13 +89,11 @@ class PDFScraperGraph(AbstractGraph):
nodes=[
fetch_node,
parse_node,
rag_node,
generate_answer_node_pdf,
],
edges=[
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_answer_node_pdf)
(parse_node, generate_answer_node_pdf)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__

View File

@ -0,0 +1,104 @@
""" SearchLinkGraph Module """
from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
class SearchLinkGraph(AbstractGraph):
    """
    SearchLinkGraph is a scraping pipeline that fetches a source, parses it
    and runs a SearchLinkNode over the fetched document.

    The constructor takes no user prompt: an empty string is forwarded to
    AbstractGraph in its place.

    Attributes:
        prompt (str): The prompt for the graph (passed as "" by this class).
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel, optional): The schema for the graph output. Defaults to None.

    Example:
        >>> search_link_graph = SearchLinkGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_link_graph.run()
    """

    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
        # No prompt is accepted, so the base class receives an empty one.
        super().__init__("", config, source, schema)

        # Sources starting with "http" are fed to the graph under the "url"
        # key; anything else goes under "local_dir".
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        The nodes are chained as fetch -> parse -> search_link.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """

        fetch_node = FetchNode(
            input="url| local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        # NOTE(review): this node reads "doc" and writes "parsed_doc", the
        # same output key as parse_node -- confirm that is intended.
        search_link_node = SearchLinkNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                search_link_node
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, search_link_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the graph and returns its "parsed_doc" state entry.

        Returns:
            The value stored under "parsed_doc" in the final state, or the
            string "No answer found." when that key is absent.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No answer found.")

View File

@ -11,7 +11,6 @@ from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerNode
)
@ -78,14 +77,7 @@ class SmartScraperGraph(AbstractGraph):
"chunk_size": self.model_token
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
@ -100,13 +92,11 @@ class SmartScraperGraph(AbstractGraph):
nodes=[
fetch_node,
parse_node,
rag_node,
generate_answer_node,
],
edges=[
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_answer_node)
(parse_node, generate_answer_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__

View File

@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
RAGNode,
GenerateAnswerNode
)
@ -64,14 +63,7 @@ class XMLScraperGraph(AbstractGraph):
input="xml | xml_dir",
output=["doc", "link_urls", "img_urls"]
)
rag_node = RAGNode(
input="user_prompt & doc",
output=["relevant_chunks"],
node_config={
"llm_model": self.llm_model,
"embedder_model": self.embedder_model
}
)
generate_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | doc)",
output=["answer"],
@ -85,12 +77,10 @@ class XMLScraperGraph(AbstractGraph):
return BaseGraph(
nodes=[
fetch_node,
rag_node,
generate_answer_node,
],
edges=[
(fetch_node, rag_node),
(rag_node, generate_answer_node)
(fetch_node, generate_answer_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__

View File

@ -1,5 +1,5 @@
"""
bedrock configuration wrapper
Bedrock Module
"""
from langchain_aws import ChatBedrock

View File

@ -1,5 +1,5 @@
"""
Ollama Module
Ernie Module
"""
from langchain_community.chat_models import ErnieBotChat

View File

@ -1,5 +1,5 @@
"""
OpenAI Module
OneAPI Module
"""
from langchain_openai import ChatOpenAI

View File

@ -125,7 +125,7 @@ class GenerateAnswerCSVNode(BaseNode):
template=template_no_chunks_csv_prompt,
input_variables=["question"],
partial_variables={
"context": chunk.page_content,
"context": chunk,
"format_instructions": format_instructions,
},
)
@ -137,7 +137,7 @@ class GenerateAnswerCSVNode(BaseNode):
template=template_chunks_csv_prompt,
input_variables=["question"],
partial_variables={
"context": chunk.page_content,
"context": chunk,
"chunk_id": i + 1,
"format_instructions": format_instructions,
},

View File

@ -115,7 +115,7 @@ class GenerateAnswerNode(BaseNode):
prompt = PromptTemplate(
template=template_no_chunks_prompt,
input_variables=["question"],
partial_variables={"context": chunk.page_content,
partial_variables={"context": chunk,
"format_instructions": format_instructions})
chain = prompt | self.llm_model | output_parser
answer = chain.invoke({"question": user_prompt})
@ -124,7 +124,7 @@ class GenerateAnswerNode(BaseNode):
prompt = PromptTemplate(
template=template_chunks_prompt,
input_variables=["question"],
partial_variables={"context": chunk.page_content,
partial_variables={"context": chunk,
"chunk_id": i + 1,
"format_instructions": format_instructions})
# Dynamically name the chains based on their index

View File

@ -110,7 +110,7 @@ class GenerateAnswerOmniNode(BaseNode):
template=template_no_chunk_omni_prompt,
input_variables=["question"],
partial_variables={
"context": chunk.page_content,
"context": chunk,
"format_instructions": format_instructions,
"img_desc": imag_desc,
},
@ -123,7 +123,7 @@ class GenerateAnswerOmniNode(BaseNode):
template=template_chunks_omni_prompt,
input_variables=["question"],
partial_variables={
"context": chunk.page_content,
"context": chunk,
"chunk_id": i + 1,
"format_instructions": format_instructions,
},

View File

@ -124,7 +124,7 @@ class GenerateAnswerPDFNode(BaseNode):
template=template_no_chunks_pdf_prompt,
input_variables=["question"],
partial_variables={
"context":chunk.page_content,
"context":chunk,
"format_instructions": format_instructions,
},
)

View File

@ -4,6 +4,7 @@ SearchLinkNode Module
# Imports from standard library
from typing import List, Optional
import re
from tqdm import tqdm
# Imports from Langchain
@ -20,7 +21,7 @@ from .base_node import BaseNode
class SearchLinkNode(BaseNode):
"""
A node that can filter out the relevant links in the webpage content for the user prompt.
Node expects the aleready scrapped links on the webpage and hence it is expected
Node expects the already scraped links on the webpage and hence it is expected
that this node be used after the FetchNode.
Attributes:
@ -67,39 +68,10 @@ class SearchLinkNode(BaseNode):
self.logger.info(f"--- Executing {self.node_name} Node ---")
# Interpret input keys based on the provided input expression
input_keys = self.get_input_keys(state)
user_prompt = state[input_keys[0]]
parsed_content_chunks = state[input_keys[1]]
parsed_content_chunks = state.get("doc")
output_parser = JsonOutputParser()
prompt_relevant_links = """
You are a website scraper and you have just scraped the following content from a website.
Content: {content}
You are now tasked with identifying all hyper links within the content that are potentially
relevant to the user task: {user_prompt}
Assume relevance broadly, including any links that might be related or potentially useful
in relation to the task.
Sort it in order of importance, the first one should be the most important one, the last one
the least important
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
whether the content at the link is directly relevant.
Output only a list of relevant links in the format:
[
"link1",
"link2",
"link3",
.
.
.
]
"""
relevant_links = []
for i, chunk in enumerate(
@ -109,15 +81,47 @@ class SearchLinkNode(BaseNode):
disable=not self.verbose,
)
):
merge_prompt = PromptTemplate(
template=prompt_relevant_links,
input_variables=["content", "user_prompt"],
)
merge_chain = merge_prompt | self.llm_model | output_parser
# merge_chain = merge_prompt | self.llm_model
answer = merge_chain.invoke(
{"content": chunk.page_content, "user_prompt": user_prompt}
)
relevant_links += answer
try:
# Primary approach: Regular expression to extract links
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
relevant_links += links
except Exception as e:
# Fallback approach: Using the LLM to extract links
self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
prompt_relevant_links = """
You are a website scraper and you have just scraped the following content from a website.
Content: {content}
Assume relevance broadly, including any links that might be related or potentially useful
in relation to the task.
Sort it in order of importance, the first one should be the most important one, the last one
the least important
Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
whether the content at the link is directly relevant.
Output only a list of relevant links in the format:
[
"link1",
"link2",
"link3",
.
.
.
]
"""
merge_prompt = PromptTemplate(
template=prompt_relevant_links,
input_variables=["content", "user_prompt"],
)
merge_chain = merge_prompt | self.llm_model | output_parser
answer = merge_chain.invoke(
{"content": chunk.page_content}
)
relevant_links += answer
state.update({self.output[0]: relevant_links})
return state

View File

@ -2,8 +2,6 @@
convert_to_md module
"""
import html2text
from trafilatura import extract
def convert_to_md(html):
""" Convert HTML to Markdown.
@ -20,6 +18,6 @@ def convert_to_md(html):
'This is a paragraph.\n\n# This is a heading.'
Note: Styles are ignored during the conversion; links are preserved. """
return extract(filecontent=html,include_images=True,
include_links=True, include_tables=True, output_format="markdown")
h = html2text.HTML2Text()
h.ignore_links = False
return h.handle(html)

View File

@ -1,6 +1,3 @@
"""
research web module
"""
import re
from typing import List
from langchain_community.tools import DuckDuckGoSearchResults
@ -8,41 +5,39 @@ from googlesearch import search as google_search
import requests
from bs4 import BeautifulSoup
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]:
def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]:
"""
Searches the web for a given query using specified search engine options.
Args:
query (str): The search query to find on the internet.
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'.
search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
max_results (int, optional): The maximum number of search results to return.
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
Returns:
List[str]: A list of URLs as strings that are the search results.
Raises:
ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'.
ValueError: If the search engine specified is not supported.
Example:
>>> search_on_web("example query", search_engine="Google", max_results=5)
['http://example.com', 'http://example.org', ...]
This function allows switching between Google, DuckDuckGo, and Bing to perform
internet searches, returning a list of result URLs.
"""
if search_engine.lower() == "google":
res = []
for url in google_search(query, stop=max_results):
res.append(url)
return res
elif search_engine.lower() == "duckduckgo":
research = DuckDuckGoSearchResults(max_results=max_results)
res = research.run(query)
links = re.findall(r'https?://[^\s,\]]+', res)
return links
elif search_engine.lower() == "bing":
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int =
response = requests.get(search_url, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
search_results = []
for result in soup.find_all('li', class_='b_algo', limit=max_results):
link = result.find('a')['href']
search_results.append(link)
return search_results
raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing")
elif search_engine.lower() == "searxng":
url = f"http://localhost:{port}"
params = {"q": query, "format": "json"}
# Send the GET request to the server
response = requests.get(url, params=params)
# Parse the response and limit to the specified max_results
data = response.json()
limited_results = data["results"][:max_results]
return limited_results
else:
raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")

View File

@ -0,0 +1,26 @@
from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info
def test_smart_scraper_pipeline():
    """Run a SearchLinkGraph on a URL source and check that it returns something."""
    # Model settings for the Ollama-backed LLM and embedder.
    llm_settings = {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",
    }
    embedding_settings = {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
    }
    graph_config = {
        "llm": llm_settings,
        "embeddings": embedding_settings,
        "verbose": True,
        "headless": False,
    }

    search_link_graph = SearchLinkGraph(
        source="https://sport.sky.it/nba?gr=www",
        config=graph_config,
    )

    outcome = search_link_graph.run()

    # Only checks that a value came back, not what it contains.
    assert outcome is not None

View File

@ -7,7 +7,7 @@ def test_basic_html_to_md():
def test_html_with_links_and_images():
html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
assert convert_to_md(html) is None
assert convert_to_md(html) is not None
def test_html_with_tables():
html = '''
@ -17,11 +17,11 @@ def test_html_with_tables():
<tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
</table>
'''
assert convert_to_md(html) is None
assert convert_to_md(html) is not None
def test_empty_html():
html = ""
assert convert_to_md(html) is None
assert convert_to_md(html) is not None
def test_complex_html_structure():
html = '''