From ac10128ff3af35c52b48c79d085e458524e8e48a Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Wed, 22 May 2024 18:30:49 +0200 Subject: [PATCH] feat(burr): added burr integration in graphs and optional burr installation --- examples/openai/search_graph_burr.py | 50 +++++ ...ration_openai.py => smart_scraper_burr.py} | 0 pyproject.toml | 22 +-- requirements-dev.lock | 73 +++++-- requirements.lock | 187 ++---------------- scrapegraphai/graphs/abstract_graph.py | 11 ++ scrapegraphai/graphs/base_graph.py | 5 +- scrapegraphai/integrations/__init__.py | 4 + scrapegraphai/integrations/burr_bridge.py | 5 + 9 files changed, 155 insertions(+), 202 deletions(-) create mode 100644 examples/openai/search_graph_burr.py rename examples/openai/{burr_integration_openai.py => smart_scraper_burr.py} (100%) diff --git a/examples/openai/search_graph_burr.py b/examples/openai/search_graph_burr.py new file mode 100644 index 00000000..0919d20c --- /dev/null +++ b/examples/openai/search_graph_burr.py @@ -0,0 +1,50 @@ +""" +Example of Search Graph +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { + "llm": { + "api_key": openai_key, + "model": "gpt-3.5-turbo", + }, + "max_results": 2, + "verbose": True, + "burr_kwargs": { + "project_name": "search-graph-openai", + } +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me Chioggia's attractions.", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/openai/burr_integration_openai.py b/examples/openai/smart_scraper_burr.py similarity index 100% rename from examples/openai/burr_integration_openai.py rename to examples/openai/smart_scraper_burr.py diff --git a/pyproject.toml b/pyproject.toml index 19360e4e..46471433 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,14 +29,13 @@ dependencies = [ "playwright==1.43.0", "google==3.0.0", "yahoo-search-py==0.3", - "burr[start]==0.17.1" ] license = "MIT" readme = "README.md" -homepage = "https://scrapegraph-ai.readthedocs.io/" +homepage = "https://scrapegraphai.com/" repository = "https://github.com/VinciGit00/Scrapegraph-ai" -documentation = "https://scrapegraph-doc.onrender.com/" +documentation = "https://scrapegraph-ai.readthedocs.io/en/latest/" keywords = [ "scrapegraph", "scrapegraphai", @@ -64,6 +63,10 @@ classifiers = [ ] requires-python = ">= 3.9, < 3.12" +[project.optional-dependencies] +burr = ["burr[start]==0.18.0"] +docs = ["sphinx==4.3.0", "sphinx-rtd-theme==1.0.0"] + [build-system] requires = ["hatchling"] build-backend = "hatchling.build" @@ -72,12 +75,7 @@ build-backend = "hatchling.build" managed = true dev-dependencies = [ "pytest==8.0.0", - "pytest-mock==3.14.0" -] - -[tool.rye.group.docs] -optional = true - -[tool.rye.group.docs.dependencies] -sphinx = "7.1.2" -sphinx-rtd-theme = "2.0.0" + "pytest-mock==3.14.0", + "-e file:.[burr]", + "-e file:.[docs]", +] \ No newline at end of file diff --git a/requirements-dev.lock b/requirements-dev.lock index 7458fe01..25be91f4 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -6,7 +6,6 @@ # features: [] # all-features: false # with-sources: false -# generate-hashes: false -e file:. aiofiles==23.2.1 @@ -16,11 +15,13 @@ aiohttp==3.9.5 # via langchain-community aiosignal==1.3.1 # via aiohttp +alabaster==0.7.16 + # via sphinx altair==5.3.0 # via streamlit -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anthropic==0.25.9 +anthropic==0.26.1 # via langchain-anthropic anyio==4.3.0 # via anthropic @@ -36,17 +37,20 @@ attrs==23.2.0 # via aiohttp # via jsonschema # via referencing +babel==2.15.0 + # via sphinx beautifulsoup4==4.12.3 # via google # via scrapegraphai blinker==1.8.2 # via streamlit -boto3==1.34.105 +boto3==1.34.110 # via langchain-aws -botocore==1.34.105 +botocore==1.34.110 # via boto3 # via s3transfer -burr==0.17.1 +burr==0.18.0 + # via burr # via scrapegraphai cachetools==5.3.3 # via google-auth @@ -66,6 +70,7 @@ colorama==0.4.6 # via click # via loguru # via pytest + # via sphinx # via tqdm # via uvicorn contourpy==1.2.1 @@ -83,6 +88,9 @@ distro==1.9.0 # via openai dnspython==2.6.1 # via email-validator +docutils==0.17.1 + # via sphinx + # via sphinx-rtd-theme email-validator==2.1.1 # via fastapi exceptiongroup==1.2.1 @@ -106,7 +114,7 @@ free-proxy==1.1.1 frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.3.1 +fsspec==2024.5.0 # via huggingface-hub gitdb==4.0.11 # via gitpython @@ -114,7 +122,7 @@ gitpython==3.1.43 # via streamlit google==3.0.0 # via scrapegraphai -google-ai-generativelanguage==0.6.3 +google-ai-generativelanguage==0.6.4 # via google-generativeai google-api-core==2.19.0 # via google-ai-generativelanguage @@ -130,7 +138,7 @@ google-auth==2.29.0 # via google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-generativeai==0.5.3 +google-generativeai==0.5.4 # via langchain-google-genai googleapis-common-protos==1.63.0 # via google-api-core @@ -141,9 +149,9 @@ graphviz==0.20.3 greenlet==3.0.3 # via playwright # via sqlalchemy -groq==0.5.0 +groq==0.7.0 # via langchain-groq -grpcio==1.63.0 +grpcio==1.64.0 # via google-api-core # via grpcio-status grpcio-status==1.62.2 @@ -166,7 +174,7 @@ httpx==0.27.0 # via groq # via openai # via yahoo-search-py -huggingface-hub==0.23.0 +huggingface-hub==0.23.1 # via tokenizers idna==3.7 # via anyio @@ -174,6 +182,8 @@ idna==3.7 # via httpx # via requests # via yarl +imagesize==1.4.1 + # via sphinx iniconfig==2.0.0 # via pytest jinja2==3.1.4 @@ -181,6 +191,9 @@ jinja2==3.1.4 # via burr # via fastapi # via pydeck + # via sphinx +jiter==0.1.0 + # via anthropic jmespath==1.0.1 # via boto3 # via botocore @@ -218,9 +231,9 @@ langchain-groq==0.1.3 # via scrapegraphai langchain-openai==0.1.6 # via scrapegraphai -langchain-text-splitters==0.0.1 +langchain-text-splitters==0.0.2 # via langchain -langsmith==0.1.57 +langsmith==0.1.60 # via langchain # via langchain-community # via langchain-core @@ -271,6 +284,7 @@ packaging==23.2 # via marshmallow # via matplotlib # via pytest + # via sphinx # via streamlit pandas==2.2.2 # via altair @@ -322,6 +336,7 @@ pyee==11.1.0 # via playwright pygments==2.18.0 # via rich + # via sphinx pyparsing==3.1.2 # via httplib2 # via matplotlib @@ -348,9 +363,9 @@ pyyaml==6.0.1 referencing==0.35.1 # via jsonschema # via jsonschema-specifications -regex==2024.5.10 +regex==2024.5.15 # via tiktoken -requests==2.31.0 +requests==2.32.2 # via burr # via free-proxy # via google-api-core @@ -358,6 +373,7 @@ requests==2.31.0 # via langchain # via langchain-community # via langsmith + # via sphinx # via streamlit # via tiktoken rich==13.7.1 @@ -372,7 +388,7 @@ s3transfer==0.10.1 # via boto3 selectolax==0.3.21 # via yahoo-search-py -sf-hamilton==1.62.0 +sf-hamilton==1.63.0 # via burr shellingham==1.5.4 # via typer @@ -386,8 +402,27 @@ sniffio==1.3.1 # via groq # via httpx # via openai +snowballstemmer==2.2.0 + # via sphinx soupsieve==2.5 # via beautifulsoup4 +sphinx==4.3.0 + # via scrapegraphai + # via sphinx-rtd-theme +sphinx-rtd-theme==1.0.0 + # via scrapegraphai +sphinxcontrib-applehelp==1.0.8 + # via sphinx +sphinxcontrib-devhelp==1.0.6 + # via sphinx +sphinxcontrib-htmlhelp==2.0.5 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==1.0.7 + # via sphinx +sphinxcontrib-serializinghtml==1.1.10 + # via sphinx sqlalchemy==2.0.30 # via langchain # via langchain-community @@ -448,7 +483,7 @@ ujson==5.10.0 # via fastapi uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.18 +urllib3==2.2.1 # via botocore # via requests # via yahoo-search-py @@ -467,3 +502,5 @@ yahoo-search-py==0.3 # via scrapegraphai yarl==1.9.4 # via aiohttp +setuptools==70.0.0 + # via sphinx diff --git a/requirements.lock b/requirements.lock index ed73ca98..12cca471 100644 --- a/requirements.lock +++ b/requirements.lock @@ -6,71 +6,45 @@ # features: [] # all-features: false # with-sources: false -# generate-hashes: false -e file:. -aiofiles==23.2.1 - # via burr aiohttp==3.9.5 # via langchain # via langchain-community aiosignal==1.3.1 # via aiohttp -altair==5.3.0 - # via streamlit -annotated-types==0.6.0 +annotated-types==0.7.0 # via pydantic -anthropic==0.25.9 +anthropic==0.26.1 # via langchain-anthropic anyio==4.3.0 # via anthropic # via groq # via httpx # via openai - # via starlette - # via watchfiles async-timeout==4.0.3 # via aiohttp # via langchain attrs==23.2.0 # via aiohttp - # via jsonschema - # via referencing beautifulsoup4==4.12.3 # via google # via scrapegraphai -blinker==1.8.2 - # via streamlit -boto3==1.34.105 +boto3==1.34.110 # via langchain-aws -botocore==1.34.105 +botocore==1.34.110 # via boto3 # via s3transfer -burr==0.17.1 - # via scrapegraphai cachetools==5.3.3 # via google-auth - # via streamlit certifi==2024.2.2 # via httpcore # via httpx # via requests charset-normalizer==3.3.2 # via requests -click==8.1.7 - # via burr - # via streamlit - # via typer - # via uvicorn colorama==0.4.6 - # via click - # via loguru # via tqdm - # via uvicorn -contourpy==1.2.1 - # via matplotlib -cycler==0.12.1 - # via matplotlib dataclasses-json==0.6.6 # via langchain # via langchain-community @@ -80,39 +54,22 @@ distro==1.9.0 # via anthropic # via groq # via openai -dnspython==2.6.1 - # via email-validator -email-validator==2.1.1 - # via fastapi exceptiongroup==1.2.1 # via anyio faiss-cpu==1.8.0 # via scrapegraphai -fastapi==0.111.0 - # via burr - # via fastapi-pagination -fastapi-cli==0.0.4 - # via fastapi -fastapi-pagination==0.12.24 - # via burr filelock==3.14.0 # via huggingface-hub -fonttools==4.51.0 - # via matplotlib free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 # via aiohttp # via aiosignal -fsspec==2024.3.1 +fsspec==2024.5.0 # via huggingface-hub -gitdb==4.0.11 - # via gitpython -gitpython==3.1.43 - # via streamlit google==3.0.0 # via scrapegraphai -google-ai-generativelanguage==0.6.3 +google-ai-generativelanguage==0.6.4 # via google-generativeai google-api-core==2.19.0 # via google-ai-generativelanguage @@ -128,27 +85,25 @@ google-auth==2.29.0 # via google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-generativeai==0.5.3 +google-generativeai==0.5.4 # via langchain-google-genai googleapis-common-protos==1.63.0 # via google-api-core # via grpcio-status graphviz==0.20.3 - # via burr # via scrapegraphai greenlet==3.0.3 # via playwright # via sqlalchemy -groq==0.5.0 +groq==0.7.0 # via langchain-groq -grpcio==1.63.0 +grpcio==1.64.0 # via google-api-core # via grpcio-status grpcio-status==1.62.2 # via google-api-core h11==0.14.0 # via httpcore - # via uvicorn html2text==2024.2.26 # via scrapegraphai httpcore==1.0.5 @@ -156,27 +111,20 @@ httpcore==1.0.5 httplib2==0.22.0 # via google-api-python-client # via google-auth-httplib2 -httptools==0.6.1 - # via uvicorn httpx==0.27.0 # via anthropic - # via fastapi # via groq # via openai # via yahoo-search-py -huggingface-hub==0.23.0 +huggingface-hub==0.23.1 # via tokenizers idna==3.7 # via anyio - # via email-validator # via httpx # via requests # via yarl -jinja2==3.1.4 - # via altair - # via burr - # via fastapi - # via pydeck +jiter==0.1.0 + # via anthropic jmespath==1.0.1 # via boto3 # via botocore @@ -185,12 +133,6 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -jsonschema==4.22.0 - # via altair -jsonschema-specifications==2023.12.1 - # via jsonschema -kiwisolver==1.4.5 - # via matplotlib langchain==0.1.15 # via scrapegraphai langchain-anthropic==0.1.11 @@ -214,26 +156,16 @@ langchain-groq==0.1.3 # via scrapegraphai langchain-openai==0.1.6 # via scrapegraphai -langchain-text-splitters==0.0.1 +langchain-text-splitters==0.0.2 # via langchain -langsmith==0.1.57 +langsmith==0.1.60 # via langchain # via langchain-community # via langchain-core -loguru==0.7.2 - # via burr lxml==5.2.2 # via free-proxy -markdown-it-py==3.0.0 - # via rich -markupsafe==2.1.5 - # via jinja2 marshmallow==3.21.2 # via dataclasses-json -matplotlib==3.9.0 - # via burr -mdurl==0.1.2 - # via markdown-it-py minify-html==0.15.0 # via scrapegraphai multidict==6.0.5 @@ -242,39 +174,21 @@ multidict==6.0.5 mypy-extensions==1.0.0 # via typing-inspect numpy==1.26.4 - # via altair - # via contourpy # via faiss-cpu # via langchain # via langchain-aws # via langchain-community - # via matplotlib # via pandas - # via pyarrow - # via pydeck - # via sf-hamilton - # via streamlit openai==1.30.1 - # via burr # via langchain-openai orjson==3.10.3 - # via fastapi # via langsmith packaging==23.2 - # via altair # via huggingface-hub # via langchain-core # via marshmallow - # via matplotlib - # via streamlit pandas==2.2.2 - # via altair # via scrapegraphai - # via sf-hamilton - # via streamlit -pillow==10.3.0 - # via matplotlib - # via streamlit playwright==1.43.0 # via scrapegraphai proto-plus==1.23.0 @@ -287,9 +201,6 @@ protobuf==4.25.3 # via googleapis-common-protos # via grpcio-status # via proto-plus - # via streamlit -pyarrow==16.1.0 - # via streamlit pyasn1==0.6.0 # via pyasn1-modules # via rsa @@ -297,9 +208,6 @@ pyasn1-modules==0.4.0 # via google-auth pydantic==2.7.1 # via anthropic - # via burr - # via fastapi - # via fastapi-pagination # via google-generativeai # via groq # via langchain @@ -309,24 +217,15 @@ pydantic==2.7.1 # via yahoo-search-py pydantic-core==2.18.2 # via pydantic -pydeck==0.9.1 - # via streamlit pyee==11.1.0 # via playwright -pygments==2.18.0 - # via rich pyparsing==3.1.2 # via httplib2 - # via matplotlib python-dateutil==2.9.0.post0 # via botocore - # via matplotlib # via pandas python-dotenv==1.0.1 # via scrapegraphai - # via uvicorn -python-multipart==0.0.9 - # via fastapi pytz==2024.1 # via pandas pyyaml==6.0.1 @@ -334,42 +233,24 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core - # via uvicorn -referencing==0.35.1 - # via jsonschema - # via jsonschema-specifications -regex==2024.5.10 +regex==2024.5.15 # via tiktoken -requests==2.31.0 - # via burr +requests==2.32.2 # via free-proxy # via google-api-core # via huggingface-hub # via langchain # via langchain-community # via langsmith - # via streamlit # via tiktoken -rich==13.7.1 - # via streamlit - # via typer -rpds-py==0.18.1 - # via jsonschema - # via referencing rsa==4.9 # via google-auth s3transfer==0.10.1 # via boto3 selectolax==0.3.21 # via yahoo-search-py -sf-hamilton==1.62.0 - # via burr -shellingham==1.5.4 - # via typer six==1.16.0 # via python-dateutil -smmap==5.0.1 - # via gitdb sniffio==1.3.1 # via anthropic # via anyio @@ -381,39 +262,23 @@ soupsieve==2.5 sqlalchemy==2.0.30 # via langchain # via langchain-community -starlette==0.37.2 - # via fastapi -streamlit==1.34.0 - # via burr tenacity==8.3.0 # via langchain # via langchain-community # via langchain-core - # via streamlit tiktoken==0.6.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 # via anthropic -toml==0.10.2 - # via streamlit -toolz==0.12.1 - # via altair -tornado==6.4 - # via streamlit tqdm==4.66.4 # via google-generativeai # via huggingface-hub # via openai # via scrapegraphai -typer==0.12.3 - # via fastapi-cli typing-extensions==4.11.0 - # via altair # via anthropic # via anyio - # via fastapi - # via fastapi-pagination # via google-generativeai # via groq # via huggingface-hub @@ -421,36 +286,18 @@ typing-extensions==4.11.0 # via pydantic # via pydantic-core # via pyee - # via sf-hamilton # via sqlalchemy - # via streamlit - # via typer # via typing-inspect - # via uvicorn typing-inspect==0.9.0 # via dataclasses-json - # via sf-hamilton tzdata==2024.1 # via pandas -ujson==5.10.0 - # via fastapi uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.18 +urllib3==2.2.1 # via botocore # via requests # via yahoo-search-py -uvicorn==0.29.0 - # via burr - # via fastapi -watchdog==4.0.0 - # via streamlit -watchfiles==0.21.0 - # via uvicorn -websockets==12.0 - # via uvicorn -win32-setctime==1.1.0 - # via loguru yahoo-search-py==0.3 # via scrapegraphai yarl==1.9.4 diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 28eb27b2..b11f8cf9 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -3,6 +3,7 @@ AbstractGraph Module """ from abc import ABC, abstractmethod from typing import Optional +import uuid from langchain_aws import BedrockEmbeddings from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings @@ -69,6 +70,16 @@ class AbstractGraph(ABC): "embedder_model": self.embedder_model} self.set_common_params(common_params, overwrite=False) + # set burr config + self.burr_kwargs = config.get("burr_kwargs", None) + if self.burr_kwargs is not None: + self.graph.use_burr = True + if "app_instance_id" not in self.burr_kwargs: + # set a random uuid for the app_instance_id to avoid conflicts + self.burr_kwargs["app_instance_id"] = str(uuid.uuid4()) + + self.graph.burr_config = self.burr_kwargs + def set_common_params(self, params: dict, overwrite=False): """ Pass parameters to every node in the graph unless otherwise defined in the graph. diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 07615a78..625e8f12 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -7,8 +7,6 @@ import warnings from langchain_community.callbacks import get_openai_callback from typing import Tuple -from ..integrations import BurrBridge - class BaseGraph: """ @@ -163,6 +161,9 @@ class BaseGraph: self.initial_state = initial_state if self.use_burr: + + from ..integrations import BurrBridge + bridge = BurrBridge(self, self.burr_config) result = bridge.execute(initial_state) return (result["_state"], []) diff --git a/scrapegraphai/integrations/__init__.py b/scrapegraphai/integrations/__init__.py index 97589cd0..556ccc2f 100644 --- a/scrapegraphai/integrations/__init__.py +++ b/scrapegraphai/integrations/__init__.py @@ -1 +1,5 @@ +""" +Init file for integrations module +""" + from .burr_bridge import BurrBridge \ No newline at end of file diff --git a/scrapegraphai/integrations/burr_bridge.py b/scrapegraphai/integrations/burr_bridge.py index bd8df466..746fbdb7 100644 --- a/scrapegraphai/integrations/burr_bridge.py +++ b/scrapegraphai/integrations/burr_bridge.py @@ -6,6 +6,11 @@ Bridge class to integrate Burr into ScrapeGraphAI graphs import re from typing import Any, Dict, List, Tuple +try: + import burr +except ImportError: + raise ImportError("burr package is not installed. Please install it with 'pip install scrapegraphai[burr]'") + from burr import tracking from burr.core import Application, ApplicationBuilder, State, Action, default from burr.lifecycle import PostRunStepHook, PreRunStepHook