From 5d97edfc5b6b02c5a57fa640ae98b1dc08d6b973 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:51:43 +0100 Subject: [PATCH 1/4] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8d69b2f5..ce409dcc 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ # 🕷️ ScrapeGraphAI: You Only Scrape Once [![Downloads](https://static.pepy.tech/badge/scrapegraphai)](https://pepy.tech/project/scrapegraphai) [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint) +[![Pylint](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml/badge.svg)](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) From 0f3ed3d27bf1a6bb6bd282396f922099237ad6af Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Wed, 13 Mar 2024 23:54:59 +0100 Subject: [PATCH 2/4] Update README.md --- README.md | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index ce409dcc..adcb92c0 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Official streamlit demo: [![My Skills](https://skillicons.dev/icons?i=react)](https://scrapegraph-ai-demo.streamlit.app/) -Is it possible to try also the colab version +Try it directly on the web using Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sEZBonBMGP44CtO6GQTwAlL0BGJXjtfd?usp=sharing) @@ -37,7 +37,9 @@ Follow the procedure on the following link to setup your OpenAI API key: [link]( ## 📖 Documentation The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.readthedocs.io/en/latest/). 
-Behind this there is also the docusaurus documentation [here](https://scrapegraph-doc.onrender.com/). + +Also check out the Docusaurus [documentation](https://scrapegraph-doc.onrender.com/). + ## 💻 Usage ### Case 1: Extracting information using a prompt @@ -78,25 +80,18 @@ The output will be a dictionary with the extracted information, for example: ## 🤝 Contributing -Scrapegraph-ai is [MIT LICENSED](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/LICENSE). - -Contributions are welcome! Please check out the todos below, and feel free to open a pull request. +Feel free to contribute and join our Discord server to discuss improvements with us and give us suggestions! For more information, please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/CONTRIBUTING.md). -Join our Discord server to discuss with us improvements and give us suggestions! - [![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/DujC7HG8) - - -You can also follow all the updates on linkedin! - [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) +## ❤️ Contributors [![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) -### Citations -If you want to use our library for research purposes please quote us with the following reference +## 🎓 Citations +If you have used our library for research purposes, please cite us with the following reference: ```text @misc{scrapegraph-ai, author = {Marco Perini, Lorenzo Padoan, Marco Vinciguerra}, @@ -121,7 +116,7 @@ If you want to use our library for research purposes please quote us with the fo ## 📜 License -ScrapeGraphAI is licensed under the Apache 2.0 License. See the [LICENSE](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/LICENSE) file for more information. +ScrapeGraphAI is licensed under the MIT License. 
See the [LICENSE](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/LICENSE) file for more information. ## Acknowledgements From d0f4b0159fbcd20cb2b204a2b3359079a253475d Mon Sep 17 00:00:00 2001 From: Lorenzo Padoan Date: Thu, 14 Mar 2024 19:35:04 +0100 Subject: [PATCH 3/4] DEV gemini support for simple custom graph -Not supported yet smartscrapergrapn with gemini --- .gitignore | 1 + .../graph_examples/custom_graph_example.py | 28 +++-- .../graph_examples/graph_from_text_example.py | 4 +- requirements.txt | 1 + scrapegraphai/builders/graph_builder.py | 9 +- scrapegraphai/graphs/smart_scraper_graph.py | 4 +- scrapegraphai/graphs/speech_summary_graph.py | 4 +- scrapegraphai/models/__init__.py | 1 + scrapegraphai/models/gemini.py | 19 ++++ scrapegraphai/nodes/__init__.py | 5 +- ...de.py => generate_answer_node_from_rag.py} | 2 +- .../nodes/generate_answer_node_vanilla.py | 103 ++++++++++++++++++ tests/graphs/graph_from_text_test.py | 4 +- 13 files changed, 163 insertions(+), 22 deletions(-) create mode 100644 scrapegraphai/models/gemini.py rename scrapegraphai/nodes/{generate_answer_node.py => generate_answer_node_from_rag.py} (99%) create mode 100644 scrapegraphai/nodes/generate_answer_node_vanilla.py diff --git a/.gitignore b/.gitignore index 45329238..26f73e8c 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ venv/ *.mp3 *.sqlite examples/graph_examples/ScrapeGraphAI_generated_graph +main.py diff --git a/examples/graph_examples/custom_graph_example.py b/examples/graph_examples/custom_graph_example.py index e5c4c4a3..c22cd369 100644 --- a/examples/graph_examples/custom_graph_example.py +++ b/examples/graph_examples/custom_graph_example.py @@ -4,47 +4,55 @@ Example of custom graph using existing nodes import os from dotenv import load_dotenv -from scrapegraphai.models import OpenAI +#from scrapegraphai.models import OpenAI +from scrapegraphai.models import Gemini from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchHTMLNode, 
ParseNode, RAGNode, GenerateAnswerNode +from scrapegraphai.nodes import FetchHTMLNode, ParseNode, GenerateAnswerNodeVanilla + + load_dotenv() # Define the configuration for the language model -openai_key = os.getenv("OPENAI_APIKEY") +""" openai_key = os.getenv("OPENAI_APIKEY") llm_config = { "api_key": openai_key, "model_name": "gpt-3.5-turbo", "temperature": 0, "streaming": True } -model = OpenAI(llm_config) +model = OpenAI(llm_config) """ + +gemini_key = os.getenv("GOOGLE_API_KEY") +llm_config = { + "api_key": gemini_key, + "model_name": "gemini-pro", +} + +model = Gemini(llm_config) # define the nodes for the graph fetch_html_node = FetchHTMLNode("fetch_html") parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document") -rag_node = RAGNode(model, "rag") -generate_answer_node = GenerateAnswerNode(model, "generate_answer") +generate_answer_node = GenerateAnswerNodeVanilla(model, "generate_answer") # create the graph graph = BaseGraph( nodes={ fetch_html_node, parse_document_node, - rag_node, generate_answer_node }, edges={ (fetch_html_node, parse_document_node), - (parse_document_node, rag_node), - (rag_node, generate_answer_node) + (parse_document_node,generate_answer_node) }, entry_point=fetch_html_node ) # execute the graph inputs = {"user_input": "List me the projects with their description", - "url": "https://perinim.github.io/projects/"} + "url": "https://perinim.github.io/projects/"} result = graph.execute(inputs) # get the answer from the result diff --git a/examples/graph_examples/graph_from_text_example.py b/examples/graph_examples/graph_from_text_example.py index 0b6733e9..97444f09 100644 --- a/examples/graph_examples/graph_from_text_example.py +++ b/examples/graph_examples/graph_from_text_example.py @@ -6,7 +6,7 @@ import os from dotenv import load_dotenv from scrapegraphai.models import OpenAI from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchTextNode, ParseNode, RAGNode, 
GenerateAnswerNode +from scrapegraphai.nodes import FetchTextNode, ParseNode, RAGNode, GenerateAnswerNodeFromRag load_dotenv() @@ -32,7 +32,7 @@ fetch_text_node = FetchTextNode("load_html_from_text") parse_document_node = ParseNode( doc_type="text", chunks_size=4000, node_name="parse_document") rag_node = RAGNode(model, "rag") -generate_answer_node = GenerateAnswerNode(model, "generate_answer") +generate_answer_node = GenerateAnswerNodeFromRag(model, "generate_answer") # create the graph graph = BaseGraph( diff --git a/requirements.txt b/requirements.txt index 5a289fef..3f387f1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ langchain==0.1.6 langchain_community==0.0.19 langchain_core==0.1.22 langchain_openai==0.0.5 +langchain_google_genai==0.0.11 faiss-cpu==1.7.4 html2text==2020.1.16 beautifulsoup4==4.12.3 diff --git a/scrapegraphai/builders/graph_builder.py b/scrapegraphai/builders/graph_builder.py index 1da4c928..fe1942ed 100644 --- a/scrapegraphai/builders/graph_builder.py +++ b/scrapegraphai/builders/graph_builder.py @@ -4,7 +4,7 @@ Module for making the graph building import graphviz from langchain_core.prompts import ChatPromptTemplate from langchain.chains import create_extraction_chain -from ..models import OpenAI +from ..models import OpenAI, Gemini from ..helpers import nodes_metadata, graph_schema @@ -68,6 +68,13 @@ class GraphBuilder: llm_params = {**llm_defaults, **self.llm_config} if "api_key" not in llm_params: raise ValueError("LLM configuration must include an 'api_key'.") + + # select the model based on the model name + if "gpt-" in llm_params["model_name"]: + return OpenAI(llm_params) + elif "gemini" in llm_params["model_name"]: + return Gemini(llm_params) + return OpenAI(llm_params) def _generate_nodes_description(self): diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index fc25ad5a..ff03f7fc 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ 
b/scrapegraphai/graphs/smart_scraper_graph.py @@ -7,7 +7,7 @@ from ..nodes import ( FetchHTMLNode, ParseNode, RAGNode, - GenerateAnswerNode + GenerateAnswerNodeFromRag ) class SmartScraperGraph: @@ -78,7 +78,7 @@ class SmartScraperGraph: fetch_html_node = FetchHTMLNode("fetch_html") parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document") rag_node = RAGNode(self.llm, "rag") - generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer") + generate_answer_node = GenerateAnswerNodeFromRag(self.llm, "generate_answer") return BaseGraph( nodes={ diff --git a/scrapegraphai/graphs/speech_summary_graph.py b/scrapegraphai/graphs/speech_summary_graph.py index a23af88f..c121ae02 100644 --- a/scrapegraphai/graphs/speech_summary_graph.py +++ b/scrapegraphai/graphs/speech_summary_graph.py @@ -8,7 +8,7 @@ from ..nodes import ( FetchHTMLNode, ParseNode, RAGNode, - GenerateAnswerNode, + GenerateAnswerNodeFromRag, TextToSpeechNode, ) @@ -82,7 +82,7 @@ class SpeechSummaryGraph: fetch_html_node = FetchHTMLNode("fetch_html") parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document") rag_node = RAGNode(self.llm, "rag") - generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer") + generate_answer_node = GenerateAnswerNodeFromRag(self.llm, "generate_answer") text_to_speech_node = TextToSpeechNode( self.text_to_speech_model, "text_to_speech") diff --git a/scrapegraphai/models/__init__.py b/scrapegraphai/models/__init__.py index f41bd57d..985a9153 100644 --- a/scrapegraphai/models/__init__.py +++ b/scrapegraphai/models/__init__.py @@ -5,3 +5,4 @@ from .openai import OpenAI from .openai_itt import OpenAIImageToText from .openai_tts import OpenAITextToSpeech +from .gemini import Gemini diff --git a/scrapegraphai/models/gemini.py b/scrapegraphai/models/gemini.py new file mode 100644 index 00000000..51be4fa2 --- /dev/null +++ b/scrapegraphai/models/gemini.py @@ -0,0 +1,19 @@ +from 
langchain_google_genai import ChatGoogleGenerativeAI + + +class Gemini(ChatGoogleGenerativeAI): + """Class for wrapping gemini module""" + + def __init__(self, llm_config: dict): + """ + A wrapper for the Gemini class that provides default configuration + and could be extended with additional methods if needed. + + Args: + llm_config (dict): Configuration parameters for the language model. + such as model_name="gemini-pro" and api_key + """ + # copy the value of the model_name key into the model key + llm_config["model"] = llm_config["model_name"] + # Initialize the superclass (ChatGoogleGenerativeAI) with provided config parameters + super().__init__(**llm_config) diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py index d4af797c..03e027c5 100644 --- a/scrapegraphai/nodes/__init__.py +++ b/scrapegraphai/nodes/__init__.py @@ -4,9 +4,10 @@ __init__.py file for node folder from .fetch_html_node import FetchHTMLNode from .conditional_node import ConditionalNode from .get_probable_tags_node import GetProbableTagsNode -from .generate_answer_node import GenerateAnswerNode +from .generate_answer_node_from_rag import GenerateAnswerNodeFromRag from .parse_node import ParseNode from .rag_node import RAGNode from .text_to_speech_node import TextToSpeechNode from .image_to_text_node import ImageToTextNode -from .fetch_text_node import FetchTextNode \ No newline at end of file +from .fetch_text_node import FetchTextNode +from .generate_answer_node_vanilla import GenerateAnswerNodeVanilla \ No newline at end of file diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node_from_rag.py similarity index 99% rename from scrapegraphai/nodes/generate_answer_node.py rename to scrapegraphai/nodes/generate_answer_node_from_rag.py index 3524e187..6aca5af0 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node_from_rag.py @@ -13,7 +13,7 @@ from langchain_core.runnables import RunnableParallel from .base_node 
import BaseNode -class GenerateAnswerNode(BaseNode): +class GenerateAnswerNodeFromRag(BaseNode): """ A node that generates an answer using a language model (LLM) based on the user's input and the content extracted from a webpage. It constructs a prompt from the user's input diff --git a/scrapegraphai/nodes/generate_answer_node_vanilla.py b/scrapegraphai/nodes/generate_answer_node_vanilla.py new file mode 100644 index 00000000..a7cd9aaa --- /dev/null +++ b/scrapegraphai/nodes/generate_answer_node_vanilla.py @@ -0,0 +1,103 @@ +""" +Module for generating the answer node +""" +# Imports from standard library +from tqdm import tqdm + +# Imports from Langchain +from langchain.prompts import PromptTemplate +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.runnables import RunnableParallel + +# Imports from the library +from .base_node import BaseNode + + +class GenerateAnswerNodeVanilla(BaseNode): + """ + A node that generates an answer using a language model (LLM) based on the user's input + and the content extracted from a webpage. It constructs a prompt from the user's input + and the scraped content, feeds it to the LLM, and parses the LLM's response to produce + an answer. + + Attributes: + llm (ChatOpenAI): An instance of a language model client, configured for generating answers. + node_name (str): The unique identifier name for the node, defaulting + to "GenerateAnswerNode". + node_type (str): The type of the node, set to "node" indicating a + standard operational node. + + Args: + llm: An instance of the language model client (e.g., ChatOpenAI) used + for generating answers. + node_name (str, optional): The unique identifier name for the node. + Defaults to "GenerateAnswerNodeVanilla". + + Methods: + execute(state): Processes the input and document from the state to generate an answer, + updating the state with the generated answer under the 'answer' key. 
+ """ + + def __init__(self, llm, node_name: str): + """ + Initializes the GenerateAnswerNodeVanilla with a language model client and a node name. + Args: + llm: An instance of the language model client used for generating answers. + node_name (str): name of the node + """ + super().__init__(node_name, "node") + self.llm = llm + + def execute(self, state: dict) -> dict: + """ + Generates an answer by constructing a prompt from the user's input and the scraped + content, querying the language model, and parsing its response. + + The method updates the state with the generated answer under the 'answer' key. + + Args: + state (dict): The current state of the graph, expected to contain 'user_input' + and 'document' (only the first element of 'document' is used). + + Returns: + dict: The updated state with the 'answer' key containing the generated answer. + + Raises: + KeyError: If 'user_input' or 'document' is not found in the state, indicating + that the necessary information for generating an answer is missing. + """ + + print("---GENERATING ANSWER---") + try: + user_input = state["user_input"] + document = state["document"][0] + except KeyError as e: + print(f"Error: {e} not found in state.") + raise + + context = document + + output_parser = JsonOutputParser() + format_instructions = output_parser.get_format_instructions() + + template_json = """You are a website scraper and you have just scraped the + following content from a website. 
+ You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n + This is the scraped text:\n + {context} \n + Question: {question} + """ + + # Build the prompt and the chain that answers from the scraped content + merge_prompt = PromptTemplate( + template=template_json, + input_variables=["context", "question"], + partial_variables={"format_instructions": format_instructions}, + ) + merge_chain = merge_prompt | self.llm | output_parser + answer = merge_chain.invoke( + {"context": context, "question": user_input}) + + # Update the state with the generated answer + state.update({"answer": answer}) + return state \ No newline at end of file diff --git a/tests/graphs/graph_from_text_test.py b/tests/graphs/graph_from_text_test.py index 5c4154a8..c222e223 100644 --- a/tests/graphs/graph_from_text_test.py +++ b/tests/graphs/graph_from_text_test.py @@ -6,7 +6,7 @@ import unittest from unittest.mock import patch from scrapegraphai.models import OpenAI from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchTextNode, ParseNode, RAGNode, GenerateAnswerNode +from scrapegraphai.nodes import FetchTextNode, ParseNode, RAGNode, GenerateAnswerNodeFromRag class TestCustomGraph(unittest.TestCase): @@ -59,7 +59,7 @@ class TestCustomGraph(unittest.TestCase): parse_document_node = ParseNode( doc_type="text", chunks_size=20, node_name="parse_document") rag_node = RAGNode(model, "rag") - generate_answer_node = GenerateAnswerNode(model, "generate_answer") + generate_answer_node = GenerateAnswerNodeFromRag(model, "generate_answer") graph = BaseGraph( nodes={ From e0c5bfe09340dc4180b8d7c260d381a05e230a25 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Thu, 14 Mar 2024 21:03:34 +0100 Subject: [PATCH 4/4] add new example for gemini --- .../graph_examples/custom_graph_example.py | 29 +++++------ .../graph_examples/custom_graph_gemini.py | 49 +++++++++++++++++++ pyproject.toml | 2 +- 3 files changed, 61 insertions(+), 19 deletions(-) create mode 100644 
examples/graph_examples/custom_graph_gemini.py diff --git a/examples/graph_examples/custom_graph_example.py b/examples/graph_examples/custom_graph_example.py index c22cd369..8d4f1bd0 100644 --- a/examples/graph_examples/custom_graph_example.py +++ b/examples/graph_examples/custom_graph_example.py @@ -4,36 +4,27 @@ Example of custom graph using existing nodes import os from dotenv import load_dotenv -#from scrapegraphai.models import OpenAI -from scrapegraphai.models import Gemini +from scrapegraphai.models import OpenAI from scrapegraphai.graphs import BaseGraph -from scrapegraphai.nodes import FetchHTMLNode, ParseNode, GenerateAnswerNodeVanilla - - +from scrapegraphai.nodes import FetchHTMLNode, ParseNode, RAGNode, GenerateAnswerNodeVanilla load_dotenv() # Define the configuration for the language model -""" openai_key = os.getenv("OPENAI_APIKEY") +openai_key = os.getenv("OPENAI_APIKEY") llm_config = { "api_key": openai_key, "model_name": "gpt-3.5-turbo", "temperature": 0, "streaming": True } -model = OpenAI(llm_config) """ - -gemini_key = os.getenv("GOOGLE_API_KEY") -llm_config = { - "api_key": gemini_key, - "model_name": "gemini-pro", -} - -model = Gemini(llm_config) +model = OpenAI(llm_config) # define the nodes for the graph fetch_html_node = FetchHTMLNode("fetch_html") -parse_document_node = ParseNode(doc_type="html", chunks_size=4000, node_name="parse_document") +parse_document_node = ParseNode( + doc_type="html", chunks_size=4000, node_name="parse_document") +rag_node = RAGNode(model, "rag") generate_answer_node = GenerateAnswerNodeVanilla(model, "generate_answer") # create the graph @@ -41,18 +32,20 @@ graph = BaseGraph( nodes={ fetch_html_node, parse_document_node, + rag_node, generate_answer_node }, edges={ (fetch_html_node, parse_document_node), - (parse_document_node,generate_answer_node) + (parse_document_node, rag_node), + (rag_node, generate_answer_node) }, entry_point=fetch_html_node ) # execute the graph inputs = {"user_input": "List me the 
projects with their description", - "url": "https://perinim.github.io/projects/"} + "url": "https://perinim.github.io/projects/"} result = graph.execute(inputs) # get the answer from the result diff --git a/examples/graph_examples/custom_graph_gemini.py b/examples/graph_examples/custom_graph_gemini.py new file mode 100644 index 00000000..86117846 --- /dev/null +++ b/examples/graph_examples/custom_graph_gemini.py @@ -0,0 +1,49 @@ +""" +Example of custom graph using existing node using Gemini APIs +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.models import Gemini +from scrapegraphai.graphs import BaseGraph +from scrapegraphai.nodes import FetchHTMLNode, ParseNode, GenerateAnswerNodeVanilla + + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_API_KEY") +llm_config = { + "api_key": gemini_key, + "model_name": "gemini-pro", +} + +model = Gemini(llm_config) + +# define the nodes for the graph +fetch_html_node = FetchHTMLNode("fetch_html") +parse_document_node = ParseNode( + doc_type="html", chunks_size=4000, node_name="parse_document") +generate_answer_node = GenerateAnswerNodeVanilla(model, "generate_answer") + +# create the graph +graph = BaseGraph( + nodes={ + fetch_html_node, + parse_document_node, + generate_answer_node + }, + edges={ + (fetch_html_node, parse_document_node), + (parse_document_node, generate_answer_node) + }, + entry_point=fetch_html_node +) + +# execute the graph +inputs = {"user_input": "List me the projects with their description", + "url": "https://perinim.github.io/projects/"} +result = graph.execute(inputs) + +# get the answer from the result +answer = result.get("answer", "No answer found.") +print(answer) diff --git a/pyproject.toml b/pyproject.toml index 0a1a552e..ade0885a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scrapegraphai" -version = "0.0.8" +version = "0.0.9" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create 
scraping pipelines." authors = [ "Marco Vinciguerra ",