From cec4c43c64f582c18d8d7e6038a511884780d68d Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Tue, 9 Apr 2024 10:23:38 +0200 Subject: [PATCH 1/3] add new local models context window --- scrapegraphai/graphs/abstract_graph.py | 12 +++++++----- scrapegraphai/graphs/smart_scraper_graph.py | 1 - scrapegraphai/graphs/speech_graph.py | 4 ++-- scrapegraphai/helpers/models_tokens.py | 6 ++++-- scrapegraphai/nodes/fetch_node.py | 4 ++-- scrapegraphai/nodes/rag_node.py | 12 +++++++----- scrapegraphai/nodes/search_internet_node.py | 2 +- 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 79eea199..4014aae4 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -6,6 +6,7 @@ from typing import Optional from ..models import OpenAI, Gemini, Ollama, AzureOpenAI from ..helpers import models_tokens + class AbstractGraph(ABC): """ Abstract class representing a generic graph-based tool. @@ -19,7 +20,8 @@ class AbstractGraph(ABC): self.source = source self.config = config self.llm_model = self._create_llm(config["llm"]) - self.embedder_model = None if "embeddings" not in config else self._create_llm(config["embeddings"]) + self.embedder_model = None if "embeddings" not in config else self._create_llm( + config["embeddings"]) self.graph = self._create_graph() def _create_llm(self, llm_config: dict): @@ -39,7 +41,7 @@ class AbstractGraph(ABC): except KeyError: raise ValueError("Model not supported") return OpenAI(llm_params) - + elif "azure" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -48,14 +50,14 @@ class AbstractGraph(ABC): except KeyError: raise ValueError("Model not supported") return AzureOpenAI(llm_params) - + elif "gemini" in llm_params["model"]: try: self.model_token = models_tokens["gemini"][llm_params["model"]] except KeyError: raise ValueError("Model not supported") return Gemini(llm_params) - + elif "ollama" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -64,7 +66,7 @@ class AbstractGraph(ABC): except KeyError: raise ValueError("Model not supported") return Ollama(llm_params) - + else: raise ValueError("Model not supported") diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 06f7d010..e4c32100 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -1,7 +1,6 @@ """ Module for creating the smart scraper """ -from ..models import OpenAI, Gemini from .base_graph import BaseGraph from ..nodes import ( FetchNode, diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index 28d0cdfa..a34fa2f1 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -2,7 +2,7 @@ Module for converting text to speach """ from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes -from ..models import OpenAI, Gemini, OpenAITextToSpeech +from ..models import OpenAITextToSpeech from .base_graph import BaseGraph from ..nodes import ( FetchNode, @@ -27,7 +27,7 @@ class SpeechGraph(AbstractGraph): super().__init__(prompt, config, source) self.input_key = "url" if source.startswith("http") else "local_dir" - + def _create_graph(self): """ Creates the graph of nodes representing the workflow for web scraping and summarization. diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index dc286526..5f9aa743 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -21,9 +21,11 @@ models_tokens = { "gemini-pro": 128000, }, - "ollama":{ + "ollama": { "llama2": 4096, "mistral": 8192, + "codellama": 16000, + "dolphin-mixtral": 32000, + "mistral-openorca": 32000, } - } diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 75f53655..12f69240 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -67,8 +67,8 @@ class FetchNode(BaseNode): # Fetching data from the state based on the input keys input_data = [state[key] for key in input_keys] - source = input_data[0] - + source = input_data[0] + # if it is a local directory if not source.startswith("http"): document = [Document(page_content=source, metadata={ diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 4ddabe66..adda5c33 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -8,8 +8,8 @@ from langchain.retrievers import ContextualCompressionRetriever from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS -from ..models import OpenAI, Gemini, Ollama, AzureOpenAI from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings +from ..models import OpenAI, Ollama, AzureOpenAI from langchain_community.embeddings import OllamaEmbeddings from .base_node import BaseNode @@ -86,16 +86,18 @@ class RAGNode(BaseNode): embedding_model = self.embedder_model if self.embedder_model else self.llm_model if isinstance(embedding_model, OpenAI): - embeddings = OpenAIEmbeddings(api_key=embedding_model.openai_api_key) + embeddings = OpenAIEmbeddings( + api_key=embedding_model.openai_api_key) elif isinstance(embedding_model, AzureOpenAI): embeddings = AzureOpenAIEmbeddings() elif isinstance(embedding_model, Ollama): embeddings = OllamaEmbeddings() else: raise ValueError("Embedding Model missing or not supported") - - retriever = FAISS.from_documents(chunked_docs, embeddings).as_retriever() - + + retriever = FAISS.from_documents( + chunked_docs, embeddings).as_retriever() + redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings) # similarity_threshold could be set, now k=20 relevant_filter = EmbeddingsFilter(embeddings=embeddings) diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 6220441d..8d5813b2 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -94,7 +94,7 @@ class SearchInternetNode(BaseNode): # Execute the chain to get the search query search_answer = search_prompt | self.llm_model | output_parser search_query = search_answer.invoke({"user_prompt": user_prompt})[0] - + print(f"Search Query: {search_query}") # TODO: handle multiple URLs answer = search_on_web(query=search_query, max_results=1)[0] From 992e7f862fee04271b0ab5db6a9198f28d76a4fd Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Tue, 9 Apr 2024 10:33:45 +0200 Subject: [PATCH 2/3] add models avaiables --- scrapegraphai/graphs/abstract_graph.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 4014aae4..f08b0f33 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -59,7 +59,14 @@ class AbstractGraph(ABC): return Gemini(llm_params) elif "ollama" in llm_params["model"]: - # take the model after the last dash + """ + Avaiable models: + - llama2 + - mistral + - codellama + - dolphin-mixtral + - mistral-openorca + """ llm_params["model"] = llm_params["model"].split("/")[-1] try: self.model_token = models_tokens["ollama"][llm_params["model"]] From a25e7ea32bd5daec15d29956a000b7faac1c042a Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Tue, 9 Apr 2024 11:45:20 +0200 Subject: [PATCH 3/3] add new version --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index f8db368b..e47aec4d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1117,13 +1117,13 @@ extended-testing = ["lxml (>=5.1.0,<6.0.0)"] [[package]] name = "langsmith" -version = "0.1.40" +version = "0.1.41" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.40-py3-none-any.whl", hash = "sha256:aa47d0f5a1eabd5c05ac6ce2cd3e28ccfc554d366e856a27b7c3c17c443881cb"}, - {file = "langsmith-0.1.40.tar.gz", hash = "sha256:50fdf313741cf94e978de06025fd180b56acf1d1a4549b0fd5453ef23d5461ef"}, + {file = "langsmith-0.1.41-py3-none-any.whl", hash = "sha256:11de22b6990502c630fdfdf6906681e664c6659d2118bcd2b79d08016e770831"}, + {file = "langsmith-0.1.41.tar.gz", hash = "sha256:1250cd6c9074ca10d40002b23d79b3017329b139fbe954248fdd7a79544e90d0"}, ] [package.dependencies] diff --git a/pyproject.toml b/pyproject.toml index 22ffb592..1d7dfe1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scrapegraphai" -version = "0.1.0" +version = "0.1.1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ "Marco Vinciguerra ",