From d3e63d91be79f74e8a3fdb00e692d546c24cead5 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 12 Jul 2024 11:49:12 +0000 Subject: [PATCH 01/20] ci(release): 1.9.0-beta.3 [skip ci] ## [1.9.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.2...v1.9.0-beta.3) (2024-07-12) ### Bug Fixes * solve a burr integration ([881290b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/881290b5066b39c505532656671fbf65f8fc312c)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63eb6250..ac4e94f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.9.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.2...v1.9.0-beta.3) (2024-07-12) + + +### Bug Fixes + +* solve a burr integration ([881290b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/881290b5066b39c505532656671fbf65f8fc312c)) + ## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-07-05) diff --git a/pyproject.toml b/pyproject.toml index 30dad8df..2de923c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.9.0b2" +version = "1.9.0b3" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From a7249685cb2b133beeea439d1337cb1adeb64acd Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 14 Jul 2024 10:24:38 +0200 Subject: [PATCH 02/20] removed rag node --- scrapegraphai/graphs/csv_scraper_graph.py | 14 ++------------ scrapegraphai/graphs/json_scraper_graph.py | 14 ++------------ scrapegraphai/graphs/markdown_scraper_graph.py | 14 ++------------ scrapegraphai/graphs/omni_scraper_graph.py | 16 +++------------- scrapegraphai/graphs/pdf_scraper_graph.py | 13 +------------ scrapegraphai/graphs/smart_scraper_graph.py | 13 ++----------- scrapegraphai/graphs/xml_scraper_graph.py | 14 ++------------ scrapegraphai/nodes/generate_answer_csv_node.py | 4 ++-- scrapegraphai/nodes/generate_answer_node.py | 4 ++-- scrapegraphai/nodes/generate_answer_omni_node.py | 4 ++-- scrapegraphai/nodes/generate_answer_pdf_node.py | 2 +- 11 files changed, 21 insertions(+), 91 deletions(-) diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index ea205bb3..f4efd1fb 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerCSVNode ) @@ -37,14 +36,7 @@ class CSVScraperGraph(AbstractGraph): input="csv | csv_dir", output=["doc"], ) - rag_node = RAGNode( - input="user_prompt & doc", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model, - } - ) + generate_answer_node = GenerateAnswerCSVNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -58,12 +50,10 @@ class CSVScraperGraph(AbstractGraph): return BaseGraph( nodes=[ fetch_node, - rag_node, generate_answer_node, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index b85a34dc..fe54ebec 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerNode ) @@ -62,14 +61,7 @@ class JSONScraperGraph(AbstractGraph): input="json | json_dir", output=["doc", "link_urls", "img_urls"], ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -83,12 +75,10 @@ class JSONScraperGraph(AbstractGraph): return BaseGraph( nodes=[ fetch_node, - rag_node, generate_answer_node, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/markdown_scraper_graph.py b/scrapegraphai/graphs/markdown_scraper_graph.py index 66b161dc..c177facd 100644 --- a/scrapegraphai/graphs/markdown_scraper_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_graph.py @@ -3,7 +3,7 @@ import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph -from ..nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode +from ..nodes import FetchNode, ParseNode, GenerateAnswerNode class MDScraperGraph(AbstractGraph): """ @@ -63,14 +63,6 @@ class MDScraperGraph(AbstractGraph): "chunk_size": self.model_token } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -86,13 +78,11 @@ class MDScraperGraph(AbstractGraph): nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node, ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 7e34dab7..1965dc04 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -12,7 +12,6 @@ from ..nodes import ( FetchNode, ParseNode, ImageToTextNode, - RAGNode, GenerateAnswerOmniNode ) @@ -89,14 +88,7 @@ class OmniScraperGraph(AbstractGraph): "max_images": self.max_images } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_omni_node = GenerateAnswerOmniNode( input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc", output=["answer"], @@ -112,14 +104,12 @@ class OmniScraperGraph(AbstractGraph): fetch_node, parse_node, image_to_text_node, - rag_node, generate_answer_omni_node, ], edges=[ (fetch_node, parse_node), (parse_node, image_to_text_node), - (image_to_text_node, rag_node), - (rag_node, generate_answer_omni_node) + (image_to_text_node, generate_answer_omni_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ @@ -136,4 +126,4 @@ class OmniScraperGraph(AbstractGraph): inputs = {"user_prompt": self.prompt, self.input_key: self.source} self.final_state, self.execution_info = self.graph.execute(inputs) - return self.final_state.get("answer", "No answer found.") \ No newline at end of file + return self.final_state.get("answer", "No answer found.") diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 732b4789..049425d0 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -12,7 +12,6 @@ from .abstract_graph import AbstractGraph from ..nodes import ( FetchNode, ParseNode, - RAGNode, GenerateAnswerPDFNode ) @@ -76,14 +75,6 @@ class PDFScraperGraph(AbstractGraph): } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) generate_answer_node_pdf = GenerateAnswerPDFNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -98,13 +89,11 @@ class PDFScraperGraph(AbstractGraph): nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node_pdf, ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node_pdf) + (parse_node, generate_answer_node_pdf) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index ba27b60e..7862f88f 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -78,14 +78,7 @@ class SmartScraperGraph(AbstractGraph): "chunk_size": self.model_token } ) - rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | parsed_doc | doc)", output=["answer"], @@ -100,13 +93,11 @@ class SmartScraperGraph(AbstractGraph): nodes=[ fetch_node, parse_node, - rag_node, generate_answer_node, ], edges=[ (fetch_node, parse_node), - (parse_node, rag_node), - (rag_node, generate_answer_node) + (parse_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 28c58bb2..24b1ff0d 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -10,7 +10,6 @@ from .abstract_graph import AbstractGraph from ..nodes import ( FetchNode, - RAGNode, GenerateAnswerNode ) @@ -64,14 +63,7 @@ class XMLScraperGraph(AbstractGraph): input="xml | xml_dir", output=["doc", "link_urls", "img_urls"] ) - rag_node = RAGNode( - input="user_prompt & doc", - output=["relevant_chunks"], - node_config={ - "llm_model": self.llm_model, - "embedder_model": self.embedder_model - } - ) + generate_answer_node = GenerateAnswerNode( input="user_prompt & (relevant_chunks | doc)", output=["answer"], @@ -85,12 +77,10 @@ class XMLScraperGraph(AbstractGraph): return BaseGraph( nodes=[ fetch_node, - rag_node, generate_answer_node, ], edges=[ - (fetch_node, rag_node), - (rag_node, generate_answer_node) + (fetch_node, generate_answer_node) ], entry_point=fetch_node, graph_name=self.__class__.__name__ diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 58adb1d4..6008dbdd 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -125,7 +125,7 @@ class GenerateAnswerCSVNode(BaseNode): template=template_no_chunks_csv_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "format_instructions": format_instructions, }, ) @@ -137,7 +137,7 @@ class GenerateAnswerCSVNode(BaseNode): template=template_chunks_csv_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions, }, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index fabb4e66..3ea8a128 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -115,7 +115,7 @@ class GenerateAnswerNode(BaseNode): prompt = PromptTemplate( template=template_no_chunks_prompt, input_variables=["question"], - partial_variables={"context": chunk.page_content, + partial_variables={"context": chunk, "format_instructions": format_instructions}) chain = prompt | self.llm_model | output_parser answer = chain.invoke({"question": user_prompt}) @@ -124,7 +124,7 @@ class GenerateAnswerNode(BaseNode): prompt = PromptTemplate( template=template_chunks_prompt, input_variables=["question"], - partial_variables={"context": chunk.page_content, + partial_variables={"context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions}) # Dynamically name the chains based on their index diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index e6ea9206..f5474177 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -110,7 +110,7 @@ class GenerateAnswerOmniNode(BaseNode): template=template_no_chunk_omni_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "format_instructions": format_instructions, "img_desc": imag_desc, }, @@ -123,7 +123,7 @@ class GenerateAnswerOmniNode(BaseNode): template=template_chunks_omni_prompt, input_variables=["question"], partial_variables={ - "context": chunk.page_content, + "context": chunk, "chunk_id": i + 1, "format_instructions": format_instructions, }, diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index c6509f34..fac25c06 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -124,7 +124,7 @@ class GenerateAnswerPDFNode(BaseNode): template=template_no_chunks_pdf_prompt, input_variables=["question"], partial_variables={ - "context":chunk.page_content, + "context":chunk, "format_instructions": format_instructions, }, ) From cf3ab5564ae5c415c63d1771b32ea68f5169ca82 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 14 Jul 2024 16:49:29 +0200 Subject: [PATCH 03/20] fix: search link node --- scrapegraphai/nodes/search_link_node.py | 83 ++++++++++++++----------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 2a0c5f18..8c81d07b 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -4,6 +4,7 @@ SearchLinkNode Module # Imports from standard library from typing import List, Optional +import re from tqdm import tqdm # Imports from Langchain @@ -20,7 +21,7 @@ from .base_node import BaseNode class SearchLinkNode(BaseNode): """ A node that can filter out the relevant links in the webpage content for the user prompt. - Node expects the aleready scrapped links on the webpage and hence it is expected + Node expects the already scrapped links on the webpage and hence it is expected that this node be used after the FetchNode. Attributes: @@ -74,32 +75,6 @@ class SearchLinkNode(BaseNode): parsed_content_chunks = state[input_keys[1]] output_parser = JsonOutputParser() - prompt_relevant_links = """ - You are a website scraper and you have just scraped the following content from a website. - Content: {content} - - You are now tasked with identifying all hyper links within the content that are potentially - relevant to the user task: {user_prompt} - - Assume relevance broadly, including any links that might be related or potentially useful - in relation to the task. - - Sort it in order of importance, the first one should be the most important one, the last one - the least important - - Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain - whether the content at the link is directly relevant. - - Output only a list of relevant links in the format: - [ - "link1", - "link2", - "link3", - . - . - . - ] - """ relevant_links = [] for i, chunk in enumerate( @@ -109,15 +84,49 @@ class SearchLinkNode(BaseNode): disable=not self.verbose, ) ): - merge_prompt = PromptTemplate( - template=prompt_relevant_links, - input_variables=["content", "user_prompt"], - ) - merge_chain = merge_prompt | self.llm_model | output_parser - # merge_chain = merge_prompt | self.llm_model - answer = merge_chain.invoke( - {"content": chunk.page_content, "user_prompt": user_prompt} - ) - relevant_links += answer + try: + # Primary approach: Regular expression to extract links + links = re.findall(r'(https?://\S+)', chunk.page_content) + relevant_links += links + except Exception as e: + # Fallback approach: Using the LLM to extract links + self.logger.error(f"Error extracting links: {e}. Falling back to LLM.") + prompt_relevant_links = """ + You are a website scraper and you have just scraped the following content from a website. + Content: {content} + + You are now tasked with identifying all hyper links within the content that are potentially + relevant to the user task: {user_prompt} + + Assume relevance broadly, including any links that might be related or potentially useful + in relation to the task. + + Sort it in order of importance, the first one should be the most important one, the last one + the least important + + Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain + whether the content at the link is directly relevant. + + Output only a list of relevant links in the format: + [ + "link1", + "link2", + "link3", + . + . + . + ] + """ + + merge_prompt = PromptTemplate( + template=prompt_relevant_links, + input_variables=["content", "user_prompt"], + ) + merge_chain = merge_prompt | self.llm_model | output_parser + answer = merge_chain.invoke( + {"content": chunk.page_content, "user_prompt": user_prompt} + ) + relevant_links += answer + state.update({self.output[0]: relevant_links}) return state From 7e5789baa300d43deae3711176c24429d03521fd Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 14 Jul 2024 19:31:22 +0200 Subject: [PATCH 04/20] Update research_web.py --- scrapegraphai/utils/research_web.py | 38 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index ac7fc09d..b439b14d 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -1,6 +1,3 @@ -""" -research web module -""" import re from typing import List from langchain_community.tools import DuckDuckGoSearchResults @@ -8,41 +5,39 @@ from googlesearch import search as google_search import requests from bs4 import BeautifulSoup -def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10) -> List[str]: +def search_on_web(query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080) -> List[str]: """ Searches the web for a given query using specified search engine options. Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', or 'Bing'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearcNGX'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. + port (int, optional): The port number to use when searching with 'SearcNGX'. Default is 8080. Returns: List[str]: A list of URLs as strings that are the search results. Raises: - ValueError: If the search engine specified is neither 'Google', 'DuckDuckGo', nor 'Bing'. + ValueError: If the search engine specified is not supported. Example: >>> search_on_web("example query", search_engine="Google", max_results=5) ['http://example.com', 'http://example.org', ...] - - This function allows switching between Google, DuckDuckGo, and Bing to perform - internet searches, returning a list of result URLs. """ - + if search_engine.lower() == "google": res = [] for url in google_search(query, stop=max_results): res.append(url) return res - + elif search_engine.lower() == "duckduckgo": research = DuckDuckGoSearchResults(max_results=max_results) res = research.run(query) links = re.findall(r'https?://[^\s,\]]+', res) return links - + elif search_engine.lower() == "bing": headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" @@ -51,11 +46,24 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = response = requests.get(search_url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - + search_results = [] for result in soup.find_all('li', class_='b_algo', limit=max_results): link = result.find('a')['href'] search_results.append(link) return search_results - - raise ValueError("The only search engines available are DuckDuckGo, Google, or Bing") + + elif search_engine.lower() == "searcngx": + url = f"http://localhost:{port}" + params = {"q": query, "format": "json"} + + # Send the GET request to the server + response = requests.get(url, params=params) + + # Parse the response and limit to the specified max_results + data = response.json() + limited_results = data["results"][:max_results] + return limited_results + + else: + raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearcNGX") From 5c9218608140bf694fbfd96aa90276bc438bb475 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 14 Jul 2024 19:32:18 +0200 Subject: [PATCH 05/20] feat: add searchngx integration From fd1b7cb24a7c252277607abde35826e3c58e34ef Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Sun, 14 Jul 2024 22:36:43 +0200 Subject: [PATCH 06/20] chore: remove unused import --- scrapegraphai/graphs/smart_scraper_graph.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 7862f88f..cb4777a8 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -11,7 +11,6 @@ from .abstract_graph import AbstractGraph from ..nodes import ( FetchNode, ParseNode, - RAGNode, GenerateAnswerNode ) From 7ba2f6ae0b9d2e9336e973e1f57ab8355c739e27 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Sun, 14 Jul 2024 22:39:34 +0200 Subject: [PATCH 07/20] chore: correct search engine name --- scrapegraphai/utils/research_web.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index b439b14d..101693e4 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -11,9 +11,9 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = Args: query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearcNGX'. Default is 'Google'. + search_engine (str, optional): Specifies the search engine to use, options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. max_results (int, optional): The maximum number of search results to return. - port (int, optional): The port number to use when searching with 'SearcNGX'. Default is 8080. + port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. Returns: List[str]: A list of URLs as strings that are the search results. @@ -53,7 +53,7 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = search_results.append(link) return search_results - elif search_engine.lower() == "searcngx": + elif search_engine.lower() == "searxng": url = f"http://localhost:{port}" params = {"q": query, "format": "json"} @@ -66,4 +66,4 @@ def search_on_web(query: str, search_engine: str = "Google", max_results: int = return limited_results else: - raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearcNGX") + raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG") From 2fa04b58159abf7af890ebc0768fe23d51bf177f Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 14 Jul 2024 20:57:09 +0000 Subject: [PATCH 08/20] ci(release): 1.9.0-beta.4 [skip ci] ## [1.9.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.3...v1.9.0-beta.4) (2024-07-14) ### Features * add searchngx integration ([5c92186](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c9218608140bf694fbfd96aa90276bc438bb475)) ### chore * correct search engine name ([7ba2f6a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7ba2f6ae0b9d2e9336e973e1f57ab8355c739e27)) * remove unused import ([fd1b7cb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd1b7cb24a7c252277607abde35826e3c58e34ef)) --- CHANGELOG.md | 13 +++++++++++++ pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac4e94f0..3e50e973 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## [1.9.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.3...v1.9.0-beta.4) (2024-07-14) + + +### Features + +* add searchngx integration ([5c92186](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5c9218608140bf694fbfd96aa90276bc438bb475)) + + +### chore + +* correct search engine name ([7ba2f6a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7ba2f6ae0b9d2e9336e973e1f57ab8355c739e27)) +* remove unused import ([fd1b7cb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd1b7cb24a7c252277607abde35826e3c58e34ef)) + ## [1.9.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.2...v1.9.0-beta.3) (2024-07-12) diff --git a/pyproject.toml b/pyproject.toml index 2de923c9..a841a4a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.9.0b3" +version = "1.9.0b4" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From 57fdaf9e3a67e5006ffd5149fbbf1ec468ed16a4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 15 Jul 2024 12:36:11 +0200 Subject: [PATCH 09/20] create search_link_graph --- .../local_models/search_link_graph_ollama.py | 43 ++++++++ scrapegraphai/graphs/__init__.py | 1 + scrapegraphai/graphs/search_link_graph.py | 104 ++++++++++++++++++ scrapegraphai/nodes/search_link_node.py | 15 +-- 4 files changed, 153 insertions(+), 10 deletions(-) create mode 100644 examples/local_models/search_link_graph_ollama.py create mode 100644 scrapegraphai/graphs/search_link_graph.py diff --git a/examples/local_models/search_link_graph_ollama.py b/examples/local_models/search_link_graph_ollama.py new file mode 100644 index 00000000..5c594270 --- /dev/null +++ b/examples/local_models/search_link_graph_ollama.py @@ -0,0 +1,43 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +from scrapegraphai.graphs import SearchLinkGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, + "headless": False +} + +# ************************************************ +# Create the SearchLinkGraph instance and run it +# ************************************************ + +smart_scraper_graph = SearchLinkGraph( + source="https://sport.sky.it/nba?gr=www", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index b1bf1242..26a0b9e1 100644 --- a/scrapegraphai/graphs/__init__.py +++ b/scrapegraphai/graphs/__init__.py @@ -23,3 +23,4 @@ from .xml_scraper_multi_graph import XMLScraperMultiGraph from .script_creator_multi_graph import ScriptCreatorMultiGraph from .markdown_scraper_graph import MDScraperGraph from .markdown_scraper_multi_graph import MDScraperMultiGraph +from .search_link_graph import SearchLinkGraph diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py new file mode 100644 index 00000000..2e23357c --- /dev/null +++ b/scrapegraphai/graphs/search_link_graph.py @@ -0,0 +1,104 @@ +""" SearchLinkGraph Module """ +from typing import Optional +import logging +from pydantic import BaseModel +from .base_graph import BaseGraph +from .abstract_graph import AbstractGraph + + +from ..nodes import ( FetchNode, ParseNode, SearchLinkNode ) + +class SearchLinkGraph(AbstractGraph): + """ + SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model to interpret and answer prompts. + + Attributes: + prompt (str): The prompt for the graph. + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel): The schema for the graph output. + llm_model: An instance of a language model client, configured for generating answers. + embedder_model: An instance of an embedding model client, + configured for generating embeddings. + verbose (bool): A flag indicating whether to show print statements during execution. + headless (bool): A flag indicating whether to run the graph in headless mode. + + Args: + source (str): The source of the graph. + config (dict): Configuration parameters for the graph. + schema (BaseModel, optional): The schema for the graph output. Defaults to None. + + Example: + >>> smart_scraper = SearchLinkGraph( + ... "List me all the attractions in Chioggia.", + ... "https://en.wikipedia.org/wiki/Chioggia", + ... {"llm": {"model": "gpt-3.5-turbo"}} + ... ) + >>> result = smart_scraper.run() + """ + + def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None): + super().__init__("", config, source, schema) + + self.input_key = "url" if source.startswith("http") else "local_dir" + + def _create_graph(self) -> BaseGraph: + """ + Creates the graph of nodes representing the workflow for web scraping. + + Returns: + BaseGraph: A graph instance representing the web scraping workflow. + """ + + fetch_node = FetchNode( + input="url| local_dir", + output=["doc", "link_urls", "img_urls"], + node_config={ + "llm_model": self.llm_model, + "force": self.config.get("force", False), + "cut": self.config.get("cut", True), + "loader_kwargs": self.config.get("loader_kwargs", {}), + } + ) + parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": self.model_token + } + ) + search_link_node = SearchLinkNode( + input="doc", + output=["parsed_doc"], + node_config={ + "llm_model": self.llm_model, + "chunk_size": self.model_token + } + ) + + return BaseGraph( + nodes=[ + fetch_node, + parse_node, + search_link_node + ], + edges=[ + (fetch_node, parse_node), + (parse_node, search_link_node) + ], + entry_point=fetch_node, + graph_name=self.__class__.__name__ + ) + + def run(self) -> str: + """ + Executes the scraping process and returns the answer to the prompt. + + Returns: + str: The answer to the prompt. + """ + + inputs = {"user_prompt": self.prompt, self.input_key: self.source} + self.final_state, self.execution_info = self.graph.execute(inputs) + + return self.final_state.get("parsed_doc", "No answer found.") \ No newline at end of file diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 8c81d07b..ffc8a71b 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -68,11 +68,8 @@ class SearchLinkNode(BaseNode): self.logger.info(f"--- Executing {self.node_name} Node ---") - # Interpret input keys based on the provided input expression - input_keys = self.get_input_keys(state) - user_prompt = state[input_keys[0]] - parsed_content_chunks = state[input_keys[1]] + parsed_content_chunks = state.get("doc") output_parser = JsonOutputParser() relevant_links = [] @@ -86,7 +83,8 @@ class SearchLinkNode(BaseNode): ): try: # Primary approach: Regular expression to extract links - links = re.findall(r'(https?://\S+)', chunk.page_content) + links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content)) + relevant_links += links except Exception as e: # Fallback approach: Using the LLM to extract links @@ -95,9 +93,6 @@ class SearchLinkNode(BaseNode): You are a website scraper and you have just scraped the following content from a website. Content: {content} - You are now tasked with identifying all hyper links within the content that are potentially - relevant to the user task: {user_prompt} - Assume relevance broadly, including any links that might be related or potentially useful in relation to the task. @@ -124,9 +119,9 @@ class SearchLinkNode(BaseNode): ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke( - {"content": chunk.page_content, "user_prompt": user_prompt} + {"content": chunk.page_content} ) relevant_links += answer state.update({self.output[0]: relevant_links}) - return state + return state \ No newline at end of file From f42d47e2c36d9fb4a4add9235885f98d43e6083c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 15 Jul 2024 20:27:44 +0200 Subject: [PATCH 10/20] add examples --- examples/anthropic/search_link_graph_haiku.py | 57 +++++++++++++++++++ examples/azure/search_link_graph_azure.py | 52 +++++++++++++++++ examples/bedrock/search_link_graph_bedrock.py | 45 +++++++++++++++ .../deepseek/search_link_graph_deepseek.py | 52 +++++++++++++++++ examples/ernie/search_graph_ernie.py | 17 +++--- examples/ernie/search_link_graph_ernie.py | 46 +++++++++++++++ .../fireworks/search_link_graph_fireworks.py | 52 +++++++++++++++++ examples/gemini/search_link_graph_gemini.py | 44 ++++++++++++++ examples/groq/search_link_graph_groq.py | 52 +++++++++++++++++ examples/groq/smart_scraper_groq.py | 1 - .../search_link_graph_huggingfacehub.py | 54 ++++++++++++++++++ 11 files changed, 464 insertions(+), 8 deletions(-) create mode 100644 examples/anthropic/search_link_graph_haiku.py create mode 100644 examples/azure/search_link_graph_azure.py create mode 100644 examples/bedrock/search_link_graph_bedrock.py create mode 100644 examples/deepseek/search_link_graph_deepseek.py create mode 100644 examples/ernie/search_link_graph_ernie.py create mode 100644 examples/fireworks/search_link_graph_fireworks.py create mode 100644 examples/gemini/search_link_graph_gemini.py create mode 100644 examples/groq/search_link_graph_groq.py create mode 100644 examples/huggingfacehub/search_link_graph_huggingfacehub.py diff --git a/examples/anthropic/search_link_graph_haiku.py b/examples/anthropic/search_link_graph_haiku.py new file mode 100644 index 00000000..ccfbc1d2 --- /dev/null +++ b/examples/anthropic/search_link_graph_haiku.py @@ -0,0 +1,57 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from langchain_openai import AzureChatOpenAI +from langchain_openai import AzureOpenAIEmbeddings + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +llm_model_instance = AzureChatOpenAI( + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], + azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] +) + +embedder_model_instance = AzureOpenAIEmbeddings( + azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"], + openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"], +) + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/azure/search_link_graph_azure.py b/examples/azure/search_link_graph_azure.py new file mode 100644 index 00000000..f940c2a4 --- /dev/null +++ b/examples/azure/search_link_graph_azure.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "headless": False +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/search_link_graph_bedrock.py b/examples/bedrock/search_link_graph_bedrock.py new file mode 100644 index 00000000..116dea01 --- /dev/null +++ b/examples/bedrock/search_link_graph_bedrock.py @@ -0,0 +1,45 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/deepseek/search_link_graph_deepseek.py b/examples/deepseek/search_link_graph_deepseek.py new file mode 100644 index 00000000..30e4a9b3 --- /dev/null +++ b/examples/deepseek/search_link_graph_deepseek.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek-chat", + "openai_api_key": deepseek_key, + "openai_api_base": 'https://api.deepseek.com/v1', + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/ernie/search_graph_ernie.py b/examples/ernie/search_graph_ernie.py index 22802c6e..c04d9f9b 100644 --- a/examples/ernie/search_graph_ernie.py +++ b/examples/ernie/search_graph_ernie.py @@ -12,15 +12,18 @@ load_dotenv() # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("OPENAI_APIKEY") - graph_config = { "llm": { - "api_key": openai_key, - "model": "gpt-3.5-turbo", - }, - "max_results": 2, - "verbose": True, + "model": "ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434"}, + "library": "beautifulsoup" } # ************************************************ diff --git a/examples/ernie/search_link_graph_ernie.py b/examples/ernie/search_link_graph_ernie.py new file mode 100644 index 00000000..466b230c --- /dev/null +++ b/examples/ernie/search_link_graph_ernie.py @@ -0,0 +1,46 @@ +""" +Example of Search Graph +""" +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434"}, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/fireworks/search_link_graph_fireworks.py b/examples/fireworks/search_link_graph_fireworks.py new file mode 100644 index 00000000..a1d3a979 --- /dev/null +++ b/examples/fireworks/search_link_graph_fireworks.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct" + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "max_results": 2, + "verbose": True, + "headless": False, +} +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/gemini/search_link_graph_gemini.py b/examples/gemini/search_link_graph_gemini.py new file mode 100644 index 00000000..937038bd --- /dev/null +++ b/examples/gemini/search_link_graph_gemini.py @@ -0,0 +1,44 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "gemini-pro", + }, +} +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/groq/search_link_graph_groq.py b/examples/groq/search_link_graph_groq.py new file mode 100644 index 00000000..f940c2a4 --- /dev/null +++ b/examples/groq/search_link_graph_groq.py @@ -0,0 +1,52 @@ +""" +Example of Search Graph +""" +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +load_dotenv() + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "headless": False +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/groq/smart_scraper_groq.py b/examples/groq/smart_scraper_groq.py index c1a5d319..f828cdec 100644 --- a/examples/groq/smart_scraper_groq.py +++ b/examples/groq/smart_scraper_groq.py @@ -9,7 +9,6 @@ from scrapegraphai.utils import prettify_exec_info load_dotenv() - # ************************************************ # Define the configuration for the graph # ************************************************ diff --git a/examples/huggingfacehub/search_link_graph_huggingfacehub.py b/examples/huggingfacehub/search_link_graph_huggingfacehub.py new file mode 100644 index 00000000..a49fb3b9 --- /dev/null +++ b/examples/huggingfacehub/search_link_graph_huggingfacehub.py @@ -0,0 +1,54 @@ +""" +Example of Search Graph +""" +import os +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from langchain_community.llms import HuggingFaceEndpoint +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings + +# ************************************************ +# Define the configuration for the graph +# ************************************************ +# ************************************************ + +HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') + +repo_id = "mistralai/Mistral-7B-Instruct-v0.2" + +llm_model_instance = HuggingFaceEndpoint( + repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN +) + +embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" +) + +graph_config = { + "llm": {"model_instance": llm_model_instance}, + "embeddings": {"model_instance": embedder_model_instance} +} + + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me the best escursions near Trento", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") From da0b744443ba8955f2336592220f4778d6f15f52 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 15 Jul 2024 20:46:22 +0200 Subject: [PATCH 11/20] add test --- tests/graphs/search_link_ollama.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/graphs/search_link_ollama.py diff --git a/tests/graphs/search_link_ollama.py b/tests/graphs/search_link_ollama.py new file mode 100644 index 00000000..3b41f699 --- /dev/null +++ b/tests/graphs/search_link_ollama.py @@ -0,0 +1,26 @@ +from scrapegraphai.graphs import SearchLinkGraph +from scrapegraphai.utils import prettify_exec_info + +def test_smart_scraper_pipeline(): + graph_config = { + "llm": { + "model": "ollama/llama3", + "temperature": 0, + "format": "json", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + }, + "verbose": True, + "headless": False + } + + smart_scraper_graph = SearchLinkGraph( + source="https://sport.sky.it/nba?gr=www", + config=graph_config + ) + + result = smart_scraper_graph.run() + + assert result is not None From 830daee1f32c01728bec180787de8808b275a17e Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 15 Jul 2024 20:47:09 +0200 Subject: [PATCH 12/20] Update search_link_node.py --- scrapegraphai/nodes/search_link_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index ffc8a71b..b3d289d9 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -124,4 +124,4 @@ class SearchLinkNode(BaseNode): relevant_links += answer state.update({self.output[0]: relevant_links}) - return state \ No newline at end of file + return state From bb624399cfc3924825892dd48697fc298ad3b002 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Mon, 15 Jul 2024 21:07:35 +0000 Subject: [PATCH 13/20] ci(release): 1.9.0-beta.5 [skip ci] ## [1.9.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.4...v1.9.0-beta.5) (2024-07-15) ### Bug Fixes * search link node ([cf3ab55](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cf3ab5564ae5c415c63d1771b32ea68f5169ca82)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e50e973..cf1a1e7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.9.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.4...v1.9.0-beta.5) (2024-07-15) + + +### Bug Fixes + +* search link node ([cf3ab55](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cf3ab5564ae5c415c63d1771b32ea68f5169ca82)) + ## [1.9.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.3...v1.9.0-beta.4) (2024-07-14) diff --git a/pyproject.toml b/pyproject.toml index a841a4a0..dc011516 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.9.0b4" +version = "1.9.0b5" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From 602dd00209ee1d72a1223fc4793759450921fcf9 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Tue, 16 Jul 2024 12:39:48 +0200 Subject: [PATCH 14/20] feat: refactoring_to_md function --- pyproject.toml | 1 - requirements-dev.lock | 31 --------------------------- requirements.lock | 32 ---------------------------- scrapegraphai/utils/convert_to_md.py | 8 +++---- tests/utils/convert_to_md_test.py | 6 +++--- 5 files changed, 6 insertions(+), 72 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dc011516..59e9ef1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ dependencies = [ "undetected-playwright==0.3.0", "semchunk==1.0.1", "html2text==2024.2.26", - "trafilatura==1.10.0", "langchain-fireworks==0.1.3" ] diff --git a/requirements-dev.lock b/requirements-dev.lock index f3d4786c..dcc4744e 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -41,7 +41,6 @@ attrs==23.2.0 # via jsonschema # via referencing babel==2.15.0 - # via courlan # via sphinx beautifulsoup4==4.12.3 # via furo @@ -63,11 +62,8 @@ certifi==2024.2.2 # via httpcore # via httpx # via requests - # via trafilatura charset-normalizer==3.3.2 - # via htmldate # via requests - # via trafilatura click==8.1.7 # via burr # via streamlit @@ -75,15 +71,11 @@ click==8.1.7 # via uvicorn contourpy==1.2.1 # via matplotlib -courlan==1.2.0 - # via trafilatura cycler==0.12.1 # via matplotlib dataclasses-json==0.6.6 # via langchain # via langchain-community -dateparser==1.2.0 - # via htmldate defusedxml==0.7.1 # via langchain-anthropic dill==0.3.8 @@ -204,8 +196,6 @@ h11==0.14.0 # via uvicorn html2text==2024.2.26 # via scrapegraphai -htmldate==1.8.1 - # via trafilatura httpcore==1.0.5 # via httpx httplib2==0.22.0 @@ -259,8 +249,6 @@ jsonschema==4.22.0 # via altair jsonschema-specifications==2023.12.1 # via jsonschema -justext==3.0.1 - # via trafilatura kiwisolver==1.4.5 # via matplotlib langchain==0.1.15 @@ -302,12 +290,6 @@ loguru==0.7.2 # via burr lxml==5.2.2 # via free-proxy - # via htmldate - # via justext - # via lxml-html-clean - # via trafilatura -lxml-html-clean==0.1.1 - # via lxml markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -430,9 +412,7 @@ pytest==8.0.0 pytest-mock==3.14.0 python-dateutil==2.9.0.post0 # via botocore - # via dateparser # via google-cloud-bigquery - # via htmldate # via matplotlib # via pandas python-dotenv==1.0.1 @@ -441,7 +421,6 @@ python-dotenv==1.0.1 python-multipart==0.0.9 # via fastapi pytz==2024.1 - # via dateparser # via pandas pyyaml==6.0.1 # via huggingface-hub @@ -453,7 +432,6 @@ referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.5.15 - # via dateparser # via tiktoken requests==2.32.2 # via burr @@ -534,8 +512,6 @@ tenacity==8.3.0 tiktoken==0.6.0 # via langchain-openai # via scrapegraphai -tld==0.13 - # via courlan tokenizers==0.19.1 # via anthropic toml==0.10.2 @@ -555,8 +531,6 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk -trafilatura==1.10.0 - # via scrapegraphai typer==0.12.3 # via fastapi-cli typing-extensions==4.12.0 @@ -586,8 +560,6 @@ typing-inspect==0.9.0 # via sf-hamilton tzdata==2024.1 # via pandas -tzlocal==5.2 - # via dateparser ujson==5.10.0 # via fastapi undetected-playwright==0.3.0 @@ -596,10 +568,7 @@ uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.18 # via botocore - # via courlan - # via htmldate # via requests - # via trafilatura uvicorn==0.29.0 # via burr # via fastapi diff --git a/requirements.lock b/requirements.lock index 21b276eb..ad1c7ed7 100644 --- a/requirements.lock +++ b/requirements.lock @@ -28,8 +28,6 @@ async-timeout==4.0.3 # via langchain attrs==23.2.0 # via aiohttp -babel==2.15.0 - # via courlan beautifulsoup4==4.12.3 # via google # via scrapegraphai @@ -44,18 +42,11 @@ certifi==2024.2.2 # via httpcore # via httpx # via requests - # via trafilatura charset-normalizer==3.3.2 - # via htmldate # via requests - # via trafilatura -courlan==1.2.0 - # via trafilatura dataclasses-json==0.6.6 # via langchain # via langchain-community -dateparser==1.2.0 - # via htmldate defusedxml==0.7.1 # via langchain-anthropic distro==1.9.0 @@ -150,8 +141,6 @@ h11==0.14.0 # via httpcore html2text==2024.2.26 # via scrapegraphai -htmldate==1.8.1 - # via trafilatura httpcore==1.0.5 # via httpx httplib2==0.22.0 @@ -181,8 +170,6 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -justext==3.0.1 - # via trafilatura langchain==0.1.15 # via scrapegraphai langchain-anthropic==0.1.11 @@ -220,12 +207,6 @@ langsmith==0.1.63 # via langchain-core lxml==5.2.2 # via free-proxy - # via htmldate - # via justext - # via lxml-html-clean - # via trafilatura -lxml-html-clean==0.1.1 - # via lxml marshmallow==3.21.2 # via dataclasses-json minify-html==0.15.0 @@ -298,14 +279,11 @@ pyparsing==3.1.2 # via httplib2 python-dateutil==2.9.0.post0 # via botocore - # via dateparser # via google-cloud-bigquery - # via htmldate # via pandas python-dotenv==1.0.1 # via scrapegraphai pytz==2024.1 - # via dateparser # via pandas pyyaml==6.0.1 # via huggingface-hub @@ -313,7 +291,6 @@ pyyaml==6.0.1 # via langchain-community # via langchain-core regex==2024.5.15 - # via dateparser # via tiktoken requests==2.32.2 # via free-proxy @@ -354,8 +331,6 @@ tenacity==8.3.0 tiktoken==0.6.0 # via langchain-openai # via scrapegraphai -tld==0.13 - # via courlan tokenizers==0.19.1 # via anthropic tqdm==4.66.4 @@ -364,8 +339,6 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk -trafilatura==1.10.0 - # via scrapegraphai typing-extensions==4.12.0 # via anthropic # via anyio @@ -382,17 +355,12 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.1 # via pandas -tzlocal==5.2 - # via dateparser undetected-playwright==0.3.0 # via scrapegraphai uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.18 # via botocore - # via courlan - # via htmldate # via requests - # via trafilatura yarl==1.9.4 # via aiohttp diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index a2ec04db..35123042 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -2,8 +2,6 @@ convert_to_md modul """ import html2text -from trafilatura import extract - def convert_to_md(html): """ Convert HTML to Markdown. @@ -20,6 +18,6 @@ def convert_to_md(html): 'This is a paragraph.\n\n# This is a heading.' Note: All the styles and links are ignored during the conversion. """ - - return extract(filecontent=html,include_images=True, - include_links=True, include_tables=True, output_format="markdown") + h = html2text.HTML2Text() + h.ignore_links = False + return h.handle(html) diff --git a/tests/utils/convert_to_md_test.py b/tests/utils/convert_to_md_test.py index 0b6d552e..72270913 100644 --- a/tests/utils/convert_to_md_test.py +++ b/tests/utils/convert_to_md_test.py @@ -7,7 +7,7 @@ def test_basic_html_to_md(): def test_html_with_links_and_images(): html = '

This is a link and this is an image

' - assert convert_to_md(html) is None + assert convert_to_md(html) is not None def test_html_with_tables(): html = ''' @@ -17,11 +17,11 @@ def test_html_with_tables(): Row 2, Cell 1Row 2, Cell 2 ''' - assert convert_to_md(html) is None + assert convert_to_md(html) is not None def test_empty_html(): html = "" - assert convert_to_md(html) is None + assert convert_to_md(html) is not None def test_complex_html_structure(): html = ''' From 54a69de69e8077e02fd5584783ca62cc2e0ec5bb Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 17 Jul 2024 08:06:07 +0000 Subject: [PATCH 15/20] ci(release): 1.9.0-beta.6 [skip ci] ## [1.9.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.5...v1.9.0-beta.6) (2024-07-17) ### Features * refactoring_to_md function ([602dd00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/602dd00209ee1d72a1223fc4793759450921fcf9)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf1a1e7a..cbd47fe8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.9.0-beta.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.5...v1.9.0-beta.6) (2024-07-17) + + +### Features + +* refactoring_to_md function ([602dd00](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/602dd00209ee1d72a1223fc4793759450921fcf9)) + ## [1.9.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.9.0-beta.4...v1.9.0-beta.5) (2024-07-15) diff --git a/pyproject.toml b/pyproject.toml index 59e9ef1a..369113c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.9.0b5" +version = "1.9.0b6" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From e2ca41e928b9b8868fadc2bfd5bb02a9ff71f709 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 18 Jul 2024 10:24:24 +0200 Subject: [PATCH 16/20] add example --- examples/openai/search_link_graph_openai.py | 36 +++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 examples/openai/search_link_graph_openai.py diff --git a/examples/openai/search_link_graph_openai.py b/examples/openai/search_link_graph_openai.py new file mode 100644 index 00000000..10d10d4c --- /dev/null +++ b/examples/openai/search_link_graph_openai.py @@ -0,0 +1,36 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +from scrapegraphai.graphs import SearchLinkGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": "s", + "model": "gpt-3.5-turbo", + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SearchLinkGraph instance and run it +# ************************************************ + +smart_scraper_graph = SearchLinkGraph( + source="https://sport.sky.it/nba?gr=www", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) From 7314bc383068db590662bf7e512f799529308991 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:56:25 +0200 Subject: [PATCH 17/20] chore: upgrade tiktoken --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 369113c5..f77bb185 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "beautifulsoup4==4.12.3", "pandas==2.2.2", "python-dotenv==1.0.1", - "tiktoken==0.6.0", + "tiktoken==0.7", "tqdm==4.66.4", "graphviz==0.20.3", "minify-html==0.15.0", From dfa759e8340828cbed1f899b325fab507fe89a78 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Thu, 18 Jul 2024 11:02:51 +0200 Subject: [PATCH 18/20] style(models): fix module docstrings --- scrapegraphai/models/bedrock.py | 2 +- scrapegraphai/models/ernie.py | 2 +- scrapegraphai/models/oneapi.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/models/bedrock.py b/scrapegraphai/models/bedrock.py index b7cbe288..06299075 100644 --- a/scrapegraphai/models/bedrock.py +++ b/scrapegraphai/models/bedrock.py @@ -1,5 +1,5 @@ """ -bedrock configuration wrapper +Bedrock Module """ from langchain_aws import ChatBedrock diff --git a/scrapegraphai/models/ernie.py b/scrapegraphai/models/ernie.py index 0b4701e1..75e2a261 100644 --- a/scrapegraphai/models/ernie.py +++ b/scrapegraphai/models/ernie.py @@ -1,5 +1,5 @@ """ -Ollama Module +Ernie Module """ from langchain_community.chat_models import ErnieBotChat diff --git a/scrapegraphai/models/oneapi.py b/scrapegraphai/models/oneapi.py index 00dddbf9..54e846d9 100644 --- a/scrapegraphai/models/oneapi.py +++ b/scrapegraphai/models/oneapi.py @@ -1,5 +1,5 @@ """ -OpenAI Module +OneAPI Module """ from langchain_openai import ChatOpenAI From c7b05a4993df14d6ed4848121a3cd209571232f7 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Thu, 18 Jul 2024 11:23:10 +0200 Subject: [PATCH 19/20] chore(ci): upgrade lockfiles --- requirements-dev.lock | 3 ++- requirements.lock | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index dcc4744e..300d05f5 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -180,6 +180,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.8.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -509,7 +510,7 @@ tenacity==8.3.0 # via langchain-community # via langchain-core # via streamlit -tiktoken==0.6.0 +tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 diff --git a/requirements.lock b/requirements.lock index ad1c7ed7..2e9cf828 100644 --- a/requirements.lock +++ b/requirements.lock @@ -126,6 +126,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.8.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -328,7 +329,7 @@ tenacity==8.3.0 # via langchain # via langchain-community # via langchain-core -tiktoken==0.6.0 +tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 From a3d0aacff5c55c83dbc84723526856a7f38fbb42 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 18 Jul 2024 11:30:25 +0200 Subject: [PATCH 20/20] update env --- requirements-dev.lock | 1 - requirements.lock | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements-dev.lock b/requirements-dev.lock index 300d05f5..b0bcaaa0 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -180,7 +180,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.8.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.lock b/requirements.lock index 2e9cf828..7a8bb455 100644 --- a/requirements.lock +++ b/requirements.lock @@ -126,7 +126,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.8.0 # via langchain-groq grpc-google-iam-v1==0.13.1