diff --git a/CHANGELOG.md b/CHANGELOG.md index b5092c45..3247613c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,8 @@ * conditional node ([ce00345](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ce003454953e5785d4746223c252de38cd5d07ea)) ## [1.13.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.12.2...v1.13.0) (2024-08-09) +## [1.13.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.8...v1.13.0-beta.9) (2024-08-10) + ### Features @@ -65,6 +67,15 @@ * **release:** 1.13.0-beta.5 [skip ci] ([2eba73b](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/2eba73b784ee443260117e98ab7c943934b3018d)), closes [#513](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/513) * **release:** 1.13.0-beta.6 [skip ci] ([e75b574](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e75b574b67040e127599da9ee1b0eee13d234cb9)) * **release:** 1.13.0-beta.7 [skip ci] ([6e56925](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6e56925355c424edae290c70fd98646ab5f420ee)) +* add refactoring of default temperature ([6c3b37a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6c3b37ab001b80c09ea9ffb56d4c3df338e33a7a)) + +## [1.13.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.7...v1.13.0-beta.8) (2024-08-09) + + +### Bug Fixes + +* broken node ([1272273](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/127227349915deeb0dede34aa575ad269ed7cbe3)) + ## [1.13.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.6...v1.13.0-beta.7) (2024-08-09) diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index b161cd0f..14fe622f 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -14,7 +14,6 @@ graph_config = { "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - 
"verbose": True, "headless": False } @@ -23,8 +22,13 @@ graph_config = { # Create the SmartScraperGraph instance and run it # ************************************************ smart_scraper_graph = SmartScraperGraph( +<<<<<<< Updated upstream prompt="Find some information about what does the company do, the name and a contact email.", source="https://scrapegraphai.com/", +======= + prompt="List all the projects with their descriptions", + source="https://perinim.github.io/projects/", +>>>>>>> Stashed changes config=graph_config ) diff --git a/pyproject.toml b/pyproject.toml index 51160d68..b05ed3ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,8 @@ name = "scrapegraphai" version = "1.13.3" +version = "1.13.0b9" + description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 83b532bc..f22f764c 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,11 +7,9 @@ from typing import Optional import uuid import warnings from pydantic import BaseModel - from langchain_community.chat_models import ErnieBotChat from langchain_nvidia_ai_endpoints import ChatNVIDIA from langchain.chat_models import init_chat_model - from ..helpers import models_tokens from ..models import ( OneApi, @@ -19,8 +17,6 @@ from ..models import ( ) from ..utils.logging import set_verbosity_warning, set_verbosity_info - - class AbstractGraph(ABC): """ Scaffolding class for creating a graph representation and executing it. 
@@ -53,6 +49,9 @@ class AbstractGraph(ABC): def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[BaseModel] = None): + if config.get("llm").get("temperature") is None: + config["llm"]["temperature"] = 0 + self.prompt = prompt self.source = source self.config = config @@ -212,7 +211,7 @@ class AbstractGraph(ABC): print("model not found, using default token size (8192)") self.model_token = 8192 return ErnieBotChat(llm_params) - + if "oneapi" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -221,7 +220,7 @@ class AbstractGraph(ABC): except KeyError as exc: raise KeyError("Model not supported") from exc return OneApi(llm_params) - + if "nvidia" in llm_params["model"]: try: self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index c441f7ab..f442ac21 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -6,9 +6,7 @@ import warnings from typing import Tuple from langchain_community.callbacks import get_openai_callback from ..integrations import BurrBridge - -# Import telemetry functions -from ..telemetry import log_graph_execution, log_event +from ..telemetry import log_graph_execution class BaseGraph: """ diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index d7ec186e..42153be5 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -4,16 +4,13 @@ Module for creating the smart scraper from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, GenerateAnswerCSVNode ) - class CSVScraperGraph(AbstractGraph): """ SmartScraper is a comprehensive web scraping tool that automates the process of extracting diff 
--git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index 716e9aca..808549aa 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -4,22 +4,19 @@ CSVScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional - from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .csv_scraper_graph import CSVScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class CSVScraperMultiGraph(AbstractGraph): """ - CSVScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + CSVScraperMultiGraph is a scraping pipeline that + scrapes a list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. Attributes: @@ -44,7 +41,8 @@ class CSVScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index 43a461d0..ca617d19 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -4,10 +4,8 @@ DeepScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, SearchLinkNode, @@ -18,7 +16,6 @@ from ..nodes import ( MergeAnswersNode ) - class DeepScraperGraph(AbstractGraph): """ [WIP] @@ -87,7 +84,6 @@ class DeepScraperGraph(AbstractGraph): output=["relevant_chunks"], node_config={ "llm_model": self.llm_model, - "embedder_model": 
self.embedder_model } ) generate_answer_node = GenerateAnswerNode( @@ -104,7 +100,6 @@ class DeepScraperGraph(AbstractGraph): output=["relevant_links"], node_config={ "llm_model": self.llm_model, - "embedder_model": self.embedder_model } ) graph_iterator_node = GraphIteratorNode( diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index fe54ebec..a23c1f38 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -4,16 +4,13 @@ JSONScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, GenerateAnswerNode ) - class JSONScraperGraph(AbstractGraph): """ JSONScraperGraph defines a scraping pipeline for JSON files. diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 48fd8217..da7f33ba 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -5,20 +5,18 @@ JSONScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .json_scraper_graph import JSONScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class JSONScraperMultiGraph(AbstractGraph): """ - JSONScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + JSONScraperMultiGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. 
Attributes: diff --git a/scrapegraphai/graphs/markdown_scraper_multi_graph.py b/scrapegraphai/graphs/markdown_scraper_multi_graph.py index ec47f74d..e59f6e5a 100644 --- a/scrapegraphai/graphs/markdown_scraper_multi_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_multi_graph.py @@ -5,17 +5,14 @@ MDScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .markdown_scraper_graph import MDScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class MDScraperMultiGraph(AbstractGraph): """ MDScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 1965dc04..6849ee12 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -4,17 +4,14 @@ OmniScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, ImageToTextNode, GenerateAnswerOmniNode ) - from ..models import OpenAIImageToText class OmniScraperGraph(AbstractGraph): diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 049425d0..ae783aba 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -5,17 +5,14 @@ PDFScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, GenerateAnswerPDFNode ) - class PDFScraperGraph(AbstractGraph): """ PDFScraperGraph is a scraping pipeline that extracts information from pdf files using a natural diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py 
b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index f9b3061b..6803e27a 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -5,17 +5,14 @@ PdfScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .pdf_scraper_graph import PDFScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class PdfScraperMultiGraph(AbstractGraph): """ PdfScraperMultiGraph is a scraping pipeline that scrapes a diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index a4d1d6f6..bb5629c5 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -4,17 +4,14 @@ ScriptCreatorGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, GenerateScraperNode ) - class ScriptCreatorGraph(AbstractGraph): """ ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts. diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index 0bafd561..969ba722 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -16,10 +16,10 @@ from ..nodes import ( MergeGeneratedScriptsNode ) - class ScriptCreatorMultiGraph(AbstractGraph): """ - ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list of URLs generating web scraping scripts. + ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list + of URLs generating web scraping scripts. It only requires a user prompt and a list of URLs. Attributes: prompt (str): The user prompt to search the internet. 
diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 0c0f1104..080aaf19 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -16,8 +16,6 @@ from ..nodes import ( MergeAnswersNode ) - - class SearchGraph(AbstractGraph): """ SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index c9521497..3898e4a9 100644 --- a/scrapegraphai/graphs/search_link_graph.py +++ b/scrapegraphai/graphs/search_link_graph.py @@ -4,13 +4,13 @@ import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - - from ..nodes import ( FetchNode, ParseNode, SearchLinkNode ) class SearchLinkGraph(AbstractGraph): """ - SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model to interpret and answer prompts. + SearchLinkGraph is a scraping pipeline that automates the process of + extracting information from web pages using a natural language model + to interpret and answer prompts. Attributes: prompt (str): The prompt for the graph. 
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index cb4777a8..aa83c23b 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -14,7 +14,6 @@ from ..nodes import ( GenerateAnswerNode ) - class SmartScraperGraph(AbstractGraph): """ SmartScraper is a scraping pipeline that automates the process of diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 84e028fc..66d53851 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -15,10 +15,10 @@ from ..nodes import ( MergeAnswersNode ) - class SmartScraperMultiGraph(AbstractGraph): """ - SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + SmartScraperMultiGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. 
Attributes: @@ -43,7 +43,8 @@ class SmartScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) @@ -51,7 +52,7 @@ class SmartScraperMultiGraph(AbstractGraph): self.copy_config = copy(config) else: self.copy_config = deepcopy(config) - + self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index d1d6f94b..8d77621a 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -18,10 +18,10 @@ from ..nodes import ( from ..utils.save_audio_from_bytes import save_audio_from_bytes from ..models import OpenAITextToSpeech - class SpeechGraph(AbstractGraph): """ - SpeechyGraph is a scraping pipeline that scrapes the web, provide an answer to a given prompt, and generate an audio file. + SpeechyGraph is a scraping pipeline that scrapes the web, provide an answer + to a given prompt, and generate an audio file. Attributes: prompt (str): The prompt for the graph. 
diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 24b1ff0d..e0a149eb 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -13,7 +13,6 @@ from ..nodes import ( GenerateAnswerNode ) - class XMLScraperGraph(AbstractGraph): """ XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index a6f90bea..648db500 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -15,7 +15,6 @@ from ..nodes import ( MergeAnswersNode ) - class XMLScraperMultiGraph(AbstractGraph): """ XMLScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 6a1aad4e..2a12c987 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -32,7 +32,10 @@ class ConditionalNode(BaseNode): """ Initializes an empty ConditionalNode. 
""" - pass + + #super().__init__(node_name, "node", input, output, 2, node_config) + pass + def execute(self, state: dict) -> dict: """ diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 55b8b5f3..10ff786e 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -1,17 +1,12 @@ """ GenerateAnswerNode Module """ - -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm from langchain_community.chat_models import ChatOllama -# Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 733898bd..fbd47a34 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -4,16 +4,11 @@ GenerateScraperNode Module # Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser, JsonOutputParser from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode - class GenerateScraperNode(BaseNode): """ Generates a python script for scraping a website using the specified library. 
diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 6ce4bdaf..db7d8f02 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -11,7 +11,6 @@ from .base_node import BaseNode DEFAULT_BATCHSIZE = 16 - class GraphIteratorNode(BaseNode): """ A node responsible for instantiating and running multiple graph instances in parallel. diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index eaea0184..5bfee267 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -9,7 +9,6 @@ from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import template_combined - class MergeAnswersNode(BaseNode): """ A node responsible for merging the answers from multiple graph instances into a single answer. diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py index 8c8eaecd..bf8f7f4a 100644 --- a/scrapegraphai/nodes/merge_generated_scripts.py +++ b/scrapegraphai/nodes/merge_generated_scripts.py @@ -10,7 +10,6 @@ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser from ..utils.logging import get_logger from .base_node import BaseNode - class MergeGeneratedScriptsNode(BaseNode): """ A node responsible for merging scripts generated. diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 59471de1..48741085 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -9,7 +9,6 @@ from langchain_core.documents import Document from ..utils.logging import get_logger from .base_node import BaseNode - class ParseNode(BaseNode): """ A node responsible for parsing HTML content from a document. 
@@ -91,7 +90,7 @@ class ParseNode(BaseNode): chunk_size=self.node_config.get("chunk_size", 4096)-250, token_counter=lambda text: len(text.split()), memoize=False) - + state.update({self.output[0]: chunks}) return state diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 952daa6c..fcacac99 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -13,7 +13,6 @@ from langchain.retrievers.document_compressors import ( ) from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS - from langchain_community.chat_models import ChatOllama from langchain_aws import BedrockEmbeddings, ChatBedrock from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings @@ -23,7 +22,6 @@ from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings from langchain_fireworks import FireworksEmbeddings, ChatFireworks from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA - from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import models_tokens diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index a2bea856..8a0fc269 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -7,20 +7,23 @@ from urllib.parse import urljoin def cleanup_html(html_content: str, base_url: str) -> str: """ - Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. + Processes HTML content by removing unnecessary tags, + minifying the HTML, and extracting the title and body content. Args: html_content (str): The HTML content to be processed. Returns: - str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so. 
+ str: A string combining the parsed title and the minified body content. + If no body content is found, it indicates so. Example: >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>" >>> remover(html_content) 'Title: Example, Body: <body><p>Hello World!</p></body>
' - This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. + This function is particularly useful for preparing HTML content for + environments where bandwidth usage needs to be minimized. """ soup = BeautifulSoup(html_content, 'html.parser') @@ -55,4 +58,5 @@ def cleanup_html(html_content: str, base_url: str) -> str: return title, minimized_body, link_urls, image_urls else: - raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}") + raise ValueError(f"""No HTML body content found, please try setting the 'headless' + flag to False in the graph configuration. HTML content: {html_content}""") diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py index be001d06..44897c7c 100644 --- a/scrapegraphai/utils/convert_to_csv.py +++ b/scrapegraphai/utils/convert_to_csv.py @@ -5,7 +5,6 @@ import os import sys import pandas as pd - def convert_to_csv(data: dict, filename: str, position: str = None) -> None: """ Converts a dictionary to a CSV file and saves it at a specified location. diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py index 7cf12c53..57618fc1 100644 --- a/scrapegraphai/utils/convert_to_json.py +++ b/scrapegraphai/utils/convert_to_json.py @@ -5,7 +5,6 @@ import json import os import sys - def convert_to_json(data: dict, filename: str, position: str = None) -> None: """ Converts a dictionary to a JSON file and saves it at a specified location. 
diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index 4c22d35b..1db7f037 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -27,5 +27,5 @@ def convert_to_md(html: str, url: str = None) -> str: parsed_url = urlparse(url) domain = f"{parsed_url.scheme}://{parsed_url.netloc}" h.baseurl = domain - + return h.handle(html) diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py index afb63c52..b40c2cd8 100644 --- a/scrapegraphai/utils/logging.py +++ b/scrapegraphai/utils/logging.py @@ -17,7 +17,6 @@ _default_logging_level = logging.WARNING _semaphore = threading.Lock() - def _get_library_root_logger() -> logging.Logger: return logging.getLogger(_library_name) diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py index 6bda73c6..8cfef81a 100644 --- a/scrapegraphai/utils/prettify_exec_info.py +++ b/scrapegraphai/utils/prettify_exec_info.py @@ -1,7 +1,6 @@ """ Prettify the execution information of the graph. """ - import pandas as pd diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 6f6019e9..586e640e 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -10,7 +10,6 @@ import requests from fp.errors import FreeProxyException from fp.fp import FreeProxy - class ProxyBrokerCriteria(TypedDict, total=False): """proxy broker criteria""" diff --git a/scrapegraphai/utils/save_audio_from_bytes.py b/scrapegraphai/utils/save_audio_from_bytes.py index 3027e4e8..2bad3106 100644 --- a/scrapegraphai/utils/save_audio_from_bytes.py +++ b/scrapegraphai/utils/save_audio_from_bytes.py @@ -11,7 +11,8 @@ def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) - Args: byte_response (bytes): The byte array containing audio data. - output_path (Union[str, Path]): The destination file path where the audio file will be saved. 
+ output_path (Union[str, Path]): The destination + file path where the audio file will be saved. Example: >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3') diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 19b0d29a..4d1511a2 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -10,7 +10,6 @@ import importlib.util # noqa: F401 if typing.TYPE_CHECKING: import types - def srcfile_import(modpath: str, modname: str) -> "types.ModuleType": """imports a python module from its srcfile