From 127227349915deeb0dede34aa575ad269ed7cbe3 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 9 Aug 2024 17:35:43 +0200 Subject: [PATCH 1/5] fix: broken node --- scrapegraphai/nodes/conditional_node.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 0bf84766..0a46684b 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -41,7 +41,8 @@ class ConditionalNode(BaseNode): key_name (str): The name of the key to check in the state. """ - super().__init__(node_name, "node", input, output, 2, node_config) + #super().__init__(node_name, "node", input, output, 2, node_config) + def execute(self, state: dict) -> dict: From b470d974cf3fdb3a75ead46fceb8c21525e2e616 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Fri, 9 Aug 2024 15:37:24 +0000 Subject: [PATCH 2/5] ci(release): 1.13.0-beta.8 [skip ci] ## [1.13.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.7...v1.13.0-beta.8) (2024-08-09) ### Bug Fixes * broken node ([1272273](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/127227349915deeb0dede34aa575ad269ed7cbe3)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5aa6c032..776660d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.13.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.7...v1.13.0-beta.8) (2024-08-09) + + +### Bug Fixes + +* broken node ([1272273](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/127227349915deeb0dede34aa575ad269ed7cbe3)) + ## [1.13.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.6...v1.13.0-beta.7) (2024-08-09) diff --git a/pyproject.toml b/pyproject.toml index 866c3a4a..cd985243 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = 
"scrapegraphai" -version = "1.13.0b7" +version = "1.13.0b8" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." From 6c3b37ab001b80c09ea9ffb56d4c3df338e33a7a Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 10 Aug 2024 11:51:37 +0200 Subject: [PATCH 3/5] feat: add refactoring of default temperature --- examples/local_models/smart_scraper_ollama.py | 1 - scrapegraphai/graphs/abstract_graph.py | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index b161cd0f..d5585ff7 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -14,7 +14,6 @@ graph_config = { "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "verbose": True, "headless": False } diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 83b532bc..2ccf14b2 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -53,6 +53,9 @@ class AbstractGraph(ABC): def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[BaseModel] = None): + if config.get("llm").get("temperature") is None: + config["llm"]["temperature"] = 0 + self.prompt = prompt self.source = source self.config = config @@ -212,7 +215,7 @@ class AbstractGraph(ABC): print("model not found, using default token size (8192)") self.model_token = 8192 return ErnieBotChat(llm_params) - + if "oneapi" in llm_params["model"]: # take the model after the last dash llm_params["model"] = llm_params["model"].split("/")[-1] @@ -221,7 +224,7 @@ class AbstractGraph(ABC): except KeyError as exc: raise KeyError("Model not supported") from exc return OneApi(llm_params) - + if "nvidia" in 
llm_params["model"]: try: self.model_token = models_tokens["nvidia"][llm_params["model"].split("/")[-1]] From d4c1a1c58a54740ff50aa87b1d1d3500b61ea088 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sat, 10 Aug 2024 10:34:05 +0000 Subject: [PATCH 4/5] ci(release): 1.13.0-beta.9 [skip ci] ## [1.13.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.8...v1.13.0-beta.9) (2024-08-10) ### Features * add refactoring of default temperature ([6c3b37a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6c3b37ab001b80c09ea9ffb56d4c3df338e33a7a)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 776660d8..815258c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.13.0-beta.9](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.8...v1.13.0-beta.9) (2024-08-10) + + +### Features + +* add refactoring of default temperature ([6c3b37a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6c3b37ab001b80c09ea9ffb56d4c3df338e33a7a)) + ## [1.13.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.13.0-beta.7...v1.13.0-beta.8) (2024-08-09) diff --git a/pyproject.toml b/pyproject.toml index cd985243..ac89384e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.13.0b8" +version = "1.13.0b9" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
From 8b2c266affc77f4d4d9a0ec4b56fc01e92849eb4 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 10 Aug 2024 17:44:35 +0200 Subject: [PATCH 5/5] refactoring of the code Co-Authored-By: Matteo Vedovati <68272450+vedovati-matteo@users.noreply.github.com> --- examples/local_models/smart_scraper_ollama.py | 5 +++++ scrapegraphai/graphs/abstract_graph.py | 4 ---- scrapegraphai/graphs/base_graph.py | 4 +--- scrapegraphai/graphs/csv_scraper_graph.py | 3 --- scrapegraphai/graphs/csv_scraper_multi_graph.py | 10 ++++------ scrapegraphai/graphs/deep_scraper_graph.py | 5 ----- scrapegraphai/graphs/json_scraper_graph.py | 3 --- scrapegraphai/graphs/json_scraper_multi_graph.py | 6 ++---- scrapegraphai/graphs/markdown_scraper_multi_graph.py | 3 --- scrapegraphai/graphs/omni_scraper_graph.py | 3 --- scrapegraphai/graphs/pdf_scraper_graph.py | 3 --- scrapegraphai/graphs/pdf_scraper_multi_graph.py | 3 --- scrapegraphai/graphs/script_creator_graph.py | 3 --- scrapegraphai/graphs/script_creator_multi_graph.py | 4 ++-- scrapegraphai/graphs/search_graph.py | 2 -- scrapegraphai/graphs/search_link_graph.py | 6 +++--- scrapegraphai/graphs/smart_scraper_graph.py | 1 - scrapegraphai/graphs/smart_scraper_multi_graph.py | 9 +++++---- scrapegraphai/graphs/speech_graph.py | 4 ++-- scrapegraphai/graphs/xml_scraper_graph.py | 1 - scrapegraphai/graphs/xml_scraper_multi_graph.py | 1 - scrapegraphai/nodes/conditional_node.py | 8 ++------ scrapegraphai/nodes/generate_answer_omni_node.py | 5 ----- scrapegraphai/nodes/generate_scraper_node.py | 5 ----- scrapegraphai/nodes/graph_iterator_node.py | 1 - scrapegraphai/nodes/merge_answers_node.py | 1 - scrapegraphai/nodes/merge_generated_scripts.py | 1 - scrapegraphai/nodes/parse_node.py | 3 +-- scrapegraphai/nodes/rag_node.py | 2 -- scrapegraphai/utils/cleanup_html.py | 12 ++++++++---- scrapegraphai/utils/convert_to_csv.py | 1 - scrapegraphai/utils/convert_to_json.py | 1 - scrapegraphai/utils/convert_to_md.py | 2 +- scrapegraphai/utils/logging.py | 
1 - scrapegraphai/utils/prettify_exec_info.py | 1 - scrapegraphai/utils/proxy_rotation.py | 1 - scrapegraphai/utils/save_audio_from_bytes.py | 3 ++- scrapegraphai/utils/sys_dynamic_import.py | 1 - 38 files changed, 38 insertions(+), 94 deletions(-) diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py index d5585ff7..14fe622f 100644 --- a/examples/local_models/smart_scraper_ollama.py +++ b/examples/local_models/smart_scraper_ollama.py @@ -22,8 +22,13 @@ graph_config = { # Create the SmartScraperGraph instance and run it # ************************************************ smart_scraper_graph = SmartScraperGraph( +<<<<<<< Updated upstream prompt="Find some information about what does the company do, the name and a contact email.", source="https://scrapegraphai.com/", +======= + prompt="List all the projects with their descriptions", + source="https://perinim.github.io/projects/", +>>>>>>> Stashed changes config=graph_config ) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 2ccf14b2..f22f764c 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -7,11 +7,9 @@ from typing import Optional import uuid import warnings from pydantic import BaseModel - from langchain_community.chat_models import ErnieBotChat from langchain_nvidia_ai_endpoints import ChatNVIDIA from langchain.chat_models import init_chat_model - from ..helpers import models_tokens from ..models import ( OneApi, @@ -19,8 +17,6 @@ from ..models import ( ) from ..utils.logging import set_verbosity_warning, set_verbosity_info - - class AbstractGraph(ABC): """ Scaffolding class for creating a graph representation and executing it. 
diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index c441f7ab..f442ac21 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -6,9 +6,7 @@ import warnings from typing import Tuple from langchain_community.callbacks import get_openai_callback from ..integrations import BurrBridge - -# Import telemetry functions -from ..telemetry import log_graph_execution, log_event +from ..telemetry import log_graph_execution class BaseGraph: """ diff --git a/scrapegraphai/graphs/csv_scraper_graph.py b/scrapegraphai/graphs/csv_scraper_graph.py index d7ec186e..42153be5 100644 --- a/scrapegraphai/graphs/csv_scraper_graph.py +++ b/scrapegraphai/graphs/csv_scraper_graph.py @@ -4,16 +4,13 @@ Module for creating the smart scraper from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, GenerateAnswerCSVNode ) - class CSVScraperGraph(AbstractGraph): """ SmartScraper is a comprehensive web scraping tool that automates the process of extracting diff --git a/scrapegraphai/graphs/csv_scraper_multi_graph.py b/scrapegraphai/graphs/csv_scraper_multi_graph.py index 716e9aca..808549aa 100644 --- a/scrapegraphai/graphs/csv_scraper_multi_graph.py +++ b/scrapegraphai/graphs/csv_scraper_multi_graph.py @@ -4,22 +4,19 @@ CSVScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional - from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .csv_scraper_graph import CSVScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class CSVScraperMultiGraph(AbstractGraph): """ - CSVScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + CSVScraperMultiGraph is a scraping pipeline that + scrapes a list of URLs and generates answers to a given prompt. 
It only requires a user prompt and a list of URLs. Attributes: @@ -44,7 +41,8 @@ class CSVScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py index 43a461d0..ca617d19 100644 --- a/scrapegraphai/graphs/deep_scraper_graph.py +++ b/scrapegraphai/graphs/deep_scraper_graph.py @@ -4,10 +4,8 @@ DeepScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, SearchLinkNode, @@ -18,7 +16,6 @@ from ..nodes import ( MergeAnswersNode ) - class DeepScraperGraph(AbstractGraph): """ [WIP] @@ -87,7 +84,6 @@ class DeepScraperGraph(AbstractGraph): output=["relevant_chunks"], node_config={ "llm_model": self.llm_model, - "embedder_model": self.embedder_model } ) generate_answer_node = GenerateAnswerNode( @@ -104,7 +100,6 @@ class DeepScraperGraph(AbstractGraph): output=["relevant_links"], node_config={ "llm_model": self.llm_model, - "embedder_model": self.embedder_model } ) graph_iterator_node = GraphIteratorNode( diff --git a/scrapegraphai/graphs/json_scraper_graph.py b/scrapegraphai/graphs/json_scraper_graph.py index fe54ebec..a23c1f38 100644 --- a/scrapegraphai/graphs/json_scraper_graph.py +++ b/scrapegraphai/graphs/json_scraper_graph.py @@ -4,16 +4,13 @@ JSONScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, GenerateAnswerNode ) - class JSONScraperGraph(AbstractGraph): """ JSONScraperGraph defines a scraping pipeline for JSON files. 
diff --git a/scrapegraphai/graphs/json_scraper_multi_graph.py b/scrapegraphai/graphs/json_scraper_multi_graph.py index 48fd8217..da7f33ba 100644 --- a/scrapegraphai/graphs/json_scraper_multi_graph.py +++ b/scrapegraphai/graphs/json_scraper_multi_graph.py @@ -5,20 +5,18 @@ JSONScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .json_scraper_graph import JSONScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class JSONScraperMultiGraph(AbstractGraph): """ - JSONScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + JSONScraperMultiGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. Attributes: diff --git a/scrapegraphai/graphs/markdown_scraper_multi_graph.py b/scrapegraphai/graphs/markdown_scraper_multi_graph.py index ec47f74d..e59f6e5a 100644 --- a/scrapegraphai/graphs/markdown_scraper_multi_graph.py +++ b/scrapegraphai/graphs/markdown_scraper_multi_graph.py @@ -5,17 +5,14 @@ MDScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .markdown_scraper_graph import MDScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class MDScraperMultiGraph(AbstractGraph): """ MDScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py index 1965dc04..6849ee12 100644 --- a/scrapegraphai/graphs/omni_scraper_graph.py +++ b/scrapegraphai/graphs/omni_scraper_graph.py @@ -4,17 +4,14 @@ OmniScraperGraph Module from typing import Optional from pydantic import BaseModel - 
from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, ImageToTextNode, GenerateAnswerOmniNode ) - from ..models import OpenAIImageToText class OmniScraperGraph(AbstractGraph): diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py index 049425d0..ae783aba 100644 --- a/scrapegraphai/graphs/pdf_scraper_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_graph.py @@ -5,17 +5,14 @@ PDFScraperGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( FetchNode, ParseNode, GenerateAnswerPDFNode ) - class PDFScraperGraph(AbstractGraph): """ PDFScraperGraph is a scraping pipeline that extracts information from pdf files using a natural diff --git a/scrapegraphai/graphs/pdf_scraper_multi_graph.py b/scrapegraphai/graphs/pdf_scraper_multi_graph.py index f9b3061b..6803e27a 100644 --- a/scrapegraphai/graphs/pdf_scraper_multi_graph.py +++ b/scrapegraphai/graphs/pdf_scraper_multi_graph.py @@ -5,17 +5,14 @@ PdfScraperMultiGraph Module from copy import copy, deepcopy from typing import List, Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph from .pdf_scraper_graph import PDFScraperGraph - from ..nodes import ( GraphIteratorNode, MergeAnswersNode ) - class PdfScraperMultiGraph(AbstractGraph): """ PdfScraperMultiGraph is a scraping pipeline that scrapes a diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py index a4d1d6f6..bb5629c5 100644 --- a/scrapegraphai/graphs/script_creator_graph.py +++ b/scrapegraphai/graphs/script_creator_graph.py @@ -4,17 +4,14 @@ ScriptCreatorGraph Module from typing import Optional from pydantic import BaseModel - from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - from ..nodes import ( 
FetchNode, ParseNode, GenerateScraperNode ) - class ScriptCreatorGraph(AbstractGraph): """ ScriptCreatorGraph defines a scraping pipeline for generating web scraping scripts. diff --git a/scrapegraphai/graphs/script_creator_multi_graph.py b/scrapegraphai/graphs/script_creator_multi_graph.py index 0bafd561..969ba722 100644 --- a/scrapegraphai/graphs/script_creator_multi_graph.py +++ b/scrapegraphai/graphs/script_creator_multi_graph.py @@ -16,10 +16,10 @@ from ..nodes import ( MergeGeneratedScriptsNode ) - class ScriptCreatorMultiGraph(AbstractGraph): """ - ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list of URLs generating web scraping scripts. + ScriptCreatorMultiGraph is a scraping pipeline that scrapes a list + of URLs generating web scraping scripts. It only requires a user prompt and a list of URLs. Attributes: prompt (str): The user prompt to search the internet. diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index 0c0f1104..080aaf19 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -16,8 +16,6 @@ from ..nodes import ( MergeAnswersNode ) - - class SearchGraph(AbstractGraph): """ SearchGraph is a scraping pipeline that searches the internet for answers to a given prompt. diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py index c9521497..3898e4a9 100644 --- a/scrapegraphai/graphs/search_link_graph.py +++ b/scrapegraphai/graphs/search_link_graph.py @@ -4,13 +4,13 @@ import logging from pydantic import BaseModel from .base_graph import BaseGraph from .abstract_graph import AbstractGraph - - from ..nodes import ( FetchNode, ParseNode, SearchLinkNode ) class SearchLinkGraph(AbstractGraph): """ - SearchLinkGraph is a scraping pipeline that automates the process of extracting information from web pages using a natural language model to interpret and answer prompts. 
+ SearchLinkGraph is a scraping pipeline that automates the process of + extracting information from web pages using a natural language model + to interpret and answer prompts. Attributes: prompt (str): The prompt for the graph. diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index cb4777a8..aa83c23b 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -14,7 +14,6 @@ from ..nodes import ( GenerateAnswerNode ) - class SmartScraperGraph(AbstractGraph): """ SmartScraper is a scraping pipeline that automates the process of diff --git a/scrapegraphai/graphs/smart_scraper_multi_graph.py b/scrapegraphai/graphs/smart_scraper_multi_graph.py index 84e028fc..66d53851 100644 --- a/scrapegraphai/graphs/smart_scraper_multi_graph.py +++ b/scrapegraphai/graphs/smart_scraper_multi_graph.py @@ -15,10 +15,10 @@ from ..nodes import ( MergeAnswersNode ) - class SmartScraperMultiGraph(AbstractGraph): """ - SmartScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and generates answers to a given prompt. + SmartScraperMultiGraph is a scraping pipeline that scrapes a + list of URLs and generates answers to a given prompt. It only requires a user prompt and a list of URLs. 
Attributes: @@ -43,7 +43,8 @@ class SmartScraperMultiGraph(AbstractGraph): >>> result = search_graph.run() """ - def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None): + def __init__(self, prompt: str, source: List[str], + config: dict, schema: Optional[BaseModel] = None): self.max_results = config.get("max_results", 3) @@ -51,7 +52,7 @@ class SmartScraperMultiGraph(AbstractGraph): self.copy_config = copy(config) else: self.copy_config = deepcopy(config) - + self.copy_schema = deepcopy(schema) super().__init__(prompt, config, source, schema) diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py index d1d6f94b..8d77621a 100644 --- a/scrapegraphai/graphs/speech_graph.py +++ b/scrapegraphai/graphs/speech_graph.py @@ -18,10 +18,10 @@ from ..nodes import ( from ..utils.save_audio_from_bytes import save_audio_from_bytes from ..models import OpenAITextToSpeech - class SpeechGraph(AbstractGraph): """ - SpeechyGraph is a scraping pipeline that scrapes the web, provide an answer to a given prompt, and generate an audio file. + SpeechGraph is a scraping pipeline that scrapes the web, provides an answer + to a given prompt, and generates an audio file. Attributes: prompt (str): The prompt for the graph.
diff --git a/scrapegraphai/graphs/xml_scraper_graph.py b/scrapegraphai/graphs/xml_scraper_graph.py index 24b1ff0d..e0a149eb 100644 --- a/scrapegraphai/graphs/xml_scraper_graph.py +++ b/scrapegraphai/graphs/xml_scraper_graph.py @@ -13,7 +13,6 @@ from ..nodes import ( GenerateAnswerNode ) - class XMLScraperGraph(AbstractGraph): """ XMLScraperGraph is a scraping pipeline that extracts information from XML files using a natural diff --git a/scrapegraphai/graphs/xml_scraper_multi_graph.py b/scrapegraphai/graphs/xml_scraper_multi_graph.py index a6f90bea..648db500 100644 --- a/scrapegraphai/graphs/xml_scraper_multi_graph.py +++ b/scrapegraphai/graphs/xml_scraper_multi_graph.py @@ -15,7 +15,6 @@ from ..nodes import ( MergeAnswersNode ) - class XMLScraperMultiGraph(AbstractGraph): """ XMLScraperMultiGraph is a scraping pipeline that scrapes a list of URLs and diff --git a/scrapegraphai/nodes/conditional_node.py b/scrapegraphai/nodes/conditional_node.py index 0a46684b..85a4f8ef 100644 --- a/scrapegraphai/nodes/conditional_node.py +++ b/scrapegraphai/nodes/conditional_node.py @@ -42,7 +42,7 @@ class ConditionalNode(BaseNode): """ #super().__init__(node_name, "node", input, output, 2, node_config) - + pass def execute(self, state: dict) -> dict: @@ -56,8 +56,4 @@ class ConditionalNode(BaseNode): str: The name of the next node to execute based on the presence of the key. 
""" - if self.key_name in state and len(state[self.key_name]) > 0: - state["next_node"] = 0 - else: - state["next_node"] = 1 - return state + pass diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 55b8b5f3..10ff786e 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -1,17 +1,12 @@ """ GenerateAnswerNode Module """ - -# Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from tqdm import tqdm from langchain_community.chat_models import ChatOllama -# Imports from the library from .base_node import BaseNode from ..helpers.generate_answer_node_omni_prompts import template_no_chunk_omni, template_chunks_omni, template_merge_omni diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 733898bd..fbd47a34 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -4,16 +4,11 @@ GenerateScraperNode Module # Imports from standard library from typing import List, Optional - -# Imports from Langchain from langchain.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser, JsonOutputParser from ..utils.logging import get_logger - -# Imports from the library from .base_node import BaseNode - class GenerateScraperNode(BaseNode): """ Generates a python script for scraping a website using the specified library. 
diff --git a/scrapegraphai/nodes/graph_iterator_node.py b/scrapegraphai/nodes/graph_iterator_node.py index 6ce4bdaf..db7d8f02 100644 --- a/scrapegraphai/nodes/graph_iterator_node.py +++ b/scrapegraphai/nodes/graph_iterator_node.py @@ -11,7 +11,6 @@ from .base_node import BaseNode DEFAULT_BATCHSIZE = 16 - class GraphIteratorNode(BaseNode): """ A node responsible for instantiating and running multiple graph instances in parallel. diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index eaea0184..5bfee267 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -9,7 +9,6 @@ from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import template_combined - class MergeAnswersNode(BaseNode): """ A node responsible for merging the answers from multiple graph instances into a single answer. diff --git a/scrapegraphai/nodes/merge_generated_scripts.py b/scrapegraphai/nodes/merge_generated_scripts.py index 8c8eaecd..bf8f7f4a 100644 --- a/scrapegraphai/nodes/merge_generated_scripts.py +++ b/scrapegraphai/nodes/merge_generated_scripts.py @@ -10,7 +10,6 @@ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser from ..utils.logging import get_logger from .base_node import BaseNode - class MergeGeneratedScriptsNode(BaseNode): """ A node responsible for merging scripts generated. diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index 59471de1..48741085 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -9,7 +9,6 @@ from langchain_core.documents import Document from ..utils.logging import get_logger from .base_node import BaseNode - class ParseNode(BaseNode): """ A node responsible for parsing HTML content from a document. 
@@ -91,7 +90,7 @@ class ParseNode(BaseNode): chunk_size=self.node_config.get("chunk_size", 4096)-250, token_counter=lambda text: len(text.split()), memoize=False) - + state.update({self.output[0]: chunks}) return state diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 952daa6c..fcacac99 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -13,7 +13,6 @@ from langchain.retrievers.document_compressors import ( ) from langchain_community.document_transformers import EmbeddingsRedundantFilter from langchain_community.vectorstores import FAISS - from langchain_community.chat_models import ChatOllama from langchain_aws import BedrockEmbeddings, ChatBedrock from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings @@ -23,7 +22,6 @@ from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings from langchain_fireworks import FireworksEmbeddings, ChatFireworks from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA - from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import models_tokens diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index a2bea856..8a0fc269 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -7,20 +7,23 @@ from urllib.parse import urljoin def cleanup_html(html_content: str, base_url: str) -> str: """ - Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. + Processes HTML content by removing unnecessary tags, + minifying the HTML, and extracting the title and body content. Args: html_content (str): The HTML content to be processed. Returns: - str: A string combining the parsed title and the minified body content. If no body content is found, it indicates so. 
+ str: A string combining the parsed title and the minified body content. + If no body content is found, it indicates so. Example: >>> html_content = "Example

Hello World!

" >>> remover(html_content) 'Title: Example, Body:

Hello World!

' - This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized. + This function is particularly useful for preparing HTML content for + environments where bandwidth usage needs to be minimized. """ soup = BeautifulSoup(html_content, 'html.parser') @@ -55,4 +58,5 @@ def cleanup_html(html_content: str, base_url: str) -> str: return title, minimized_body, link_urls, image_urls else: - raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}") + raise ValueError(f"""No HTML body content found, please try setting the 'headless' + flag to False in the graph configuration. HTML content: {html_content}""") diff --git a/scrapegraphai/utils/convert_to_csv.py b/scrapegraphai/utils/convert_to_csv.py index be001d06..44897c7c 100644 --- a/scrapegraphai/utils/convert_to_csv.py +++ b/scrapegraphai/utils/convert_to_csv.py @@ -5,7 +5,6 @@ import os import sys import pandas as pd - def convert_to_csv(data: dict, filename: str, position: str = None) -> None: """ Converts a dictionary to a CSV file and saves it at a specified location. diff --git a/scrapegraphai/utils/convert_to_json.py b/scrapegraphai/utils/convert_to_json.py index 7cf12c53..57618fc1 100644 --- a/scrapegraphai/utils/convert_to_json.py +++ b/scrapegraphai/utils/convert_to_json.py @@ -5,7 +5,6 @@ import json import os import sys - def convert_to_json(data: dict, filename: str, position: str = None) -> None: """ Converts a dictionary to a JSON file and saves it at a specified location. 
diff --git a/scrapegraphai/utils/convert_to_md.py b/scrapegraphai/utils/convert_to_md.py index 4c22d35b..1db7f037 100644 --- a/scrapegraphai/utils/convert_to_md.py +++ b/scrapegraphai/utils/convert_to_md.py @@ -27,5 +27,5 @@ def convert_to_md(html: str, url: str = None) -> str: parsed_url = urlparse(url) domain = f"{parsed_url.scheme}://{parsed_url.netloc}" h.baseurl = domain - + return h.handle(html) diff --git a/scrapegraphai/utils/logging.py b/scrapegraphai/utils/logging.py index afb63c52..b40c2cd8 100644 --- a/scrapegraphai/utils/logging.py +++ b/scrapegraphai/utils/logging.py @@ -17,7 +17,6 @@ _default_logging_level = logging.WARNING _semaphore = threading.Lock() - def _get_library_root_logger() -> logging.Logger: return logging.getLogger(_library_name) diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py index 6bda73c6..8cfef81a 100644 --- a/scrapegraphai/utils/prettify_exec_info.py +++ b/scrapegraphai/utils/prettify_exec_info.py @@ -1,7 +1,6 @@ """ Prettify the execution information of the graph. """ - import pandas as pd diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py index 6f6019e9..586e640e 100644 --- a/scrapegraphai/utils/proxy_rotation.py +++ b/scrapegraphai/utils/proxy_rotation.py @@ -10,7 +10,6 @@ import requests from fp.errors import FreeProxyException from fp.fp import FreeProxy - class ProxyBrokerCriteria(TypedDict, total=False): """proxy broker criteria""" diff --git a/scrapegraphai/utils/save_audio_from_bytes.py b/scrapegraphai/utils/save_audio_from_bytes.py index 3027e4e8..2bad3106 100644 --- a/scrapegraphai/utils/save_audio_from_bytes.py +++ b/scrapegraphai/utils/save_audio_from_bytes.py @@ -11,7 +11,8 @@ def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) - Args: byte_response (bytes): The byte array containing audio data. - output_path (Union[str, Path]): The destination file path where the audio file will be saved. 
+ output_path (Union[str, Path]): The destination + file path where the audio file will be saved. Example: >>> save_audio_from_bytes(b'audio data', 'path/to/audio.mp3') diff --git a/scrapegraphai/utils/sys_dynamic_import.py b/scrapegraphai/utils/sys_dynamic_import.py index 19b0d29a..4d1511a2 100644 --- a/scrapegraphai/utils/sys_dynamic_import.py +++ b/scrapegraphai/utils/sys_dynamic_import.py @@ -10,7 +10,6 @@ import importlib.util # noqa: F401 if typing.TYPE_CHECKING: import types - def srcfile_import(modpath: str, modname: str) -> "types.ModuleType": """imports a python module from its srcfile