From a28ca993de6430660d0e85c36c5b180e5fe01c56 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Wed, 27 Mar 2024 21:41:21 +0100 Subject: [PATCH 01/12] refactoring of examples folder --- examples/ScrapeGraphAI_generated_graph | 19 ++++++ .../custom_graph_gemini.py | 0 .../custom_graph_openai.py | 0 .../graph_builder_example.py | 0 .../graph_evaluation_example.py | 0 .../plain_html_example.txt | 0 examples/{graph_examples => }/result.csv | 0 examples/{graph_examples => }/result.json | 0 .../{graph_examples => }/scrape_plain_text.py | 0 .../smart_scraper_example.py | 0 .../speech_graph_example.py | 0 examples/utils/convert_to_csv.example.py | 39 ------------- examples/utils/convert_to_json_example.py | 28 --------- examples/utils/remover_example.py | 21 ------- .../utils/save_audio_from_bytes_example.py | 10 ---- examples/utils/token_calculator_example.py | 14 ----- examples/utils/vision_speech_example.py | 46 --------------- scrapegraphai/utils/test_node.py | 58 ------------------- 18 files changed, 19 insertions(+), 216 deletions(-) create mode 100644 examples/ScrapeGraphAI_generated_graph rename examples/{graph_examples => }/custom_graph_gemini.py (100%) rename examples/{graph_examples => }/custom_graph_openai.py (100%) rename examples/{graph_examples => }/graph_builder_example.py (100%) rename examples/{graph_examples => }/graph_evaluation_example.py (100%) rename examples/{graph_examples => }/plain_html_example.txt (100%) rename examples/{graph_examples => }/result.csv (100%) rename examples/{graph_examples => }/result.json (100%) rename examples/{graph_examples => }/scrape_plain_text.py (100%) rename examples/{graph_examples => }/smart_scraper_example.py (100%) rename examples/{graph_examples => }/speech_graph_example.py (100%) delete mode 100644 examples/utils/convert_to_csv.example.py delete mode 100644 examples/utils/convert_to_json_example.py delete mode 100644 examples/utils/remover_example.py delete mode 100644 examples/utils/save_audio_from_bytes_example.py delete mode 100644 examples/utils/token_calculator_example.py delete mode 100644 examples/utils/vision_speech_example.py delete mode 100644 scrapegraphai/utils/test_node.py diff --git a/examples/ScrapeGraphAI_generated_graph b/examples/ScrapeGraphAI_generated_graph new file mode 100644 index 00000000..acc3232c --- /dev/null +++ b/examples/ScrapeGraphAI_generated_graph @@ -0,0 +1,19 @@ +// ScrapeGraphAI Generated Graph +digraph { + node [color=lightblue2 style=filled] + FetchHTMLNode [shape=doublecircle] + GetProbableTagsNode + ParseNode + RAGNode + GenerateAnswerNode + ConditionalNode + ImageToTextNode + TextToSpeechNode + FetchHTMLNode -> GetProbableTagsNode + GetProbableTagsNode -> ParseNode + ParseNode -> RAGNode + RAGNode -> GenerateAnswerNode + RAGNode -> ConditionalNode + ConditionalNode -> ImageToTextNode + ConditionalNode -> TextToSpeechNode +} diff --git a/examples/graph_examples/custom_graph_gemini.py b/examples/custom_graph_gemini.py similarity index 100% rename from examples/graph_examples/custom_graph_gemini.py rename to examples/custom_graph_gemini.py diff --git a/examples/graph_examples/custom_graph_openai.py b/examples/custom_graph_openai.py similarity index 100% rename from examples/graph_examples/custom_graph_openai.py rename to examples/custom_graph_openai.py diff --git a/examples/graph_examples/graph_builder_example.py b/examples/graph_builder_example.py similarity index 100% rename from examples/graph_examples/graph_builder_example.py rename to examples/graph_builder_example.py diff --git a/examples/graph_examples/graph_evaluation_example.py b/examples/graph_evaluation_example.py similarity index 100% rename from examples/graph_examples/graph_evaluation_example.py rename to examples/graph_evaluation_example.py diff --git a/examples/graph_examples/plain_html_example.txt b/examples/plain_html_example.txt similarity index 100% rename from examples/graph_examples/plain_html_example.txt rename to examples/plain_html_example.txt diff --git a/examples/graph_examples/result.csv b/examples/result.csv similarity index 100% rename from examples/graph_examples/result.csv rename to examples/result.csv diff --git a/examples/graph_examples/result.json b/examples/result.json similarity index 100% rename from examples/graph_examples/result.json rename to examples/result.json diff --git a/examples/graph_examples/scrape_plain_text.py b/examples/scrape_plain_text.py similarity index 100% rename from examples/graph_examples/scrape_plain_text.py rename to examples/scrape_plain_text.py diff --git a/examples/graph_examples/smart_scraper_example.py b/examples/smart_scraper_example.py similarity index 100% rename from examples/graph_examples/smart_scraper_example.py rename to examples/smart_scraper_example.py diff --git a/examples/graph_examples/speech_graph_example.py b/examples/speech_graph_example.py similarity index 100% rename from examples/graph_examples/speech_graph_example.py rename to examples/speech_graph_example.py diff --git a/examples/utils/convert_to_csv.example.py b/examples/utils/convert_to_csv.example.py deleted file mode 100644 index 73f1efa9..00000000 --- a/examples/utils/convert_to_csv.example.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Teest for convert_to_csv -""" -import os -from scrapegraphai.utils.convert_to_csv import convert_to_csv - - -def main(): - """ - Example usage of the convert_to_csv function. - """ - # Example data - data = { - 'Name': ['John', 'Alice', 'Bob'], - 'Age': [30, 25, 35], - 'City': ['New York', 'San Francisco', 'Seattle'] - } - - # Example filename and position - filename = "example_data" - position = "./output" - - try: - # Convert data to CSV and save - convert_to_csv(data, filename, position) - print( - f"Data saved successfully to {os.path.join(position, filename)}.csv") - except ValueError as ve: - print(f"ValueError: {ve}") - except FileNotFoundError as fnfe: - print(f"FileNotFoundError: {fnfe}") - except PermissionError as pe: - print(f"PermissionError: {pe}") - except Exception as e: - print(f"An unexpected error occurred: {e}") - - -if __name__ == "__main__": - main() diff --git a/examples/utils/convert_to_json_example.py b/examples/utils/convert_to_json_example.py deleted file mode 100644 index 3859107c..00000000 --- a/examples/utils/convert_to_json_example.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -Example of using convert_to_json function to save data in JSON format. -""" -import os -from scrapegraphai.utils.convert_to_json import convert_to_json - -# Data to save in JSON format -data_to_save = { - "name": "John Doe", - "age": 30, - "city": "New York" -} - -FILENAME = "example_data" -DIRECTORY = "data_output" - -try: - convert_to_json(data_to_save, FILENAME, DIRECTORY) - print( - f"Data has been successfully saved to {os.path.join(DIRECTORY, FILENAME)}.json") -except ValueError as value_error: - print(value_error) -except FileNotFoundError as file_not_found_error: - print(file_not_found_error) -except PermissionError as permission_error: - print(permission_error) -except Exception as exception: - print(f"An error occurred: {exception}") diff --git a/examples/utils/remover_example.py b/examples/utils/remover_example.py deleted file mode 100644 index dbdca118..00000000 --- a/examples/utils/remover_example.py +++ /dev/null @@ -1,21 +0,0 @@ -""" -Example of the remover method -""" -from scrapegraphai.utils.remover import remover - -HTML_CONTENT = """ - - - Test Page - - -

This is a Test

-

Hello, World!

- - - -""" - -parsed_content = remover(HTML_CONTENT) - -print(parsed_content) diff --git a/examples/utils/save_audio_from_bytes_example.py b/examples/utils/save_audio_from_bytes_example.py deleted file mode 100644 index c915aad6..00000000 --- a/examples/utils/save_audio_from_bytes_example.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Example for th e file save_audio_from_bytes -""" -from scrapegraphai.utils.save_audio_from_bytes import save_audio_from_bytes - -BYTE_RESPONSE = b'\x12\x34\x56\x78\x90' - -OUTPUT_PATH = "generated_speech.wav" - -save_audio_from_bytes(BYTE_RESPONSE, OUTPUT_PATH) diff --git a/examples/utils/token_calculator_example.py b/examples/utils/token_calculator_example.py deleted file mode 100644 index 68ff25d2..00000000 --- a/examples/utils/token_calculator_example.py +++ /dev/null @@ -1,14 +0,0 @@ -""" -Example for calclating the tokenizer -""" -from scrapegraphai.utils.token_calculator import truncate_text_tokens - -INPUT_TEXT = "http://nba.com" - -MODEL_NAME = "gpt-3.5-turbo" -ENCODING_NAME = "EMBEDDING_ENCODING" - -tokenized_chunks = truncate_text_tokens(INPUT_TEXT, MODEL_NAME, ENCODING_NAME) - -for i, chunk in enumerate(tokenized_chunks): - print(f"Chunk {i+1}: {chunk}") diff --git a/examples/utils/vision_speech_example.py b/examples/utils/vision_speech_example.py deleted file mode 100644 index 5c90dc25..00000000 --- a/examples/utils/vision_speech_example.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.models import OpenAIImageToText, OpenAITextToSpeech -from scrapegraphai.utils import save_audio_from_bytes - -load_dotenv() -openai_key = os.getenv("OPENAI_APIKEY") - -# Define the configuration for the graph -config = { - "itt_model": { - "api_key": openai_key, - "model": "gpt-4-vision-preview", - }, - "tts_model": { - "api_key": openai_key, - "model": "tts-1", - "voice": "alloy" - }, -} - -itt_model = OpenAIImageToText(config["itt_model"]) -img_to_text_result = itt_model.run( - "https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/scrapegraphai_logo.png" - ) - -print(f"Image description: {img_to_text_result}") - -tts_model = OpenAITextToSpeech(config["tts_model"]) - -audio = tts_model.run( - img_to_text_result - ) - -# Save the audio to a file -file_name = "image_description.mp3" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -output_path = os.path.join(curr_dir, file_name) - -save_audio_from_bytes(audio, output_path) - -print(f"Audio file saved to: {output_path}") diff --git a/scrapegraphai/utils/test_node.py b/scrapegraphai/utils/test_node.py deleted file mode 100644 index 4abf744c..00000000 --- a/scrapegraphai/utils/test_node.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Module for making the tests -""" -import os -from dotenv import load_dotenv -from scrapegraphai.models import OpenAI -from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode - -load_dotenv() - -# Define the configuration for the language model -openai_key = os.getenv("OPENAI_APIKEY") - -llm_config = { - "api_key": openai_key, - "model_name": "gpt-3.5-turbo", - "temperature": 0, - "streaming": True -} -llm_model = OpenAI(llm_config) - -state = { - "user_prompt": "List me all the projects", - "url": "https://perinim.github.io/projects/", -} - -fetch_node = FetchNode( - input="url | local_dir", - output=["doc"], - node_name="fetch_html" -) - -updated_state = fetch_node.execute(state) -parse_node = ParseNode( - input="doc", - output=["parsed_doc"], - node_name="parse_document" -) - -updated_state = parse_node.execute(updated_state) - -rag_node = RAGNode( - input="user_prompt & (parsed_doc | doc)", - output=["relevant_chunks"], - model_config={"llm_model": llm_model}, - node_name="rag_node" -) - -updated_state = rag_node.execute(updated_state) - -generate_answer_node = GenerateAnswerNode( - input="user_prompt & (relevant_chunks | parsed_doc | doc)", - output=["answer"], - model_config={"llm_model": llm_model}, - node_name="generate_answer" -) - -print(generate_answer_node.execute(updated_state)) From 60fe3c5d2ac5686189ce2eb1b19d2d5dc98dd704 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:04:37 +0100 Subject: [PATCH 02/12] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 946e2b61..8fecf51b 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ graph_config = { # Create the SmartScraperGraph instance smart_scraper_graph = SmartScraperGraph( prompt="List me all the titles and project descriptions" - file_source="https://perinim.github.io/projects/", # also accepts a local file path + file_source="https://perinim.github.io/projects/", #Also accepts a string with the already downloaded HTML code config=graph_config ) From b67af93946cc36b1cd4fcf3cc1230d5f98b74083 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com> Date: Fri, 29 Mar 2024 19:17:18 +0100 Subject: [PATCH 03/12] Update scrape_plain_text.py --- examples/scrape_plain_text.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/scrape_plain_text.py b/examples/scrape_plain_text.py index 4062ab86..fb2c0f84 100644 --- a/examples/scrape_plain_text.py +++ b/examples/scrape_plain_text.py @@ -32,3 +32,6 @@ result = smart_scraper_graph.run() print(result) # Save to json or csv +onvert_to_csv(result, "result") +convert_to_json(result, "result") + From 345fc32b378a66af515e805c0873017ce05cbc12 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Mon, 1 Apr 2024 12:52:30 +0200 Subject: [PATCH 04/12] add discord server --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8fecf51b..5ff114e3 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint) [![Pylint](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml/badge.svg)](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![](https://dcbadge.vercel.app/api/server/zkspfFwqDg)](https://discord.gg/zkspfFwqDg) ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines for websites and documents. From c2885e0f4c2a00316731f3bf2c135d4c7e86e3d9 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Mon, 1 Apr 2024 12:54:11 +0200 Subject: [PATCH 05/12] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ff114e3..e6ec1e85 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint) [![Pylint](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml/badge.svg)](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![](https://dcbadge.vercel.app/api/server/zkspfFwqDg)](https://discord.gg/zkspfFwqDg) +[![](https://dcbadge.vercel.app/api/server/bSgWTVXz)](https://discord.gg/bSgWTVXz) ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines for websites and documents. From 15e7e0f8b559a2aa541dd02cacf9160640f6aba2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com> Date: Tue, 2 Apr 2024 12:56:26 +0200 Subject: [PATCH 06/12] Update README.md --- README.md | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e6ec1e85..46b087ca 100644 --- a/README.md +++ b/README.md @@ -50,31 +50,26 @@ You can use the `SmartScraper` class to extract information from a website using The `SmartScraper` class is a direct graph implementation that uses the most common nodes present in a web scraping pipeline. For more information, please see the [documentation](https://scrapegraph-ai.readthedocs.io/en/latest/). ```python -import os -from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph - -load_dotenv() -openai_key = os.getenv("OPENAI_APIKEY") +OPENAI_API_KEY = "YOUR_API_KEY" # Define the configuration for the graph graph_config = { "llm": { - "api_key": openai_key, + "api_key": OPENAI_API_KEY, "model": "gpt-3.5-turbo", }, } # Create the SmartScraperGraph instance smart_scraper_graph = SmartScraperGraph( - prompt="List me all the titles and project descriptions" - file_source="https://perinim.github.io/projects/", #Also accepts a string with the already downloaded HTML code + prompt="List me all the news with their description.", + file_source="https://perinim.github.io/projects/", # also accepts a string with the already downloaded HTML code as string format config=graph_config ) result = smart_scraper_graph.run() print(result) - ``` The output will be a dictionary with the extracted information, for example: From 99f89a6c3960c5b8dc0ad094b5bd2851c35d0740 Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Tue, 2 Apr 2024 13:37:11 +0200 Subject: [PATCH 07/12] new discord permanent link --- README.md | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e6ec1e85..1f747edb 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen)](https://github.com/pylint-dev/pylint) [![Pylint](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml/badge.svg)](https://github.com/VinciGit00/Scrapegraph-ai/actions/workflows/pylint.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![](https://dcbadge.vercel.app/api/server/bSgWTVXz)](https://discord.gg/bSgWTVXz) +[![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX) ScrapeGraphAI is a *web scraping* python library based on LangChain which uses LLM and direct graph logic to create scraping pipelines for websites and documents. @@ -96,7 +96,7 @@ Fell free to contribute and join our Discord server to discuss with us improveme For more information, please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/CONTRIBUTING.md). -[![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/bSgWTVXz) +[![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/gkxQDAjfeX) [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) [![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraph) diff --git a/pyproject.toml b/pyproject.toml index a60ac03e..120a9ba0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scrapegraphai" -version = "0.0.12" +version = "0.0.14" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ "Marco Vinciguerra ", From 4c3ea8f4ac1e5d86501cbdb1140d4b906cb7fecd Mon Sep 17 00:00:00 2001 From: VinciGit00 Date: Wed, 3 Apr 2024 12:53:12 +0200 Subject: [PATCH 08/12] fixed image_to_tex_node and refactoring --- examples/ScrapeGraphAI_generated_graph | 19 ------------ examples/graph_builder_example.py | 31 -------------------- examples/{ => inputs}/plain_html_example.txt | 0 examples/{ => results}/result.csv | 0 examples/{ => results}/result.json | 0 examples/scrape_plain_text.py | 6 ++-- scrapegraphai/nodes/base_node.py | 7 +++-- scrapegraphai/nodes/image_to_text_node.py | 29 ++++++++++-------- scrapegraphai/utils/parse_state_keys.py | 8 +++-- scrapegraphai/utils/remover.py | 3 -- scrapegraphai/utils/save_audio_from_bytes.py | 4 +-- 11 files changed, 32 insertions(+), 75 deletions(-) delete mode 100644 examples/ScrapeGraphAI_generated_graph delete mode 100644 examples/graph_builder_example.py rename examples/{ => inputs}/plain_html_example.txt (100%) rename examples/{ => results}/result.csv (100%) rename examples/{ => results}/result.json (100%) diff --git a/examples/ScrapeGraphAI_generated_graph b/examples/ScrapeGraphAI_generated_graph deleted file mode 100644 index acc3232c..00000000 --- a/examples/ScrapeGraphAI_generated_graph +++ /dev/null @@ -1,19 +0,0 @@ -// ScrapeGraphAI Generated Graph -digraph { - node [color=lightblue2 style=filled] - FetchHTMLNode [shape=doublecircle] - GetProbableTagsNode - ParseNode - RAGNode - GenerateAnswerNode - ConditionalNode - ImageToTextNode - TextToSpeechNode - FetchHTMLNode -> GetProbableTagsNode - GetProbableTagsNode -> ParseNode - ParseNode -> RAGNode - RAGNode -> GenerateAnswerNode - RAGNode -> ConditionalNode - ConditionalNode -> ImageToTextNode - ConditionalNode -> TextToSpeechNode -} diff --git a/examples/graph_builder_example.py b/examples/graph_builder_example.py deleted file mode 100644 index 53956e09..00000000 --- a/examples/graph_builder_example.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Example of graph builder -""" -import os -from dotenv import load_dotenv -from scrapegraphai.builders import GraphBuilder - -load_dotenv() -openai_key = os.getenv("OPENAI_APIKEY") - -# Define the configuration for the graph -graph_config = { - "llm": { - "api_key": openai_key, - "model": "gpt-3.5-turbo", - }, -} - -# Example usage of GraphBuilder -graph_builder = GraphBuilder( - user_prompt="Extract the news and generate a text summary with a voiceover.", - config=graph_config -) - -graph_json = graph_builder.build_graph() - -# Convert the resulting JSON to Graphviz format -graphviz_graph = graph_builder.convert_json_to_graphviz(graph_json) - -# Save the graph to a file and open it in the default viewer -graphviz_graph.render('ScrapeGraphAI_generated_graph', view=True) diff --git a/examples/plain_html_example.txt b/examples/inputs/plain_html_example.txt similarity index 100% rename from examples/plain_html_example.txt rename to examples/inputs/plain_html_example.txt diff --git a/examples/result.csv b/examples/results/result.csv similarity index 100% rename from examples/result.csv rename to examples/results/result.csv diff --git a/examples/result.json b/examples/results/result.json similarity index 100% rename from examples/result.json rename to examples/results/result.json diff --git a/examples/scrape_plain_text.py b/examples/scrape_plain_text.py index fb2c0f84..81dee0f9 100644 --- a/examples/scrape_plain_text.py +++ b/examples/scrape_plain_text.py @@ -5,6 +5,7 @@ Basic example of scraping pipeline using SmartScraper import os from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json load_dotenv() openai_key = os.getenv("OPENAI_APIKEY") @@ -19,7 +20,7 @@ graph_config = { # It could be also a http request using the request model -text = open('plain_html_example.txt', 'r', encoding="utf-8") +text = open('inputs/plain_html_example.txt', 'r', encoding="utf-8") # Create the SmartScraperGraph instance smart_scraper_graph = SmartScraperGraph( @@ -32,6 +33,5 @@ result = smart_scraper_graph.run() print(result) # Save to json or csv -onvert_to_csv(result, "result") +convert_to_csv(result, "result") convert_to_json(result, "result") - diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index e9766588..6a85f2d3 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -40,7 +40,8 @@ class BaseNode(ABC): raised to indicate the incorrect usage. """ - def __init__(self, node_name: str, node_type: str, input: str, output: List[str], min_input_len: int = 1, model_config: Optional[dict] = None): + def __init__(self, node_name: str, node_type: str, input: str, output: List[str], + min_input_len: int = 1, model_config: Optional[dict] = None): """ Initialize the node with a unique identifier and a specified node type. @@ -73,7 +74,9 @@ class BaseNode(ABC): pass def get_input_keys(self, state: dict) -> List[str]: - # Use the _parse_input_keys method to identify which state keys are needed based on the input attribute + """Use the _parse_input_keys method to identify which state keys are + needed based on the input attribute + """ try: input_keys = self._parse_input_keys(state, self.input) self._validate_input_keys(input_keys) diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 703355d5..0a845d05 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -1,7 +1,7 @@ -""" +""" Module for the ImageToTextNode class. """ - +from typing import List from .base_node import BaseNode @@ -10,34 +10,39 @@ class ImageToTextNode(BaseNode): A class representing a node that processes an image and returns the text description. Attributes: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. + llm_model (OpenAIImageToText): An instance of the OpenAIImageToText class. Methods: execute(state, url): Execute the node's logic and return the updated state. """ - def __init__(self, llm, node_name: str): + def __init__(self, input: str, output: List[str], model_config: dict, + node_name: str = "GetProbableTags"): """ Initializes an instance of the ImageToTextNode class. Args: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. - node_name (str): name of the node + input (str): The input for the node. + output (List[str]): The output of the node. + model_config (dict): Configuration for the model. + node_name (str): Name of the node. """ - super().__init__(node_name, "node") - self.llm = llm + super().__init__(node_name, "node", input, output, 2, model_config) + self.llm_model = model_config["llm_model"] def execute(self, state: dict, url: str) -> dict: """ Execute the node's logic and return the updated state. + Args: state (dict): The current state of the graph. - url (str): url of the image where to - :return: The updated state after executing this node. - """ + url (str): URL of the image to process. + Returns: + dict: The updated state after executing this node. + """ print("---GENERATING TEXT FROM IMAGE---") - text_answer = self.llm.run(url) + text_answer = self.llm_model.run(url) state.update({"image_text": text_answer}) return state diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index c5da7e8a..5c99a60f 100644 --- a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -7,6 +7,8 @@ import re def parse_expression(expression, state: dict): """ Function for parsing the expressions + Args: + state (dict): state to elaborate """ # Check for empty expression if not expression: @@ -69,14 +71,14 @@ def parse_expression(expression, state: dict): '|'.join(sub_result) + expression[end+1:] return evaluate_simple_expression(expression) - result = evaluate_expression(expression) + temp_result = evaluate_expression(expression) - if not result: + if not temp_result: raise ValueError("No state keys matched the expression.") # Remove redundant state keys from the result, without changing their order final_result = [] - for key in result: + for key in temp_result: if key not in final_result: final_result.append(key) diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py index 9f765473..1cde0c0f 100644 --- a/scrapegraphai/utils/remover.py +++ b/scrapegraphai/utils/remover.py @@ -18,14 +18,11 @@ def remover(html_content: str) -> str: soup = BeautifulSoup(html_content, 'html.parser') - # Estrai il titolo title_tag = soup.find('title') title = title_tag.get_text() if title_tag else "" - # Rimuovi i tag