From 6b71ec1d2be953220b6767bc429f4cf6529803fd Mon Sep 17 00:00:00 2001 From: Marco Perini Date: Wed, 8 May 2024 15:36:26 +0200 Subject: [PATCH] fix(examples): local, mixed models and fixed SearchGraph embeddings problem --- .../local_models/Docker/csv_scraper_docker.py | 54 ------ examples/local_models/Docker/inputs/books.xml | 120 ------------ .../local_models/Docker/inputs/example.json | 182 ------------------ .../Docker/inputs/plain_html_example.txt | 105 ---------- .../local_models/Docker/inputs/username.csv | 7 - .../Docker/json_scraper_docker.py | 61 ------ .../Docker/scrape_plain_text_docker.py | 57 ------ .../local_models/Docker/scrape_xml_docker.py | 56 ------ .../Docker/script_generator_docker.py | 44 ----- .../Docker/search_graph_docker.py | 49 ----- .../Docker/smart_scraper_docker.py | 43 ----- .../local_models/Docker/xml_scraper_docker.py | 61 ------ .../local_models/Ollama/csv_scraper_ollama.py | 14 +- .../Ollama/json_scraper_ollama.py | 3 +- examples/local_models/Ollama/readme.md | 0 .../Ollama/scrape_plain_text_ollama.py | 5 +- .../local_models/Ollama/scrape_xml_ollama.py | 3 +- .../Ollama/script_generator_ollama.py | 3 +- .../Ollama/search_graph_ollama.py | 9 +- .../Ollama/smart_scraper_ollama.py | 12 +- .../local_models/Ollama/xml_scraper_ollama.py | 5 +- .../{Docker/readme.md => README.md} | 0 .../mixed_models/search_graph_groq_ollama.py | 15 +- examples/mixed_models/smart_scraper_mixed.py | 3 +- scrapegraphai/graphs/search_graph.py | 6 +- scrapegraphai/nodes/search_internet_node.py | 7 +- 26 files changed, 53 insertions(+), 871 deletions(-) delete mode 100644 examples/local_models/Docker/csv_scraper_docker.py delete mode 100644 examples/local_models/Docker/inputs/books.xml delete mode 100644 examples/local_models/Docker/inputs/example.json delete mode 100644 examples/local_models/Docker/inputs/plain_html_example.txt delete mode 100644 examples/local_models/Docker/inputs/username.csv delete mode 100644 examples/local_models/Docker/json_scraper_docker.py delete mode 100644 examples/local_models/Docker/scrape_plain_text_docker.py delete mode 100644 examples/local_models/Docker/scrape_xml_docker.py delete mode 100644 examples/local_models/Docker/script_generator_docker.py delete mode 100644 examples/local_models/Docker/search_graph_docker.py delete mode 100644 examples/local_models/Docker/smart_scraper_docker.py delete mode 100644 examples/local_models/Docker/xml_scraper_docker.py delete mode 100644 examples/local_models/Ollama/readme.md rename examples/local_models/{Docker/readme.md => README.md} (100%) diff --git a/examples/local_models/Docker/csv_scraper_docker.py b/examples/local_models/Docker/csv_scraper_docker.py deleted file mode 100644 index 51e96b17..00000000 --- a/examples/local_models/Docker/csv_scraper_docker.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Basic example of scraping pipeline using CSVScraperGraph from CSV documents -""" - -import pandas as pd -from scrapegraphai.graphs import CSVScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Read the csv file -# ************************************************ - -text = pd.read_csv("inputs/username.csv") - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } -} - -# ************************************************ -# Create the CSVScraperGraph instance and run it -# ************************************************ - -csv_scraper_graph = CSVScraperGraph( - prompt="List me all the last names", - source=str(text), # Pass the content of the file, not the file object - config=graph_config -) - -result = csv_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = csv_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/local_models/Docker/inputs/books.xml b/examples/local_models/Docker/inputs/books.xml deleted file mode 100644 index e3d1fe87..00000000 --- a/examples/local_models/Docker/inputs/books.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - Gambardella, Matthew - XML Developer's Guide - Computer - 44.95 - 2000-10-01 - An in-depth look at creating applications - with XML. - - - Ralls, Kim - Midnight Rain - Fantasy - 5.95 - 2000-12-16 - A former architect battles corporate zombies, - an evil sorceress, and her own childhood to become queen - of the world. - - - Corets, Eva - Maeve Ascendant - Fantasy - 5.95 - 2000-11-17 - After the collapse of a nanotechnology - society in England, the young survivors lay the - foundation for a new society. - - - Corets, Eva - Oberon's Legacy - Fantasy - 5.95 - 2001-03-10 - In post-apocalypse England, the mysterious - agent known only as Oberon helps to create a new life - for the inhabitants of London. Sequel to Maeve - Ascendant. - - - Corets, Eva - The Sundered Grail - Fantasy - 5.95 - 2001-09-10 - The two daughters of Maeve, half-sisters, - battle one another for control of England. Sequel to - Oberon's Legacy. - - - Randall, Cynthia - Lover Birds - Romance - 4.95 - 2000-09-02 - When Carla meets Paul at an ornithology - conference, tempers fly as feathers get ruffled. - - - Thurman, Paula - Splish Splash - Romance - 4.95 - 2000-11-02 - A deep sea diver finds true love twenty - thousand leagues beneath the sea. - - - Knorr, Stefan - Creepy Crawlies - Horror - 4.95 - 2000-12-06 - An anthology of horror stories about roaches, - centipedes, scorpions and other insects. - - - Kress, Peter - Paradox Lost - Science Fiction - 6.95 - 2000-11-02 - After an inadvertant trip through a Heisenberg - Uncertainty Device, James Salway discovers the problems - of being quantum. - - - O'Brien, Tim - Microsoft .NET: The Programming Bible - Computer - 36.95 - 2000-12-09 - Microsoft's .NET initiative is explored in - detail in this deep programmer's reference. - - - O'Brien, Tim - MSXML3: A Comprehensive Guide - Computer - 36.95 - 2000-12-01 - The Microsoft MSXML3 parser is covered in - detail, with attention to XML DOM interfaces, XSLT processing, - SAX and more. - - - Galos, Mike - Visual Studio 7: A Comprehensive Guide - Computer - 49.95 - 2001-04-16 - Microsoft Visual Studio 7 is explored in depth, - looking at how Visual Basic, Visual C++, C#, and ASP+ are - integrated into a comprehensive development - environment. - - \ No newline at end of file diff --git a/examples/local_models/Docker/inputs/example.json b/examples/local_models/Docker/inputs/example.json deleted file mode 100644 index 2263184c..00000000 --- a/examples/local_models/Docker/inputs/example.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "kind":"youtube#searchListResponse", - "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", - "nextPageToken":"CAUQAA", - "regionCode":"NL", - "pageInfo":{ - "totalResults":1000000, - "resultsPerPage":5 - }, - "items":[ - { - "kind":"youtube#searchResult", - "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", - "id":{ - "kind":"youtube#video", - "videoId":"TvWDY4Mm5GM" - }, - "snippet":{ - "publishedAt":"2023-07-24T14:15:01Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T14:15:01Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", - "id":{ - "kind":"youtube#video", - "videoId":"aZM_42CcNZ4" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:09:27Z", - "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", - "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", - "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"John Nellis", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:09:27Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", - "id":{ - "kind":"youtube#video", - "videoId":"wkP3XS3aNAY" - }, - "snippet":{ - "publishedAt":"2023-07-24T16:00:50Z", - "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", - "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", - "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Shoot for Love", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T16:00:50Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", - "id":{ - "kind":"youtube#video", - "videoId":"rJkDZ0WvfT8" - }, - "snippet":{ - "publishedAt":"2023-07-24T10:00:39Z", - "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", - "title":"TOP 10 DEFENDERS 2023", - "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"Home of Football", - "liveBroadcastContent":"none", - "publishTime":"2023-07-24T10:00:39Z" - } - }, - { - "kind":"youtube#searchResult", - "etag":"wtuknXTmI1txoULeH3aWaOuXOow", - "id":{ - "kind":"youtube#video", - "videoId":"XH0rtu4U6SE" - }, - "snippet":{ - "publishedAt":"2023-07-21T16:30:05Z", - "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", - "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", - "description":"", - "thumbnails":{ - "default":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", - "width":120, - "height":90 - }, - "medium":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", - "width":320, - "height":180 - }, - "high":{ - "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", - "width":480, - "height":360 - } - }, - "channelTitle":"FC Motivate", - "liveBroadcastContent":"none", - "publishTime":"2023-07-21T16:30:05Z" - } - } - ] -} \ No newline at end of file diff --git a/examples/local_models/Docker/inputs/plain_html_example.txt b/examples/local_models/Docker/inputs/plain_html_example.txt deleted file mode 100644 index 78f814ae..00000000 --- a/examples/local_models/Docker/inputs/plain_html_example.txt +++ /dev/null @@ -1,105 +0,0 @@ - -
- - -
-
-
-
-
-
-

Projects

-

-
-
- -
-
-
- -
- \ No newline at end of file diff --git a/examples/local_models/Docker/inputs/username.csv b/examples/local_models/Docker/inputs/username.csv deleted file mode 100644 index 006ac8e6..00000000 --- a/examples/local_models/Docker/inputs/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username; Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/examples/local_models/Docker/json_scraper_docker.py b/examples/local_models/Docker/json_scraper_docker.py deleted file mode 100644 index 758de09e..00000000 --- a/examples/local_models/Docker/json_scraper_docker.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Basic example of scraping pipeline using JSONScraperGraph from JSON documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import JSONScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the JSON file -# ************************************************ - -FILE_NAME = "inputs/example.json" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } -} - -# ************************************************ -# Create the JSONScraperGraph instance and run it -# ************************************************ - -json_scraper_graph = JSONScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = json_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = json_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/local_models/Docker/scrape_plain_text_docker.py b/examples/local_models/Docker/scrape_plain_text_docker.py deleted file mode 100644 index 40f48549..00000000 --- a/examples/local_models/Docker/scrape_plain_text_docker.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from text -""" - -import os -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Read the text file -# ************************************************ - -FILE_NAME = "inputs/plain_html_example.txt" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -# It could be also a http request using the request model -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the projects with their description.", - source=text, - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/Docker/scrape_xml_docker.py b/examples/local_models/Docker/scrape_xml_docker.py deleted file mode 100644 index e15b4b89..00000000 --- a/examples/local_models/Docker/scrape_xml_docker.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper from XML documents -""" -import os -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/Docker/script_generator_docker.py b/examples/local_models/Docker/script_generator_docker.py deleted file mode 100644 index ae585a35..00000000 --- a/examples/local_models/Docker/script_generator_docker.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Basic example of scraping pipeline using ScriptCreatorGraph -""" -from scrapegraphai.graphs import ScriptCreatorGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", - # "model_tokens": 2000, # set context length arbitrarily, - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, - "library": "beautifoulsoup" -} - -# ************************************************ -# Create the ScriptCreatorGraph instance and run it -# ************************************************ - -smart_scraper_graph = ScriptCreatorGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) - -result = smart_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/Docker/search_graph_docker.py b/examples/local_models/Docker/search_graph_docker.py deleted file mode 100644 index 1db5c207..00000000 --- a/examples/local_models/Docker/search_graph_docker.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Example of Search Graph -""" - -from scrapegraphai.graphs import SearchGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - }, - "max_results": 5, - "verbose": True, -} - -# ************************************************ -# Create the SearchGraph instance and run it -# ************************************************ - -search_graph = SearchGraph( - prompt="List me the best escursions near Trento", - config=graph_config -) - -result = search_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = search_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json and csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/local_models/Docker/smart_scraper_docker.py b/examples/local_models/Docker/smart_scraper_docker.py deleted file mode 100644 index 9e64aed9..00000000 --- a/examples/local_models/Docker/smart_scraper_docker.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Basic example of scraping pipeline using SmartScraper -""" -from scrapegraphai.graphs import SmartScraperGraph -from scrapegraphai.utils import prettify_exec_info - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/llama3", - "temperature": 0, - "format": "json", - "model_tokens": 2000, # set context length arbitrarily, - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "base_url": "http://localhost:11434", - } -} - -# ************************************************ -# Create the SmartScraperGraph instance and run it -# ************************************************ - -smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", - # also accepts a string with the already downloaded HTML code - source="https://perinim.github.io/projects", - config=graph_config -) -result = smart_scraper_graph.run() -print(result) - - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = smart_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/Docker/xml_scraper_docker.py b/examples/local_models/Docker/xml_scraper_docker.py deleted file mode 100644 index 6a8c86cc..00000000 --- a/examples/local_models/Docker/xml_scraper_docker.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -Basic example of scraping pipeline using XMLScraperGraph from XML documents -""" - -import os -from dotenv import load_dotenv -from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info -load_dotenv() - -# ************************************************ -# Read the XML file -# ************************************************ - -FILE_NAME = "inputs/books.xml" -curr_dir = os.path.dirname(os.path.realpath(__file__)) -file_path = os.path.join(curr_dir, FILE_NAME) - -with open(file_path, 'r', encoding="utf-8") as file: - text = file.read() - -# ************************************************ -# Define the configuration for the graph -# ************************************************ - -graph_config = { - "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily - }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } -} - -# ************************************************ -# Create the XMLScraperGraph instance and run it -# ************************************************ - -xml_scraper_graph = XMLScraperGraph( - prompt="List me all the authors, title and genres of the books", - source=text, # Pass the content of the file, not the file object - config=graph_config -) - -result = xml_scraper_graph.run() -print(result) - -# ************************************************ -# Get graph execution info -# ************************************************ - -graph_exec_info = xml_scraper_graph.get_execution_info() -print(prettify_exec_info(graph_exec_info)) - -# Save to json or csv -convert_to_csv(result, "result") -convert_to_json(result, "result") diff --git a/examples/local_models/Ollama/csv_scraper_ollama.py b/examples/local_models/Ollama/csv_scraper_ollama.py index c81d963b..8d1edbd7 100644 --- a/examples/local_models/Ollama/csv_scraper_ollama.py +++ b/examples/local_models/Ollama/csv_scraper_ollama.py @@ -2,15 +2,20 @@ Basic example of scraping pipeline using CSVScraperGraph from CSV documents """ +import os import pandas as pd from scrapegraphai.graphs import CSVScraperGraph from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info # ************************************************ -# Read the csv file +# Read the CSV file # ************************************************ -text = pd.read_csv("inputs/username.csv") +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) # ************************************************ # Define the configuration for the graph @@ -18,7 +23,7 @@ text = pd.read_csv("inputs/username.csv") graph_config = { "llm": { - "model": "ollama/mistral", + "model": "ollama/llama3", "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly # "model_tokens": 2000, # set context length arbitrarily @@ -28,7 +33,8 @@ graph_config = { "model": "ollama/nomic-embed-text", "temperature": 0, "base_url": "http://localhost:11434", - } + }, + "verbose": True, } # ************************************************ diff --git a/examples/local_models/Ollama/json_scraper_ollama.py b/examples/local_models/Ollama/json_scraper_ollama.py index 90c4a151..2dd072ac 100644 --- a/examples/local_models/Ollama/json_scraper_ollama.py +++ b/examples/local_models/Ollama/json_scraper_ollama.py @@ -35,7 +35,8 @@ graph_config = { "model": "ollama/nomic-embed-text", "temperature": 0, "base_url": "http://localhost:11434", - } + }, + "verbose": True, } # ************************************************ diff --git a/examples/local_models/Ollama/readme.md b/examples/local_models/Ollama/readme.md deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/local_models/Ollama/scrape_plain_text_ollama.py b/examples/local_models/Ollama/scrape_plain_text_ollama.py index c8f13d3b..9700d713 100644 --- a/examples/local_models/Ollama/scrape_plain_text_ollama.py +++ b/examples/local_models/Ollama/scrape_plain_text_ollama.py @@ -34,7 +34,8 @@ graph_config = { "model": "ollama/nomic-embed-text", "temperature": 0, "base_url": "http://localhost:11434", - } + }, + "verbose": True, } # ************************************************ @@ -42,7 +43,7 @@ graph_config = { # ************************************************ smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", + prompt="List me all the projects", source=text, config=graph_config ) diff --git a/examples/local_models/Ollama/scrape_xml_ollama.py b/examples/local_models/Ollama/scrape_xml_ollama.py index 64c87089..4a3e1f65 100644 --- a/examples/local_models/Ollama/scrape_xml_ollama.py +++ b/examples/local_models/Ollama/scrape_xml_ollama.py @@ -33,7 +33,8 @@ graph_config = { "model": "ollama/nomic-embed-text", "temperature": 0, "base_url": "http://localhost:11434", # set ollama URL arbitrarily - } + }, + "verbose": True, } # ************************************************ diff --git a/examples/local_models/Ollama/script_generator_ollama.py b/examples/local_models/Ollama/script_generator_ollama.py index a756b202..3ad0b55f 100644 --- a/examples/local_models/Ollama/script_generator_ollama.py +++ b/examples/local_models/Ollama/script_generator_ollama.py @@ -19,7 +19,8 @@ graph_config = { "temperature": 0, "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "library": "beautifoulsoup" + "library": "beautifoulsoup", + "verbose": True, } # ************************************************ diff --git a/examples/local_models/Ollama/search_graph_ollama.py b/examples/local_models/Ollama/search_graph_ollama.py index cfa73e3e..8ecb60c1 100644 --- a/examples/local_models/Ollama/search_graph_ollama.py +++ b/examples/local_models/Ollama/search_graph_ollama.py @@ -11,16 +11,15 @@ from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_i graph_config = { "llm": { - "model": "ollama/mistral", + "model": "ollama/llama3", "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily + # "format": "json", # Ollama needs the format to be specified explicitly + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "embeddings": { "model": "ollama/nomic-embed-text", "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "max_results": 5, "verbose": True, diff --git a/examples/local_models/Ollama/smart_scraper_ollama.py b/examples/local_models/Ollama/smart_scraper_ollama.py index d710b986..02529e9e 100644 --- a/examples/local_models/Ollama/smart_scraper_ollama.py +++ b/examples/local_models/Ollama/smart_scraper_ollama.py @@ -9,17 +9,17 @@ from scrapegraphai.utils import prettify_exec_info graph_config = { "llm": { - "model": "ollama/mistral", + "model": "ollama/llama3", "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly - # "model_tokens": 2000, # set context length arbitrarily, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, "embeddings": { "model": "ollama/nomic-embed-text", "temperature": 0, - "base_url": "http://localhost:11434", # set ollama URL arbitrarily - } + # "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "verbose": True, } # ************************************************ @@ -27,7 +27,7 @@ graph_config = { # ************************************************ smart_scraper_graph = SmartScraperGraph( - prompt="List me all the news with their description.", + prompt="List me all the projects with their description.", # also accepts a string with the already downloaded HTML code source="https://perinim.github.io/projects", config=graph_config diff --git a/examples/local_models/Ollama/xml_scraper_ollama.py b/examples/local_models/Ollama/xml_scraper_ollama.py index 4c149a2b..f13122f7 100644 --- a/examples/local_models/Ollama/xml_scraper_ollama.py +++ b/examples/local_models/Ollama/xml_scraper_ollama.py @@ -25,7 +25,7 @@ with open(file_path, 'r', encoding="utf-8") as file: graph_config = { "llm": { - "model": "ollama/mistral", + "model": "ollama/llama3", "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly # "model_tokens": 2000, # set context length arbitrarily @@ -35,7 +35,8 @@ graph_config = { "model": "ollama/nomic-embed-text", "temperature": 0, "base_url": "http://localhost:11434", - } + }, + "verbose": True, } # ************************************************ diff --git a/examples/local_models/Docker/readme.md b/examples/local_models/README.md similarity index 100% rename from examples/local_models/Docker/readme.md rename to examples/local_models/README.md diff --git a/examples/mixed_models/search_graph_groq_ollama.py b/examples/mixed_models/search_graph_groq_ollama.py index 76afe1cc..7883fa77 100644 --- a/examples/mixed_models/search_graph_groq_ollama.py +++ b/examples/mixed_models/search_graph_groq_ollama.py @@ -12,18 +12,21 @@ load_dotenv() # Define the configuration for the graph # ************************************************ -openai_key = os.getenv("OPENAI_APIKEY") +groq_key = os.getenv("GROQ_APIKEY") graph_config = { "llm": { - "model": "ollama/mistral", - "temperature": 0, - "format": "json", # Ollama needs the format to be specified explicitly + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0 }, "embeddings": { - "api_key": openai_key, - "model": "gpt-3.5-turbo", + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, + "max_results": 2, + "verbose": True, } # ************************************************ diff --git a/examples/mixed_models/smart_scraper_mixed.py b/examples/mixed_models/smart_scraper_mixed.py index 6adb61b5..95dec64c 100644 --- a/examples/mixed_models/smart_scraper_mixed.py +++ b/examples/mixed_models/smart_scraper_mixed.py @@ -25,7 +25,8 @@ graph_config = { "temperature": 0, "base_url": "http://localhost:11434", # set ollama URL arbitrarily }, - "headless": False + "headless": False, + "verbose": True, } # ************************************************ diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py index cb109384..58b7069c 100644 --- a/scrapegraphai/graphs/search_graph.py +++ b/scrapegraphai/graphs/search_graph.py @@ -2,6 +2,8 @@ SearchGraph Module """ +from copy import deepcopy + from .base_graph import BaseGraph from ..nodes import ( SearchInternetNode, @@ -40,6 +42,8 @@ class SearchGraph(AbstractGraph): def __init__(self, prompt: str, config: dict): self.max_results = config.get("max_results", 3) + self.copy_config = deepcopy(config) + super().__init__(prompt, config) def _create_graph(self) -> BaseGraph: @@ -57,7 +61,7 @@ class SearchGraph(AbstractGraph): smart_scraper_instance = SmartScraperGraph( prompt="", source="", - config=self.config + config=self.copy_config ) # ************************************************ diff --git a/scrapegraphai/nodes/search_internet_node.py b/scrapegraphai/nodes/search_internet_node.py index 55e622d0..87f8dcb2 100644 --- a/scrapegraphai/nodes/search_internet_node.py +++ b/scrapegraphai/nodes/search_internet_node.py @@ -69,10 +69,13 @@ class SearchInternetNode(BaseNode): search_template = """ PROMPT: - Given the following user prompt, return a query that can be + You are a search engine and you need to generate a search query based on the user's prompt. \n + Given the following user prompt, return a query that can be used to search the internet for relevant information. \n You should return only the query string without any additional sentences. \n - You are taught to reply directly giving the search query. \n + For example, if the user prompt is "What is the capital of France?", + you should return "capital of France". \n + If yuo return something else, you will get a really bad grade. \n USER PROMPT: {user_prompt}""" search_prompt = PromptTemplate(