diff --git a/.gitignore b/.gitignore index 8b9c55dd..e3cb105b 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,5 @@ examples/**/result.json main.py lib/ *.html - \ No newline at end of file +.idea + diff --git a/CHANGELOG.md b/CHANGELOG.md index 311c2d66..7cdea914 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,32 @@ +## [1.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.4.0-beta.1...v1.4.0-beta.2) (2024-05-19) + + +### Features + +* **docloaders:** undetected-playwright ([7b3ee4e](https://github.com/VinciGit00/Scrapegraph-ai/commit/7b3ee4e71e4af04edeb47999d70d398b67c93ac4)) + +## [1.4.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.0...v1.4.0-beta.1) (2024-05-19) + + +### Features + +* **base_graph:** alligned with main ([73fa31d](https://github.com/VinciGit00/Scrapegraph-ai/commit/73fa31db0f791d1fd63b489ac88cc6e595aa07f9)) + + +### CI + +* **release:** 1.2.0-beta.1 [skip ci] ([fd3e0aa](https://github.com/VinciGit00/Scrapegraph-ai/commit/fd3e0aa5823509dfb46b4f597521c24d4eb345f1)) +* **release:** 1.3.0-beta.1 [skip ci] ([191db0b](https://github.com/VinciGit00/Scrapegraph-ai/commit/191db0bc779e4913713b47b68ec4162a347da3ea)) + +## [1.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.2.4...v1.3.0) (2024-05-19) + + + +### Features + +* add new model ([8c7afa7](https://github.com/VinciGit00/Scrapegraph-ai/commit/8c7afa7570f0a104578deb35658168435cfe5ae1)) + + ## [1.2.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.2.3...v1.2.4) (2024-05-17) diff --git a/examples/bedrock/.env.example b/examples/bedrock/.env.example new file mode 100644 index 00000000..cd27769e --- /dev/null +++ b/examples/bedrock/.env.example @@ -0,0 +1,4 @@ +AWS_ACCESS_KEY_ID="..." +AWS_SECRET_ACCESS_KEY="..." +AWS_SESSION_TOKEN="..." +AWS_DEFAULT_REGION="..." 
\ No newline at end of file diff --git a/examples/bedrock/README.md b/examples/bedrock/README.md new file mode 100644 index 00000000..88edd82c --- /dev/null +++ b/examples/bedrock/README.md @@ -0,0 +1,3 @@ +This folder contains examples of how to use ScrapeGraphAI with [Amazon Bedrock](https://aws.amazon.com/bedrock/) ⛰️. The examples show how to extract information from websites and files using a natural language prompt. + +![](scrapegraphai_bedrock.png) \ No newline at end of file diff --git a/examples/bedrock/csv_scraper_bedrock.py b/examples/bedrock/csv_scraper_bedrock.py new file mode 100644 index 00000000..1fe09d0f --- /dev/null +++ b/examples/bedrock/csv_scraper_bedrock.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using CSVScraperGraph from CSV documents +""" + +import os +import json + +from dotenv import load_dotenv + +import pandas as pd + +from scrapegraphai.graphs import CSVScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the CSV file +# ************************************************ + +FILE_NAME = "inputs/username.csv" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +text = pd.read_csv(file_path) + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the CSVScraperGraph instance and run it +# ************************************************ + +csv_scraper_graph = CSVScraperGraph( + prompt="List me all the last names", + source=str(text), # Pass the content of the file, not the file object + 
config=graph_config +) + +result = csv_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = csv_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/custom_graph_bedrock.py b/examples/bedrock/custom_graph_bedrock.py new file mode 100644 index 00000000..d550b46b --- /dev/null +++ b/examples/bedrock/custom_graph_bedrock.py @@ -0,0 +1,127 @@ +""" +Example of custom graph using existing nodes +""" + +import json + +from dotenv import load_dotenv + +from langchain_aws import BedrockEmbeddings +from scrapegraphai.models import Bedrock +from scrapegraphai.graphs import BaseGraph +from scrapegraphai.nodes import ( + FetchNode, + ParseNode, + RAGNode, + GenerateAnswerNode, + RobotsNode +) + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Define the graph nodes +# ************************************************ + +llm_model = Bedrock({ + 'model_id': graph_config["llm"]["model"].split("/")[-1], + 'model_kwargs': { + 'temperature': 0.0 + }}) +embedder = BedrockEmbeddings(model_id=graph_config["embeddings"]["model"].split("/")[-1]) + +# Define the nodes for the graph +robot_node = RobotsNode( + input="url", + output=["is_scrapable"], + node_config={ + "llm_model": llm_model, + "force_scraping": True, + "verbose": True, + } +) + +fetch_node = FetchNode( + input="url | local_dir", + output=["doc", "link_urls", 
"img_urls"], + node_config={ + "verbose": True, + "headless": True, + } +) + +parse_node = ParseNode( + input="doc", + output=["parsed_doc"], + node_config={ + "chunk_size": 4096, + "verbose": True, + } +) + +rag_node = RAGNode( + input="user_prompt & (parsed_doc | doc)", + output=["relevant_chunks"], + node_config={ + "llm_model": llm_model, + "embedder_model": embedder, + "verbose": True, + } +) + +generate_answer_node = GenerateAnswerNode( + input="user_prompt & (relevant_chunks | parsed_doc | doc)", + output=["answer"], + node_config={ + "llm_model": llm_model, + "verbose": True, + } +) + +# ************************************************ +# Create the graph by defining the connections +# ************************************************ + +graph = BaseGraph( + nodes=[ + robot_node, + fetch_node, + parse_node, + rag_node, + generate_answer_node, + ], + edges=[ + (robot_node, fetch_node), + (fetch_node, parse_node), + (parse_node, rag_node), + (rag_node, generate_answer_node) + ], + entry_point=robot_node +) + +# ************************************************ +# Execute the graph +# ************************************************ + +result, execution_info = graph.execute({ + "user_prompt": "List me all the articles", + "url": "https://perinim.github.io/projects" +}) + +# Get the answer from the result +result = result.get("answer", "No answer found.") +print(json.dumps(result, indent=4)) diff --git a/examples/bedrock/inputs/books.xml b/examples/bedrock/inputs/books.xml new file mode 100644 index 00000000..e3d1fe87 --- /dev/null +++ b/examples/bedrock/inputs/books.xml @@ -0,0 +1,120 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen + of the world. 
+ + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + + Corets, Eva + Oberon's Legacy + Fantasy + 5.95 + 2001-03-10 + In post-apocalypse England, the mysterious + agent known only as Oberon helps to create a new life + for the inhabitants of London. Sequel to Maeve + Ascendant. + + + Corets, Eva + The Sundered Grail + Fantasy + 5.95 + 2001-09-10 + The two daughters of Maeve, half-sisters, + battle one another for control of England. Sequel to + Oberon's Legacy. + + + Randall, Cynthia + Lover Birds + Romance + 4.95 + 2000-09-02 + When Carla meets Paul at an ornithology + conference, tempers fly as feathers get ruffled. + + + Thurman, Paula + Splish Splash + Romance + 4.95 + 2000-11-02 + A deep sea diver finds true love twenty + thousand leagues beneath the sea. + + + Knorr, Stefan + Creepy Crawlies + Horror + 4.95 + 2000-12-06 + An anthology of horror stories about roaches, + centipedes, scorpions and other insects. + + + Kress, Peter + Paradox Lost + Science Fiction + 6.95 + 2000-11-02 + After an inadvertant trip through a Heisenberg + Uncertainty Device, James Salway discovers the problems + of being quantum. + + + O'Brien, Tim + Microsoft .NET: The Programming Bible + Computer + 36.95 + 2000-12-09 + Microsoft's .NET initiative is explored in + detail in this deep programmer's reference. + + + O'Brien, Tim + MSXML3: A Comprehensive Guide + Computer + 36.95 + 2000-12-01 + The Microsoft MSXML3 parser is covered in + detail, with attention to XML DOM interfaces, XSLT processing, + SAX and more. + + + Galos, Mike + Visual Studio 7: A Comprehensive Guide + Computer + 49.95 + 2001-04-16 + Microsoft Visual Studio 7 is explored in depth, + looking at how Visual Basic, Visual C++, C#, and ASP+ are + integrated into a comprehensive development + environment. 
+ + \ No newline at end of file diff --git a/examples/bedrock/inputs/example.json b/examples/bedrock/inputs/example.json new file mode 100644 index 00000000..d729b76a --- /dev/null +++ b/examples/bedrock/inputs/example.json @@ -0,0 +1,38 @@ +{ + "quiz": { + "sport": { + "q1": { + "question": "Which one is correct team name in NBA?", + "options": [ + "New York Bulls", + "Los Angeles Kings", + "Golden State Warriros", + "Huston Rocket" + ], + "answer": "Huston Rocket" + } + }, + "maths": { + "q1": { + "question": "5 + 7 = ?", + "options": [ + "10", + "11", + "12", + "13" + ], + "answer": "12" + }, + "q2": { + "question": "12 - 8 = ?", + "options": [ + "1", + "2", + "3", + "4" + ], + "answer": "4" + } + } + } +} \ No newline at end of file diff --git a/examples/bedrock/inputs/plain_html_example.txt b/examples/bedrock/inputs/plain_html_example.txt new file mode 100644 index 00000000..78f814ae --- /dev/null +++ b/examples/bedrock/inputs/plain_html_example.txt @@ -0,0 +1,105 @@ + +
+ + +
+
+
+
+
+
+

Projects

+

+
+
+ +
+
+
+ +
+ \ No newline at end of file diff --git a/examples/bedrock/inputs/username.csv b/examples/bedrock/inputs/username.csv new file mode 100644 index 00000000..8c039d7e --- /dev/null +++ b/examples/bedrock/inputs/username.csv @@ -0,0 +1,6 @@ +Username; Identifier;First name;Last name +booker12;9012;Rachel;Booker +grey07;2070;Laura;Grey +johnson81;4081;Craig;Johnson +jenkins46;9346;Mary;Jenkins +smith79;5079;Jamie;Smith \ No newline at end of file diff --git a/examples/bedrock/json_scraper_bedrock.py b/examples/bedrock/json_scraper_bedrock.py new file mode 100644 index 00000000..ad876425 --- /dev/null +++ b/examples/bedrock/json_scraper_bedrock.py @@ -0,0 +1,63 @@ +""" +Basic example of scraping pipeline using JSONScraperGraph from JSON documents +""" + +import os +import json + +from dotenv import load_dotenv + +from scrapegraphai.graphs import JSONScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the JSON file +# ************************************************ + +FILE_NAME = "inputs/example.json" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the JSONScraperGraph instance and run it +# ************************************************ + +json_scraper_graph = JSONScraperGraph( + prompt="List me all questions and options in the math section, no answers.", + source=text, # Pass the content of the file, not the file 
object + config=graph_config +) + +result = json_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = json_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") + diff --git a/examples/bedrock/scrape_plain_text_bedrock.py b/examples/bedrock/scrape_plain_text_bedrock.py new file mode 100644 index 00000000..5cc2067c --- /dev/null +++ b/examples/bedrock/scrape_plain_text_bedrock.py @@ -0,0 +1,59 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +import json + +from dotenv import load_dotenv + +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the text file +# ************************************************ + +FILE_NAME = "inputs/plain_html_example.txt" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +# It could also be an HTTP request made with the requests library +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + source=text, + config=graph_config +) + 
+result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/scrapegraphai_bedrock.png b/examples/bedrock/scrapegraphai_bedrock.png new file mode 100644 index 00000000..918cf191 Binary files /dev/null and b/examples/bedrock/scrapegraphai_bedrock.png differ diff --git a/examples/bedrock/script_generator_bedrock.py b/examples/bedrock/script_generator_bedrock.py new file mode 100644 index 00000000..038bfb53 --- /dev/null +++ b/examples/bedrock/script_generator_bedrock.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using ScriptCreatorGraph +""" + +from dotenv import load_dotenv + +from scrapegraphai.graphs import ScriptCreatorGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + }, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the ScriptCreatorGraph instance and run it +# ************************************************ + +script_creator_graph = ScriptCreatorGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config +) + +result = script_creator_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = 
script_creator_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + diff --git a/examples/bedrock/search_graph_bedrock.py b/examples/bedrock/search_graph_bedrock.py new file mode 100644 index 00000000..79e2c803 --- /dev/null +++ b/examples/bedrock/search_graph_bedrock.py @@ -0,0 +1,46 @@ +""" +Example of Search Graph +""" + +from dotenv import load_dotenv +from scrapegraphai.graphs import SearchGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/amazon.titan-embed-text-v2:0" + } +} + +# ************************************************ +# Create the SearchGraph instance and run it +# ************************************************ + +search_graph = SearchGraph( + prompt="List me Chioggia's famous dishes", + config=graph_config +) + +result = search_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = search_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json and csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/bedrock/smart_scraper_bedrock.py b/examples/bedrock/smart_scraper_bedrock.py index fff586f2..4f0952ae 100644 --- a/examples/bedrock/smart_scraper_bedrock.py +++ b/examples/bedrock/smart_scraper_bedrock.py @@ -1,42 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper """ -Smartscraper example on bedrock -""" -import boto3 +import os +from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import 
prettify_exec_info -# 0a. Initialize session -# If not required delete it -session = boto3.Session( - aws_access_key_id="...", - aws_secret_access_key="...", - aws_session_token="...", - region_name="us-east-1" -) +load_dotenv() -# 0b. Initialize client -client = session.client("bedrock-runtime") -# 1. Define graph configuration -config = { +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +openai_key = os.getenv("OPENAI_APIKEY") + +graph_config = { "llm": { - "client": client, - "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", - "temperature": 0.0, - "format": "json" - }, - "embeddings": { - "client": client, - "model": "bedrock/cohere.embed-multilingual-v3", + "api_key": openai_key, + "model": "gpt-4o", }, + "verbose": True, + "headless": False, } -# 2. Create graph instance -graph = SmartScraperGraph( - prompt="List me all the articles", - source="https://perinim.github.io/projects", - config=config +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config ) -# 3. Scrape away! 
-print(graph.run()) +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/xml_scraper_bedrock.py b/examples/bedrock/xml_scraper_bedrock.py new file mode 100644 index 00000000..cb4e24bc --- /dev/null +++ b/examples/bedrock/xml_scraper_bedrock.py @@ -0,0 +1,62 @@ +""" +Basic example of scraping pipeline using XMLScraperGraph from XML documents +""" + +import os +import json + +from dotenv import load_dotenv +from scrapegraphai.graphs import XMLScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info + +load_dotenv() + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0 + }, + "embeddings": { + "model": "bedrock/cohere.embed-multilingual-v3" + } +} + +# ************************************************ +# Create the XMLScraperGraph instance and run it +# ************************************************ + +xml_scraper_graph = XMLScraperGraph( + prompt="List me all the authors, title and genres of the books. 
Skip the preamble.", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = xml_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = xml_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") + diff --git a/pyproject.toml b/pyproject.toml index e49c6a63..21cb3e59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.2.4" +version = "1.4.0b2" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ @@ -32,6 +32,7 @@ dependencies = [ "yahoo-search-py==0.3", "networkx==3.3", "pyvis==0.3.2", + "undetected-playwright==0.3.0", ] license = "MIT" diff --git a/requirements.txt b/requirements.txt index 1e6224b4..2ccdf0d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ langchain-aws==0.1.2 langchain-anthropic==0.1.11 yahoo-search-py==0.3 pypdf==4.2.0 +undetected-playwright==0.3.0 \ No newline at end of file diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 7d499245..d3581a7a 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader): """ from playwright.async_api import async_playwright + from undetected_playwright import Malenia logger.info("Starting scraping...") results = "" @@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader): headless=self.headless, proxy=self.proxy, **self.browser_config ) try: - page = await browser.new_page() + context = await browser.new_context() + await Malenia.apply_stealth(context) + page = await context.new_page() await page.goto(url) 
results = await page.content() # Simply get the HTML content logger.info("Content scraped") diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 1a96aa97..b923c89d 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -53,6 +53,11 @@ class AbstractGraph(ABC): self.embedder_model = self._create_default_embedder(llm_config=config["llm"] ) if "embeddings" not in config else self._create_embedder( config["embeddings"]) + self.verbose = False if config is None else config.get( + "verbose", False) + self.headless = True if config is None else config.get( + "headless", True) + self.loader_kwargs = config.get("loader_kwargs", {}) # Create the graph self.graph = self._create_graph() diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 8788f38e..934bf5fe 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -27,6 +27,8 @@ models_tokens = { }, "gemini": { "gemini-pro": 128000, + "gemini-1.5-flash-latest":128000, + "gemini-1.5-pro-latest":128000, "models/embedding-001": 2048 }, @@ -50,6 +52,7 @@ models_tokens = { "dbrx": 32768, "dbrx:instruct": 32768, "nous-hermes2:34b": 4096, + "orca-mini": 2048, # embedding models "nomic-embed-text": 8192, "snowflake-arctic-embed:335m": 8192, @@ -82,6 +85,9 @@ models_tokens = { "mistral.mistral-7b-instruct-v0:2": 32768, "mistral.mixtral-8x7b-instruct-v0:1": 32768, "mistral.mistral-large-2402-v1:0": 32768, + # Embedding models + "amazon.titan-embed-text-v1": 8000, + "amazon.titan-embed-text-v2:0": 8000, "cohere.embed-english-v3": 512, "cohere.embed-multilingual-v3": 512 }, diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index e6a87936..af9446ba 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -98,10 +98,11 @@ class RobotsNode(BaseNode): base_url = 
f"{parsed_url.scheme}://{parsed_url.netloc}" loader = AsyncChromiumLoader(f"{base_url}/robots.txt") document = loader.load() - if "ollama" in self.llm_model.model_name: + if hasattr(self.llm_model, "model_name") and "ollama" in self.llm_model.model_name: self.llm_model.model_name = self.llm_model.model_name.split("/")[-1] model = self.llm_model.model_name.split("/")[-1] - + elif hasattr(self.llm_model, "model_id"): # Bedrock uses model IDs, not model names + model = self.llm_model.model_id.split("/")[-1] else: model = self.llm_model.model_name try: