From 1d7f30b65b24b80113cd898c1cfbfd5de5f240b5 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 17 Aug 2024 12:21:00 +0200 Subject: [PATCH] fix: browser-base integration --- examples/extras/browser_base_integration.py | 2 +- scrapegraphai/docloaders/browser_base.py | 4 ++- scrapegraphai/graphs/abstract_graph.py | 6 ++--- scrapegraphai/graphs/smart_scraper_graph.py | 1 + scrapegraphai/nodes/fetch_node.py | 30 ++++++++++----------- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/extras/browser_base_integration.py b/examples/extras/browser_base_integration.py index 97529879..4036cff6 100644 --- a/examples/extras/browser_base_integration.py +++ b/examples/extras/browser_base_integration.py @@ -18,7 +18,7 @@ load_dotenv() graph_config = { "llm": { "api_key": os.getenv("OPENAI_API_KEY"), - "model": "gpt-3.5-turbo", + "model": "gpt-4o", }, "browser_base": { "api_key": os.getenv("BROWSER_BASE_API_KEY"), diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index 77628bc5..9b60f36f 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -43,6 +43,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s browserbase = Browserbase(api_key=api_key, project_id=project_id) - result = browserbase.load([link]) + result = [] + for l in link: + result.append(browserbase.load(l, text_content=True)) return result diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 9cb39a0f..01ff0b0a 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -146,10 +146,10 @@ class AbstractGraph(ABC): with warnings.catch_warnings(): warnings.simplefilter("ignore") return init_chat_model(**llm_params) - - known_models = ["openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"] - if llm_params["model"].split("/")[0] not in known_models: + known_models = ["gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"] + + if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models: raise ValueError(f"Model '{llm_params['model']}' is not supported") try: diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index aa83c23b..0167103e 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -67,6 +67,7 @@ class SmartScraperGraph(AbstractGraph): "force": self.config.get("force", False), "cut": self.config.get("cut", True), "loader_kwargs": self.config.get("loader_kwargs", {}), + "browser_base": self.config.get("browser_base") } ) parse_node = ParseNode( diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index a0514f37..16672ccb 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -121,7 +121,7 @@ class FetchNode(BaseNode): "xml": self.handle_file, "md": self.handle_file, } - + if input_type in handlers: return handlers[input_type](state, input_type, source) elif self.input == "pdf_dir": @@ -130,7 +130,7 @@ class FetchNode(BaseNode): return self.handle_local_source(state, source) else: return self.handle_web_source(state, source) - + def handle_directory(self, state, input_type, source): """ Handles the directory by compressing the source document and updating the state. @@ -143,7 +143,7 @@ class FetchNode(BaseNode): Returns: dict: The updated state with the compressed document. """ - + compressed_document = [ source ] @@ -169,11 +169,11 @@ class FetchNode(BaseNode): - "xml": Reads the content of an XML file as a string. - "md": Reads the content of a Markdown file as a string. """ - + compressed_document = self.load_file_content(source, input_type) - + return self.update_state(state, compressed_document) - + def load_file_content(self, source, input_type): """ Loads the content of a file based on its input type. @@ -185,7 +185,7 @@ class FetchNode(BaseNode): Returns: list: A list containing a Document object with the loaded content and metadata. """ - + if input_type == "pdf": loader = PyPDFLoader(source) return loader.load() @@ -198,7 +198,7 @@ class FetchNode(BaseNode): with open(source, "r", encoding="utf-8") as f: data = f.read() return [Document(page_content=data, metadata={"source": input_type})] - + def handle_local_source(self, state, source): """ Handles the local source by fetching HTML content, optionally converting it to Markdown, @@ -214,11 +214,11 @@ class FetchNode(BaseNode): Raises: ValueError: If the source is empty or contains only whitespace. """ - + self.logger.info(f"--- (Fetching HTML from: {source}) ---") if not source.strip(): raise ValueError("No HTML body content found in the local source.") - + parsed_content = source if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator: @@ -229,13 +229,13 @@ class FetchNode(BaseNode): compressed_document = [ Document(page_content=parsed_content, metadata={"source": "local_dir"}) ] - + return self.update_state(state, compressed_document) - + def handle_web_source(self, state, source): """ - Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown, - and updating the state. + Handles the web source by fetching HTML content from a URL, + optionally converting it to Markdown, and updating the state. Parameters: state (dict): The current state of the graph. @@ -247,7 +247,7 @@ class FetchNode(BaseNode): Raises: ValueError: If the fetched HTML content is empty or contains only whitespace. """ - + self.logger.info(f"--- (Fetching HTML from: {source}) ---") if self.use_soup: response = requests.get(source)