From 1d7f30b65b24b80113cd898c1cfbfd5de5f240b5 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra <mvincig11@gmail.com>
Date: Sat, 17 Aug 2024 12:21:00 +0200
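Subject: [PATCH] fix: browser-base integration

- browser_base_fetch: call Browserbase.load() once per URL with
  text_content=True instead of passing the URL list wrapped in a
  second list.
- AbstractGraph: add "gpt" to known_models and also accept a model whose
  prefix before the first "-" is a known entry, so a bare name such as
  "gpt-4o" passes the known-model check.
- SmartScraperGraph: include the "browser_base" config entry in the node
  config built just before ParseNode.
- examples: switch the Browserbase example from gpt-3.5-turbo to gpt-4o.
- fetch_node: whitespace cleanup and docstring re-wrap only.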
---
 examples/extras/browser_base_integration.py |  2 +-
 scrapegraphai/docloaders/browser_base.py    |  4 ++-
 scrapegraphai/graphs/abstract_graph.py      |  6 ++---
 scrapegraphai/graphs/smart_scraper_graph.py |  1 +
 scrapegraphai/nodes/fetch_node.py           | 30 ++++++++++-----------
 5 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/examples/extras/browser_base_integration.py b/examples/extras/browser_base_integration.py
index 97529879..4036cff6 100644
--- a/examples/extras/browser_base_integration.py
+++ b/examples/extras/browser_base_integration.py
@@ -18,7 +18,7 @@ load_dotenv()
 graph_config = {
     "llm": {
         "api_key": os.getenv("OPENAI_API_KEY"),
-        "model": "gpt-3.5-turbo",
+        "model": "gpt-4o",
     },
     "browser_base": {
         "api_key": os.getenv("BROWSER_BASE_API_KEY"),
diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py
index 77628bc5..9b60f36f 100644
--- a/scrapegraphai/docloaders/browser_base.py
+++ b/scrapegraphai/docloaders/browser_base.py
@@ -43,6 +43,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
 
     browserbase = Browserbase(api_key=api_key, project_id=project_id)
 
-    result = browserbase.load([link])
+    result = []
+    for l in link:
+        result.append(browserbase.load(l, text_content=True))
 
     return result
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
index 9cb39a0f..01ff0b0a 100644
--- a/scrapegraphai/graphs/abstract_graph.py
+++ b/scrapegraphai/graphs/abstract_graph.py
@@ -146,10 +146,10 @@ class AbstractGraph(ABC):
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 return init_chat_model(**llm_params)
-        
-        known_models = ["openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
 
-        if llm_params["model"].split("/")[0] not in known_models:
+        known_models = ["gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
+
+        if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models:
             raise ValueError(f"Model '{llm_params['model']}' is not supported")
 
         try:
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index aa83c23b..0167103e 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -67,6 +67,7 @@ class SmartScraperGraph(AbstractGraph):
                 "force": self.config.get("force", False),
                 "cut": self.config.get("cut", True),
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
+                "browser_base": self.config.get("browser_base")
             }
         )
         parse_node = ParseNode(
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index a0514f37..16672ccb 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -121,7 +121,7 @@ class FetchNode(BaseNode):
             "xml": self.handle_file,
             "md": self.handle_file,
         }
-        
+
         if input_type in handlers:
             return handlers[input_type](state, input_type, source)
         elif self.input == "pdf_dir":
@@ -130,7 +130,7 @@ class FetchNode(BaseNode):
             return self.handle_local_source(state, source)
         else:
             return self.handle_web_source(state, source)
-    
+
     def handle_directory(self, state, input_type, source):
         """
         Handles the directory by compressing the source document and updating the state.
@@ -143,7 +143,7 @@ class FetchNode(BaseNode):
         Returns:
         dict: The updated state with the compressed document.
         """
-        
+
         compressed_document = [
             source
         ]
@@ -169,11 +169,11 @@ class FetchNode(BaseNode):
         - "xml": Reads the content of an XML file as a string.
         - "md": Reads the content of a Markdown file as a string.
         """
-        
+
         compressed_document = self.load_file_content(source, input_type)
-        
+
         return self.update_state(state, compressed_document)
-        
+
     def load_file_content(self, source, input_type):
         """
         Loads the content of a file based on its input type.
@@ -185,7 +185,7 @@ class FetchNode(BaseNode):
         Returns:
         list: A list containing a Document object with the loaded content and metadata.
         """
-        
+
         if input_type == "pdf":
             loader = PyPDFLoader(source)
             return loader.load()
@@ -198,7 +198,7 @@ class FetchNode(BaseNode):
             with open(source, "r", encoding="utf-8") as f:
                 data = f.read()
             return [Document(page_content=data, metadata={"source": input_type})]
-    
+
     def handle_local_source(self, state, source):
         """
         Handles the local source by fetching HTML content, optionally converting it to Markdown,
@@ -214,11 +214,11 @@ class FetchNode(BaseNode):
         Raises:
         ValueError: If the source is empty or contains only whitespace.
         """
-    
+
         self.logger.info(f"--- (Fetching HTML from: {source}) ---")
         if not source.strip():
             raise ValueError("No HTML body content found in the local source.")
-        
+  
         parsed_content = source
 
         if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
@@ -229,13 +229,13 @@ class FetchNode(BaseNode):
         compressed_document = [
             Document(page_content=parsed_content, metadata={"source": "local_dir"})
         ]
-        
+
         return self.update_state(state, compressed_document)
-    
+
     def handle_web_source(self, state, source):
         """
-        Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown,
-        and updating the state.
+        Handles the web source by fetching HTML content from a URL,
+        optionally converting it to Markdown, and updating the state.
 
         Parameters:
         state (dict): The current state of the graph.
@@ -247,7 +247,7 @@ class FetchNode(BaseNode):
         Raises:
         ValueError: If the fetched HTML content is empty or contains only whitespace.
         """
-        
+
         self.logger.info(f"--- (Fetching HTML from: {source}) ---")
         if self.use_soup:
             response = requests.get(source)