mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-07-01 21:00:48 +08:00
fix: browser-base integration
This commit is contained in:
parent
a6fcc1ea58
commit
1d7f30b65b
@ -18,7 +18,7 @@ load_dotenv()
|
||||
graph_config = {
|
||||
"llm": {
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
"model": "gpt-3.5-turbo",
|
||||
"model": "gpt-4o",
|
||||
},
|
||||
"browser_base": {
|
||||
"api_key": os.getenv("BROWSER_BASE_API_KEY"),
|
||||
|
||||
@ -43,6 +43,8 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s
|
||||
|
||||
browserbase = Browserbase(api_key=api_key, project_id=project_id)
|
||||
|
||||
result = browserbase.load([link])
|
||||
result = []
|
||||
for l in link:
|
||||
result.append(browserbase.load(l, text_content=True))
|
||||
|
||||
return result
|
||||
|
||||
@ -146,10 +146,10 @@ class AbstractGraph(ABC):
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
return init_chat_model(**llm_params)
|
||||
|
||||
known_models = ["openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
|
||||
|
||||
if llm_params["model"].split("/")[0] not in known_models:
|
||||
known_models = ["gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
|
||||
|
||||
if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models:
|
||||
raise ValueError(f"Model '{llm_params['model']}' is not supported")
|
||||
|
||||
try:
|
||||
|
||||
@ -67,6 +67,7 @@ class SmartScraperGraph(AbstractGraph):
|
||||
"force": self.config.get("force", False),
|
||||
"cut": self.config.get("cut", True),
|
||||
"loader_kwargs": self.config.get("loader_kwargs", {}),
|
||||
"browser_base": self.config.get("browser_base")
|
||||
}
|
||||
)
|
||||
parse_node = ParseNode(
|
||||
|
||||
@ -121,7 +121,7 @@ class FetchNode(BaseNode):
|
||||
"xml": self.handle_file,
|
||||
"md": self.handle_file,
|
||||
}
|
||||
|
||||
|
||||
if input_type in handlers:
|
||||
return handlers[input_type](state, input_type, source)
|
||||
elif self.input == "pdf_dir":
|
||||
@ -130,7 +130,7 @@ class FetchNode(BaseNode):
|
||||
return self.handle_local_source(state, source)
|
||||
else:
|
||||
return self.handle_web_source(state, source)
|
||||
|
||||
|
||||
def handle_directory(self, state, input_type, source):
|
||||
"""
|
||||
Handles the directory by compressing the source document and updating the state.
|
||||
@ -143,7 +143,7 @@ class FetchNode(BaseNode):
|
||||
Returns:
|
||||
dict: The updated state with the compressed document.
|
||||
"""
|
||||
|
||||
|
||||
compressed_document = [
|
||||
source
|
||||
]
|
||||
@ -169,11 +169,11 @@ class FetchNode(BaseNode):
|
||||
- "xml": Reads the content of an XML file as a string.
|
||||
- "md": Reads the content of a Markdown file as a string.
|
||||
"""
|
||||
|
||||
|
||||
compressed_document = self.load_file_content(source, input_type)
|
||||
|
||||
|
||||
return self.update_state(state, compressed_document)
|
||||
|
||||
|
||||
def load_file_content(self, source, input_type):
|
||||
"""
|
||||
Loads the content of a file based on its input type.
|
||||
@ -185,7 +185,7 @@ class FetchNode(BaseNode):
|
||||
Returns:
|
||||
list: A list containing a Document object with the loaded content and metadata.
|
||||
"""
|
||||
|
||||
|
||||
if input_type == "pdf":
|
||||
loader = PyPDFLoader(source)
|
||||
return loader.load()
|
||||
@ -198,7 +198,7 @@ class FetchNode(BaseNode):
|
||||
with open(source, "r", encoding="utf-8") as f:
|
||||
data = f.read()
|
||||
return [Document(page_content=data, metadata={"source": input_type})]
|
||||
|
||||
|
||||
def handle_local_source(self, state, source):
|
||||
"""
|
||||
Handles the local source by fetching HTML content, optionally converting it to Markdown,
|
||||
@ -214,11 +214,11 @@ class FetchNode(BaseNode):
|
||||
Raises:
|
||||
ValueError: If the source is empty or contains only whitespace.
|
||||
"""
|
||||
|
||||
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
if not source.strip():
|
||||
raise ValueError("No HTML body content found in the local source.")
|
||||
|
||||
|
||||
parsed_content = source
|
||||
|
||||
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator:
|
||||
@ -229,13 +229,13 @@ class FetchNode(BaseNode):
|
||||
compressed_document = [
|
||||
Document(page_content=parsed_content, metadata={"source": "local_dir"})
|
||||
]
|
||||
|
||||
|
||||
return self.update_state(state, compressed_document)
|
||||
|
||||
|
||||
def handle_web_source(self, state, source):
|
||||
"""
|
||||
Handles the web source by fetching HTML content from a URL, optionally converting it to Markdown,
|
||||
and updating the state.
|
||||
Handles the web source by fetching HTML content from a URL,
|
||||
optionally converting it to Markdown, and updating the state.
|
||||
|
||||
Parameters:
|
||||
state (dict): The current state of the graph.
|
||||
@ -247,7 +247,7 @@ class FetchNode(BaseNode):
|
||||
Raises:
|
||||
ValueError: If the fetched HTML content is empty or contains only whitespace.
|
||||
"""
|
||||
|
||||
|
||||
self.logger.info(f"--- (Fetching HTML from: {source}) ---")
|
||||
if self.use_soup:
|
||||
response = requests.get(source)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user