mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-23 21:00:30 +08:00
feat(fetch): added playwright support
This commit is contained in:
parent
450291f52e
commit
42ab0aa1d2
@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
|
|||||||
```bash
|
```bash
|
||||||
pip install scrapegraphai
|
pip install scrapegraphai
|
||||||
```
|
```
|
||||||
|
You will also need to install Playwright for JavaScript-based scraping:
|
||||||
|
```bash
|
||||||
|
playwright install
|
||||||
|
```
|
||||||
## 🔍 Demo
|
## 🔍 Demo
|
||||||
Official streamlit demo:
|
Official streamlit demo:
|
||||||
|
|
||||||
|
|||||||
@ -24,7 +24,8 @@ graph_config = {
|
|||||||
"model": "ollama/nomic-embed-text",
|
"model": "ollama/nomic-embed-text",
|
||||||
"temperature": 0,
|
"temperature": 0,
|
||||||
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
|
||||||
}
|
},
|
||||||
|
"headless": False
|
||||||
}
|
}
|
||||||
|
|
||||||
# ************************************************
|
# ************************************************
|
||||||
|
|||||||
@ -39,6 +39,7 @@ google = "3.0.0"
|
|||||||
minify-html = "0.15.0"
|
minify-html = "0.15.0"
|
||||||
free-proxy = "1.1.1"
|
free-proxy = "1.1.1"
|
||||||
langchain-groq = "0.1.3"
|
langchain-groq = "0.1.3"
|
||||||
|
playwright = "^1.43.0"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
pytest = "8.0.0"
|
pytest = "8.0.0"
|
||||||
|
|||||||
@ -13,3 +13,4 @@ google==3.0.0
|
|||||||
minify-html==0.15.0
|
minify-html==0.15.0
|
||||||
free-proxy==1.1.1
|
free-proxy==1.1.1
|
||||||
langchain-groq==0.1.3
|
langchain-groq==0.1.3
|
||||||
|
playwright==1.43.0
|
||||||
@ -29,6 +29,7 @@ class SearchGraph(AbstractGraph):
|
|||||||
fetch_node = FetchNode(
|
fetch_node = FetchNode(
|
||||||
input="url | local_dir",
|
input="url | local_dir",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
|
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
input="doc",
|
input="doc",
|
||||||
|
|||||||
@ -25,6 +25,7 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
|
|
||||||
self.input_key = "url" if source.startswith("http") else "local_dir"
|
self.input_key = "url" if source.startswith("http") else "local_dir"
|
||||||
|
|
||||||
|
|
||||||
def _create_graph(self):
|
def _create_graph(self):
|
||||||
"""
|
"""
|
||||||
Creates the graph of nodes representing the workflow for web scraping.
|
Creates the graph of nodes representing the workflow for web scraping.
|
||||||
@ -32,6 +33,7 @@ class SmartScraperGraph(AbstractGraph):
|
|||||||
fetch_node = FetchNode(
|
fetch_node = FetchNode(
|
||||||
input="url | local_dir",
|
input="url | local_dir",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
|
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
input="doc",
|
input="doc",
|
||||||
|
|||||||
@ -35,6 +35,7 @@ class SpeechGraph(AbstractGraph):
|
|||||||
fetch_node = FetchNode(
|
fetch_node = FetchNode(
|
||||||
input="url | local_dir",
|
input="url | local_dir",
|
||||||
output=["doc"],
|
output=["doc"],
|
||||||
|
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
|
||||||
)
|
)
|
||||||
parse_node = ParseNode(
|
parse_node = ParseNode(
|
||||||
input="doc",
|
input="doc",
|
||||||
|
|||||||
@ -2,8 +2,8 @@
|
|||||||
Module for fetching the HTML node
|
Module for fetching the HTML node
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
from langchain_community.document_loaders import AsyncHtmlLoader
|
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from .base_node import BaseNode
|
from .base_node import BaseNode
|
||||||
from ..utils.remover import remover
|
from ..utils.remover import remover
|
||||||
@ -37,7 +37,7 @@ class FetchNode(BaseNode):
|
|||||||
to succeed.
|
to succeed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
|
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
|
||||||
"""
|
"""
|
||||||
Initializes the FetchHTMLNode with a node name and node type.
|
Initializes the FetchHTMLNode with a node name and node type.
|
||||||
Arguments:
|
Arguments:
|
||||||
@ -46,6 +46,8 @@ class FetchNode(BaseNode):
|
|||||||
"""
|
"""
|
||||||
super().__init__(node_name, "node", input, output, 1)
|
super().__init__(node_name, "node", input, output, 1)
|
||||||
|
|
||||||
|
self.headless = True if node_config is None else node_config.get("headless", True)
|
||||||
|
|
||||||
def execute(self, state):
|
def execute(self, state):
|
||||||
"""
|
"""
|
||||||
Executes the node's logic to fetch HTML content from a specified URL and
|
Executes the node's logic to fetch HTML content from a specified URL and
|
||||||
@ -79,14 +81,21 @@ class FetchNode(BaseNode):
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
if self.node_config is not None and self.node_config.get("endpoint") is not None:
|
||||||
loader = AsyncHtmlLoader(
|
|
||||||
source, proxies={"http": self.node_config["endpoint"]})
|
loader = AsyncChromiumLoader(
|
||||||
|
[source],
|
||||||
|
proxies={"http": self.node_config["endpoint"]},
|
||||||
|
headless=self.headless,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
loader = AsyncHtmlLoader(source)
|
loader = AsyncChromiumLoader(
|
||||||
|
[source],
|
||||||
|
headless=self.headless,
|
||||||
|
)
|
||||||
|
|
||||||
document = loader.load()
|
document = loader.load()
|
||||||
compressed_document = [
|
compressed_document = [
|
||||||
Document(page_content=remover(str(document)))]
|
Document(page_content=remover(str(document[0].page_content)))]
|
||||||
|
|
||||||
state.update({self.output[0]: compressed_document})
|
state.update({self.output[0]: compressed_document})
|
||||||
return state
|
return state
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user