feat(fetch): added playwright support

This commit is contained in:
EURAC\marperini 2024-04-30 04:02:58 +02:00
parent 450291f52e
commit 42ab0aa1d2
8 changed files with 28 additions and 8 deletions

View File

@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
```bash
pip install scrapegraphai
```
You will also need to install Playwright for JavaScript-based scraping:
```bash
playwright install
```
## 🔍 Demo
Official streamlit demo:

View File

@ -24,7 +24,8 @@ graph_config = {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
},
"headless": False
}
# ************************************************

View File

@ -39,6 +39,7 @@ google = "3.0.0"
minify-html = "0.15.0"
free-proxy = "1.1.1"
langchain-groq = "0.1.3"
playwright = "^1.43.0"
[tool.poetry.dev-dependencies]
pytest = "8.0.0"

View File

@ -13,3 +13,4 @@ google==3.0.0
minify-html==0.15.0
free-proxy==1.1.1
langchain-groq==0.1.3
playwright==1.43.0

View File

@ -29,6 +29,7 @@ class SearchGraph(AbstractGraph):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",

View File

@ -25,6 +25,7 @@ class SmartScraperGraph(AbstractGraph):
self.input_key = "url" if source.startswith("http") else "local_dir"
def _create_graph(self):
"""
Creates the graph of nodes representing the workflow for web scraping.
@ -32,6 +33,7 @@ class SmartScraperGraph(AbstractGraph):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",

View File

@ -35,6 +35,7 @@ class SpeechGraph(AbstractGraph):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",

View File

@ -2,8 +2,8 @@
Module for fetching the HTML node
"""
from typing import List
from langchain_community.document_loaders import AsyncHtmlLoader
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.remover import remover
@ -37,7 +37,7 @@ class FetchNode(BaseNode):
to succeed.
"""
def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
"""
Initializes the FetchHTMLNode with a node name and node type.
Arguments:
@ -46,6 +46,8 @@ class FetchNode(BaseNode):
"""
super().__init__(node_name, "node", input, output, 1)
self.headless = True if node_config is None else node_config.get("headless", True)
def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
@ -79,14 +81,21 @@ class FetchNode(BaseNode):
else:
if self.node_config is not None and self.node_config.get("endpoint") is not None:
loader = AsyncHtmlLoader(
source, proxies={"http": self.node_config["endpoint"]})
loader = AsyncChromiumLoader(
[source],
proxies={"http": self.node_config["endpoint"]},
headless=self.headless,
)
else:
loader = AsyncHtmlLoader(source)
loader = AsyncChromiumLoader(
[source],
headless=self.headless,
)
document = loader.load()
compressed_document = [
Document(page_content=remover(str(document)))]
Document(page_content=remover(str(document[0].page_content)))]
state.update({self.output[0]: compressed_document})
return state