mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-25 21:11:11 +08:00
feat(docloaders): undetected-playwright
This commit is contained in:
parent
ae9986a113
commit
7b3ee4e71e
3
.gitignore
vendored
3
.gitignore
vendored
@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
|
||||
examples/**/result.csv
|
||||
examples/**/result.json
|
||||
main.py
|
||||
|
||||
|
||||
.idea
|
||||
@ -30,6 +30,7 @@ dependencies = [
|
||||
"playwright==1.43.0",
|
||||
"google==3.0.0",
|
||||
"yahoo-search-py==0.3",
|
||||
+ "undetected-playwright==0.3.0",
|
||||
]
|
||||
|
||||
license = "MIT"
|
||||
|
||||
@ -19,3 +19,4 @@ langchain-aws==0.1.2
|
||||
langchain-anthropic==0.1.11
|
||||
yahoo-search-py==0.3
|
||||
pypdf==4.2.0
|
||||
+ undetected-playwright==0.3.0
|
||||
@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader):
|
||||
|
||||
"""
|
||||
from playwright.async_api import async_playwright
|
||||
+ from undetected_playwright import Malenia
|
||||
|
||||
logger.info("Starting scraping...")
|
||||
results = ""
|
||||
@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader):
|
||||
headless=self.headless, proxy=self.proxy, **self.browser_config
|
||||
)
|
||||
try:
|
||||
- page = await browser.new_page()
|
||||
+ context = await browser.new_context()
|
||||
+ await Malenia.apply_stealth(context)
|
||||
+ page = await context.new_page()
|
||||
await page.goto(url)
|
||||
results = await page.content() # Simply get the HTML content
|
||||
logger.info("Content scraped")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user