Mirror of https://github.com/VinciGit00/Scrapegraph-ai.git (synced 2026-06-25 21:11:11 +08:00)
feat(docloaders): undetected-playwright
This commit is contained in:
parent ae9986a113
commit 7b3ee4e71e
3
.gitignore
vendored
3
.gitignore
vendored
@@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
|
|||||||
examples/**/result.csv
|
examples/**/result.csv
|
||||||
examples/**/result.json
|
examples/**/result.json
|
||||||
main.py
|
main.py
|
||||||
|
.idea
|
||||||
|
|
||||||
@@ -30,6 +30,7 @@ dependencies = [
|
|||||||
"playwright==1.43.0",
|
"playwright==1.43.0",
|
||||||
"google==3.0.0",
|
"google==3.0.0",
|
||||||
"yahoo-search-py==0.3",
|
"yahoo-search-py==0.3",
|
||||||
|
"undetected-playwright==0.3.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|||||||
@@ -19,3 +19,4 @@ langchain-aws==0.1.2
|
|||||||
langchain-anthropic==0.1.11
|
langchain-anthropic==0.1.11
|
||||||
yahoo-search-py==0.3
|
yahoo-search-py==0.3
|
||||||
pypdf==4.2.0
|
pypdf==4.2.0
|
||||||
|
undetected-playwright==0.3.0
|
||||||
@@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
from undetected_playwright import Malenia
|
||||||
|
|
||||||
logger.info("Starting scraping...")
|
logger.info("Starting scraping...")
|
||||||
results = ""
|
results = ""
|
||||||
@@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader):
|
|||||||
headless=self.headless, proxy=self.proxy, **self.browser_config
|
headless=self.headless, proxy=self.proxy, **self.browser_config
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
context = await browser.new_context()
|
||||||
|
await Malenia.apply_stealth(context)
|
||||||
|
page = await context.new_page()
|
||||||
await page.goto(url)
|
await page.goto(url)
|
||||||
results = await page.content() # Simply get the HTML content
|
results = await page.content() # Simply get the HTML content
|
||||||
logger.info("Content scraped")
|
logger.info("Content scraped")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user