feat(docloaders): undetected-playwright

This commit is contained in:
QIN2DIM 2024-05-19 18:01:03 +08:00
parent ae9986a113
commit 7b3ee4e71e
4 changed files with 7 additions and 3 deletions

3
.gitignore vendored
View File

@@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
examples/**/result.csv
examples/**/result.json
main.py
.idea

View File

@@ -30,6 +30,7 @@ dependencies = [
"playwright==1.43.0",
"google==3.0.0",
"yahoo-search-py==0.3",
"undetected-playwright==0.3.0",
]
license = "MIT"

View File

@@ -19,3 +19,4 @@ langchain-aws==0.1.2
langchain-anthropic==0.1.11
yahoo-search-py==0.3
pypdf==4.2.0
undetected-playwright==0.3.0

View File

@@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader):
"""
from playwright.async_api import async_playwright
from undetected_playwright import Malenia
logger.info("Starting scraping...")
results = ""
@@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader):
headless=self.headless, proxy=self.proxy, **self.browser_config
)
try:
page = await browser.new_page()
context = await browser.new_context()
await Malenia.apply_stealth(context)
page = await context.new_page()
await page.goto(url)
results = await page.content() # Simply get the HTML content
logger.info("Content scraped")