feat(docloaders): apply undetected-playwright stealth in ChromiumLoader

This commit is contained in:
QIN2DIM 2024-05-19 18:01:03 +08:00
parent ae9986a113
commit 7b3ee4e71e
4 changed files with 7 additions and 3 deletions

3
.gitignore vendored
View File

@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
examples/**/result.csv examples/**/result.csv
examples/**/result.json examples/**/result.json
main.py main.py
.idea

View File

@ -30,6 +30,7 @@ dependencies = [
"playwright==1.43.0", "playwright==1.43.0",
"google==3.0.0", "google==3.0.0",
"yahoo-search-py==0.3", "yahoo-search-py==0.3",
"undetected-playwright==0.3.0",
] ]
license = "MIT" license = "MIT"

View File

@ -19,3 +19,4 @@ langchain-aws==0.1.2
langchain-anthropic==0.1.11 langchain-anthropic==0.1.11
yahoo-search-py==0.3 yahoo-search-py==0.3
pypdf==4.2.0 pypdf==4.2.0
undetected-playwright==0.3.0

View File

@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader):
""" """
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from undetected_playwright import Malenia
logger.info("Starting scraping...") logger.info("Starting scraping...")
results = "" results = ""
@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader):
headless=self.headless, proxy=self.proxy, **self.browser_config headless=self.headless, proxy=self.proxy, **self.browser_config
) )
try: try:
page = await browser.new_page() context = await browser.new_context()
await Malenia.apply_stealth(context)
page = await context.new_page()
await page.goto(url) await page.goto(url)
results = await page.content() # Simply get the HTML content results = await page.content() # Simply get the HTML content
logger.info("Content scraped") logger.info("Content scraped")