From 7b3ee4e71e4af04edeb47999d70d398b67c93ac4 Mon Sep 17 00:00:00 2001 From: QIN2DIM <62018067+QIN2DIM@users.noreply.github.com> Date: Sun, 19 May 2024 18:01:03 +0800 Subject: [PATCH] feat(docloaders): undetected-playwright --- .gitignore | 3 +-- pyproject.toml | 1 + requirements.txt | 1 + scrapegraphai/docloaders/chromium.py | 5 ++++- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index b8ab5703..4bd66401 100644 --- a/.gitignore +++ b/.gitignore @@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph examples/**/result.csv examples/**/result.json main.py - - \ No newline at end of file +.idea \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 19c714e8..29d0b419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "playwright==1.43.0", "google==3.0.0", "yahoo-search-py==0.3", + "undetected-playwright==0.3.0", ] license = "MIT" diff --git a/requirements.txt b/requirements.txt index 1e6224b4..2ccdf0d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ langchain-aws==0.1.2 langchain-anthropic==0.1.11 yahoo-search-py==0.3 pypdf==4.2.0 +undetected-playwright==0.3.0 \ No newline at end of file diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 7d499245..d3581a7a 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader): """ from playwright.async_api import async_playwright + from undetected_playwright import Malenia logger.info("Starting scraping...") results = "" @@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader): headless=self.headless, proxy=self.proxy, **self.browser_config ) try: - page = await browser.new_page() + context = await browser.new_context() + await Malenia.apply_stealth(context) + page = await context.new_page() await page.goto(url) results = await page.content() # Simply get the HTML content logger.info("Content scraped")