diff --git a/.gitignore b/.gitignore index b8ab5703..4bd66401 100644 --- a/.gitignore +++ b/.gitignore @@ -32,5 +32,4 @@ examples/graph_examples/ScrapeGraphAI_generated_graph examples/**/result.csv examples/**/result.json main.py - - \ No newline at end of file +.idea \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 19c714e8..29d0b419 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "playwright==1.43.0", "google==3.0.0", "yahoo-search-py==0.3", + "undetected-playwright==0.3.0", ] license = "MIT" diff --git a/requirements.txt b/requirements.txt index 1e6224b4..2ccdf0d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ langchain-aws==0.1.2 langchain-anthropic==0.1.11 yahoo-search-py==0.3 pypdf==4.2.0 +undetected-playwright==0.3.0 \ No newline at end of file diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index 7d499245..d3581a7a 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -69,6 +69,7 @@ class ChromiumLoader(BaseLoader): """ from playwright.async_api import async_playwright + from undetected_playwright import Malenia logger.info("Starting scraping...") results = "" @@ -77,7 +78,9 @@ class ChromiumLoader(BaseLoader): headless=self.headless, proxy=self.proxy, **self.browser_config ) try: - page = await browser.new_page() + context = await browser.new_context() + await Malenia.apply_stealth(context) + page = await context.new_page() await page.goto(url) results = await page.content() # Simply get the HTML content logger.info("Content scraped")