fix: screenshot scraper

2026-06-28 21:01:55 +08:00 · 2024-08-30 13:02:13 +02:00 · 2024-08-30 13:02:13 +02:00 · 388630c0ff
commit 388630c0ff
parent a0d21137b7
9 changed files with 46 additions and 39 deletions
--- a/README.md
+++ b/README.md
@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase.
 pip install scrapegraphai[more-browser-options]
 ```

+### Installing "Screenshot Scraper"
+
+This group includes an OCR-based screenshot scraper for extracting text from website screenshots.
+```bash
+pip install scrapegraphai[screenshot_scraper]
+```
+
 ## 💻 Usage
 There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

--- a/examples/extras/Savedscreenshots/test_image.jpeg
+++ b/examples/extras/Savedscreenshots/test_image.jpeg
--- a/examples/extras/screenshot_scaping.py
+++ b/examples/extras/screenshot_scaping.py
@ -1,5 +1,10 @@
-from scrapegraphai.utils.screenshot_scraping import take_screenshot, select_area_with_opencv, crop_image, detect_text
+"""
+example of scraping with screenshots
+"""
 import asyncio
+from scrapegraphai.utils.screenshot_scraping import (take_screenshot,
+                                                     select_area_with_opencv,
+                                                     crop_image, detect_text)

 # STEP 1: Take a screenshot
 image = asyncio.run(take_screenshot(
@ -13,14 +18,15 @@ LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image)
 print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM)

 # STEP 3 (Optional): Crop the image.
-# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
+# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, 
+# it will be set to the corresponding edge of the image.
 cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM)

 # STEP 4: Detect text
-text = detect_text(
+TEXT = detect_text(
    cropped_image,          # The image to detect text from
    languages = ["en"]       # The languages to detect text in
 )

 print("DETECTED TEXT: ")
-print(text)
+print(TEXT)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -90,7 +90,7 @@ more-browser-options = [

 # Group 4: Surya Library
 screenshot_scraper = [
-    "surya-ocr>=0.4.5",
+    "surya-ocr>=0.5.0",
    "matplotlib>=3.7.2",
    "ipywidgets>=8.1.0"
 ]
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@ -176,10 +176,6 @@ idna==3.7
    # via yarl
 imagesize==1.4.1
    # via sphinx
-importlib-metadata==8.2.0
-    # via sphinx
-importlib-resources==6.4.0
-    # via matplotlib
 iniconfig==2.0.0
    # via pytest
 ipython==8.18.1
@ -525,16 +521,13 @@ typing-extensions==4.12.2
    # via fastapi-pagination
    # via google-generativeai
    # via huggingface-hub
-    # via ipython
    # via langchain-core
    # via openai
    # via pydantic
    # via pydantic-core
    # via pyee
-    # via pylint
    # via sf-hamilton
    # via sqlalchemy
-    # via starlette
    # via streamlit
    # via torch
    # via typing-inspect
@ -560,6 +553,3 @@ widgetsnbextension==4.0.13
    # via ipywidgets
 yarl==1.9.4
    # via aiohttp
-zipp==3.20.0
-    # via importlib-metadata
-    # via importlib-resources
--- a/requirements.lock
+++ b/requirements.lock
@ -132,8 +132,6 @@ idna==3.7
    # via httpx
    # via requests
    # via yarl
-importlib-resources==6.4.4
-    # via matplotlib
 ipython==8.18.1
    # via ipywidgets
 ipywidgets==8.1.5
@ -372,7 +370,6 @@ typing-extensions==4.12.2
    # via anyio
    # via google-generativeai
    # via huggingface-hub
-    # via ipython
    # via langchain-core
    # via openai
    # via pydantic
@ -399,5 +396,3 @@ widgetsnbextension==4.0.13
    # via ipywidgets
 yarl==1.9.4
    # via aiohttp
-zipp==3.20.1
-    # via importlib-resources
--- a/scrapegraphai/utils/screenshot_scraping/__init__.py
+++ b/scrapegraphai/utils/screenshot_scraping/__init__.py
@ -0,0 +1,2 @@
+from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
+from .text_detection import detect_text
--- a/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py
+++ b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py
@ -1,9 +1,13 @@
+"""
+screenshot_preparation module
+"""
 import asyncio
-from playwright.async_api import async_playwright
-
 from io import BytesIO
 from PIL import Image, ImageGrab
-
+from playwright.async_api import async_playwright
+import cv2 as cv
+import numpy as np
+from io import BytesIO

 async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
    """
@ -20,23 +24,24 @@ async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
-        image_bytes = await page.screenshot(path=save_path, type="jpeg", full_page=True, quality=quality)
+        image_bytes = await page.screenshot(path=save_path, 
+                                            type="jpeg", 
+                                            full_page=True, 
+                                            quality=quality)
        await browser.close()
        return Image.open(BytesIO(image_bytes))

-
 def select_area_with_opencv(image):
    """
-    Allows you to manually select an image area using OpenCV. It is recommended to use this function if your project is on your computer, otherwise use select_area_with_ipywidget().
+    Allows you to manually select an image area using OpenCV.
+    It is recommended to use this function if your project is on your computer,
+    otherwise use select_area_with_ipywidget().
    Parameters:
        image (PIL.Image): The image from which to select an area.
    Returns:
        A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area.
    """

-    import cv2 as cv
-    import numpy as np
-
    fullscreen_screenshot = ImageGrab.grab()
    dw, dh = fullscreen_screenshot.size

@ -100,7 +105,9 @@ def select_area_with_opencv(image):

 def select_area_with_ipywidget(image):
    """
-    Allows you to manually select an image area using ipywidgets. It is recommended to use this function if your project is in Google Colab, Kaggle or other similar platform, otherwise use  select_area_with_opencv().
+    Allows you to manually select an image area using ipywidgets. 
+    It is recommended to use this function if your project is in Google Colab, 
+    Kaggle or other similar platform, otherwise use  select_area_with_opencv().
    Parameters:
        image (PIL Image): The input image.
    Returns:
@ -183,13 +190,15 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None,  save_path:
        image (PIL.Image): The image to be cropped.
        LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
        TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
-        RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
+        RIGHT (int, optional): The x-coordinate of 
+        the right edge of the crop area. Defaults to None.
        BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
        save_path (str, optional): The path to save the cropped image. Defaults to None.
    Returns:
        PIL.Image: The cropped image.
    Notes:
-        If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
+        If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, 
+        it will be set to the corresponding edge of the image.
        If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
    """

@ -208,5 +217,3 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None,  save_path:
        croped_image.save(save_path, "JPEG")

    return image.crop((LEFT, TOP, RIGHT, BOTTOM))
-
-
--- a/scrapegraphai/utils/screenshot_scraping/text_detection.py
+++ b/scrapegraphai/utils/screenshot_scraping/text_detection.py
@ -1,6 +1,9 @@
+"""
+text_detection_module
+"""
 from surya.ocr import run_ocr
-import numpy as np
-from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
+from surya.model.detection.model import (load_model as load_det_model,
+                                         load_processor as load_det_processor)
 from surya.model.recognition.model import load_model as load_rec_model
 from surya.model.recognition.processor import load_processor as load_rec_processor

@ -22,8 +25,5 @@ def detect_text(image, languages: list = ["en"]):
    rec_model, rec_processor = load_rec_model(), load_rec_processor()
    predictions = run_ocr([image], [langs], det_model,
                          det_processor, rec_model, rec_processor)
-    
    text = "\n".join([line.text for line in predictions[0].text_lines])
    return text
-
-