diff --git a/README.md b/README.md index 2174bf47..d9058936 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase. pip install scrapegraphai[more-browser-options] ``` +### Installing "Screenshot Scraper" + +This group includes an OCR scraper for websites. +```bash +pip install scrapegraphai[screenshot_scraper] +``` + ## 💻 Usage There are multiple standard scraping pipelines that can be used to extract information from a website (or local file). diff --git a/examples/extras/Savedscreenshots/test_image.jpeg b/examples/extras/Savedscreenshots/test_image.jpeg new file mode 100644 index 00000000..159625bc Binary files /dev/null and b/examples/extras/Savedscreenshots/test_image.jpeg differ diff --git a/examples/extras/screenshot_scaping.py b/examples/extras/screenshot_scaping.py index f6548362..439c2a0c 100644 --- a/examples/extras/screenshot_scaping.py +++ b/examples/extras/screenshot_scaping.py @@ -1,5 +1,10 @@ -from scrapegraphai.utils.screenshot_scraping import take_screenshot, select_area_with_opencv, crop_image, detect_text +""" +example of scraping with screenshots +""" import asyncio +from scrapegraphai.utils.screenshot_scraping import (take_screenshot, + select_area_with_opencv, + crop_image, detect_text) # STEP 1: Take a screenshot image = asyncio.run(take_screenshot( @@ -13,14 +18,15 @@ LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image) print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM) # STEP 3 (Optional): Crop the image. -# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image. +# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, +# it will be set to the corresponding edge of the image. 
cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM) # STEP 4: Detect text -text = detect_text( +TEXT = detect_text( cropped_image, # The image to detect text from languages = ["en"] # The languages to detect text in ) print("DETECTED TEXT: ") -print(text) \ No newline at end of file +print(TEXT) diff --git a/pyproject.toml b/pyproject.toml index fb7c683b..ad1c4f58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,7 +90,7 @@ more-browser-options = [ # Group 4: Surya Library screenshot_scraper = [ - "surya-ocr>=0.4.5", + "surya-ocr>=0.5.0", "matplotlib>=3.7.2", "ipywidgets>=8.1.0" ] diff --git a/requirements-dev.lock b/requirements-dev.lock index b13918c1..97e21b92 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -176,10 +176,6 @@ idna==3.7 # via yarl imagesize==1.4.1 # via sphinx -importlib-metadata==8.2.0 - # via sphinx -importlib-resources==6.4.0 - # via matplotlib iniconfig==2.0.0 # via pytest ipython==8.18.1 @@ -525,16 +521,13 @@ typing-extensions==4.12.2 # via fastapi-pagination # via google-generativeai # via huggingface-hub - # via ipython # via langchain-core # via openai # via pydantic # via pydantic-core # via pyee - # via pylint # via sf-hamilton # via sqlalchemy - # via starlette # via streamlit # via torch # via typing-inspect @@ -560,6 +553,3 @@ widgetsnbextension==4.0.13 # via ipywidgets yarl==1.9.4 # via aiohttp -zipp==3.20.0 - # via importlib-metadata - # via importlib-resources diff --git a/requirements.lock b/requirements.lock index aa08d042..5d3d7cf2 100644 --- a/requirements.lock +++ b/requirements.lock @@ -132,8 +132,6 @@ idna==3.7 # via httpx # via requests # via yarl -importlib-resources==6.4.4 - # via matplotlib ipython==8.18.1 # via ipywidgets ipywidgets==8.1.5 @@ -372,7 +370,6 @@ typing-extensions==4.12.2 # via anyio # via google-generativeai # via huggingface-hub - # via ipython # via langchain-core # via openai # via pydantic @@ -399,5 +396,3 @@ widgetsnbextension==4.0.13 # via 
ipywidgets yarl==1.9.4 # via aiohttp -zipp==3.20.1 - # via importlib-resources diff --git a/scrapegraphai/utils/screenshot_scraping/__init__.py b/scrapegraphai/utils/screenshot_scraping/__init__.py new file mode 100644 index 00000000..20cfb3c0 --- /dev/null +++ b/scrapegraphai/utils/screenshot_scraping/__init__.py @@ -0,0 +1,2 @@ +from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image +from .text_detection import detect_text diff --git a/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py index 44b2c786..6205449c 100644 --- a/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py +++ b/scrapegraphai/utils/screenshot_scraping/screenshot_preparation.py @@ -1,9 +1,13 @@ +""" +screenshot_preparation module +""" import asyncio -from playwright.async_api import async_playwright - from io import BytesIO from PIL import Image, ImageGrab - +from playwright.async_api import async_playwright +import cv2 as cv +import numpy as np +from io import BytesIO async def take_screenshot(url: str, save_path: str = None, quality: int = 100): """ @@ -20,23 +24,24 @@ async def take_screenshot(url: str, save_path: str = None, quality: int = 100): browser = await p.chromium.launch(headless=True) page = await browser.new_page() await page.goto(url) - image_bytes = await page.screenshot(path=save_path, type="jpeg", full_page=True, quality=quality) + image_bytes = await page.screenshot(path=save_path, + type="jpeg", + full_page=True, + quality=quality) await browser.close() return Image.open(BytesIO(image_bytes)) - def select_area_with_opencv(image): """ - Allows you to manually select an image area using OpenCV. It is recommended to use this function if your project is on your computer, otherwise use select_area_with_ipywidget(). + Allows you to manually select an image area using OpenCV. 
+ It is recommended to use this function if your project is on your computer, + otherwise use select_area_with_ipywidget(). Parameters: image (PIL.Image): The image from which to select an area. Returns: A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area. """ - import cv2 as cv - import numpy as np - fullscreen_screenshot = ImageGrab.grab() dw, dh = fullscreen_screenshot.size @@ -100,7 +105,9 @@ def select_area_with_opencv(image): def select_area_with_ipywidget(image): """ - Allows you to manually select an image area using ipywidgets. It is recommended to use this function if your project is in Google Colab, Kaggle or other similar platform, otherwise use select_area_with_opencv(). + Allows you to manually select an image area using ipywidgets. + It is recommended to use this function if your project is in Google Colab, + Kaggle or other similar platform, otherwise use select_area_with_opencv(). Parameters: image (PIL Image): The input image. Returns: @@ -183,13 +190,15 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path: image (PIL.Image): The image to be cropped. LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None. TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None. - RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None. + RIGHT (int, optional): The x-coordinate of + the right edge of the crop area. Defaults to None. BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None. save_path (str, optional): The path to save the cropped image. Defaults to None. Returns: PIL.Image: The cropped image. Notes: - If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image. + If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, + it will be set to the corresponding edge of the image. 
If save_path is specified, the cropped image will be saved as a JPEG file at the specified path. """ @@ -208,5 +217,3 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path: croped_image.save(save_path, "JPEG") return image.crop((LEFT, TOP, RIGHT, BOTTOM)) - - diff --git a/scrapegraphai/utils/screenshot_scraping/text_detection.py b/scrapegraphai/utils/screenshot_scraping/text_detection.py index 92b96cf6..f883a907 100644 --- a/scrapegraphai/utils/screenshot_scraping/text_detection.py +++ b/scrapegraphai/utils/screenshot_scraping/text_detection.py @@ -1,6 +1,9 @@ +""" +text_detection_module +""" from surya.ocr import run_ocr -import numpy as np -from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor +from surya.model.detection.model import (load_model as load_det_model, + load_processor as load_det_processor) from surya.model.recognition.model import load_model as load_rec_model from surya.model.recognition.processor import load_processor as load_rec_processor @@ -22,8 +25,5 @@ def detect_text(image, languages: list = ["en"]): rec_model, rec_processor = load_rec_model(), load_rec_processor() predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor) - text = "\n".join([line.text for line in predictions[0].text_lines]) return text - -