fix: screenshot scraper

This commit is contained in:
Marco Vinciguerra 2024-08-30 13:02:13 +02:00
parent a0d21137b7
commit 388630c0ff
9 changed files with 46 additions and 39 deletions

View File

@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase.
pip install scrapegraphai[more-browser-options]
```
### Installing "Screenshot Scraper"
This group includes an OCR-based scraper that extracts text from website screenshots.
```bash
pip install scrapegraphai[screenshot_scraper]
```
## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).

Binary file not shown.

After

Width:  |  Height:  |  Size: 174 KiB

View File

@ -1,5 +1,10 @@
from scrapegraphai.utils.screenshot_scraping import take_screenshot, select_area_with_opencv, crop_image, detect_text
"""
example of scraping with screenshots
"""
import asyncio
from scrapegraphai.utils.screenshot_scraping import (take_screenshot,
select_area_with_opencv,
crop_image, detect_text)
# STEP 1: Take a screenshot
image = asyncio.run(take_screenshot(
@ -13,14 +18,15 @@ LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image)
print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM)
# STEP 3 (Optional): Crop the image.
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
# it will be set to the corresponding edge of the image.
cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM)
# STEP 4: Detect text
text = detect_text(
TEXT = detect_text(
cropped_image, # The image to detect text from
languages = ["en"] # The languages to detect text in
)
print("DETECTED TEXT: ")
print(text)
print(TEXT)

View File

@ -90,7 +90,7 @@ more-browser-options = [
# Group 4: Surya Library
screenshot_scraper = [
"surya-ocr>=0.4.5",
"surya-ocr>=0.5.0",
"matplotlib>=3.7.2",
"ipywidgets>=8.1.0"
]

View File

@ -176,10 +176,6 @@ idna==3.7
# via yarl
imagesize==1.4.1
# via sphinx
importlib-metadata==8.2.0
# via sphinx
importlib-resources==6.4.0
# via matplotlib
iniconfig==2.0.0
# via pytest
ipython==8.18.1
@ -525,16 +521,13 @@ typing-extensions==4.12.2
# via fastapi-pagination
# via google-generativeai
# via huggingface-hub
# via ipython
# via langchain-core
# via openai
# via pydantic
# via pydantic-core
# via pyee
# via pylint
# via sf-hamilton
# via sqlalchemy
# via starlette
# via streamlit
# via torch
# via typing-inspect
@ -560,6 +553,3 @@ widgetsnbextension==4.0.13
# via ipywidgets
yarl==1.9.4
# via aiohttp
zipp==3.20.0
# via importlib-metadata
# via importlib-resources

View File

@ -132,8 +132,6 @@ idna==3.7
# via httpx
# via requests
# via yarl
importlib-resources==6.4.4
# via matplotlib
ipython==8.18.1
# via ipywidgets
ipywidgets==8.1.5
@ -372,7 +370,6 @@ typing-extensions==4.12.2
# via anyio
# via google-generativeai
# via huggingface-hub
# via ipython
# via langchain-core
# via openai
# via pydantic
@ -399,5 +396,3 @@ widgetsnbextension==4.0.13
# via ipywidgets
yarl==1.9.4
# via aiohttp
zipp==3.20.1
# via importlib-resources

View File

@ -0,0 +1,2 @@
from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
from .text_detection import detect_text

View File

@ -1,9 +1,13 @@
"""
screenshot_preparation module
"""
import asyncio
from playwright.async_api import async_playwright
from io import BytesIO
from PIL import Image, ImageGrab
from playwright.async_api import async_playwright
import cv2 as cv
import numpy as np
from io import BytesIO
async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
"""
@ -20,23 +24,24 @@ async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
browser = await p.chromium.launch(headless=True)
page = await browser.new_page()
await page.goto(url)
image_bytes = await page.screenshot(path=save_path, type="jpeg", full_page=True, quality=quality)
image_bytes = await page.screenshot(path=save_path,
type="jpeg",
full_page=True,
quality=quality)
await browser.close()
return Image.open(BytesIO(image_bytes))
def select_area_with_opencv(image):
"""
Allows you to manually select an image area using OpenCV. It is recommended to use this function if your project is on your computer, otherwise use select_area_with_ipywidget().
Allows you to manually select an image area using OpenCV.
It is recommended to use this function if your project is on your computer,
otherwise use select_area_with_ipywidget().
Parameters:
image (PIL.Image): The image from which to select an area.
Returns:
A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area.
"""
import cv2 as cv
import numpy as np
fullscreen_screenshot = ImageGrab.grab()
dw, dh = fullscreen_screenshot.size
@ -100,7 +105,9 @@ def select_area_with_opencv(image):
def select_area_with_ipywidget(image):
"""
Allows you to manually select an image area using ipywidgets. It is recommended to use this function if your project is in Google Colab, Kaggle or other similar platform, otherwise use select_area_with_opencv().
Allows you to manually select an image area using ipywidgets.
It is recommended to use this function if your project is in Google Colab,
Kaggle or other similar platform, otherwise use select_area_with_opencv().
Parameters:
image (PIL Image): The input image.
Returns:
@ -183,13 +190,15 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path:
image (PIL.Image): The image to be cropped.
LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
RIGHT (int, optional): The x-coordinate of
the right edge of the crop area. Defaults to None.
BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
save_path (str, optional): The path to save the cropped image. Defaults to None.
Returns:
PIL.Image: The cropped image.
Notes:
If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
it will be set to the corresponding edge of the image.
If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
"""
@ -208,5 +217,3 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path:
croped_image.save(save_path, "JPEG")
return image.crop((LEFT, TOP, RIGHT, BOTTOM))

View File

@ -1,6 +1,9 @@
"""
text_detection module
"""
from surya.ocr import run_ocr
import numpy as np
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.detection.model import (load_model as load_det_model,
load_processor as load_det_processor)
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
@ -22,8 +25,5 @@ def detect_text(image, languages: list = ["en"]):
rec_model, rec_processor = load_rec_model(), load_rec_processor()
predictions = run_ocr([image], [langs], det_model,
det_processor, rec_model, rec_processor)
text = "\n".join([line.text for line in predictions[0].text_lines])
return text