mirror of
https://github.com/VinciGit00/Scrapegraph-ai.git
synced 2026-06-28 21:01:55 +08:00
fix: screenshot scraper
This commit is contained in:
parent
a0d21137b7
commit
388630c0ff
@ -54,6 +54,13 @@ This group includes additional browser management options, such as BrowserBase.
|
||||
pip install scrapegraphai[more-browser-options]
|
||||
```
|
||||
|
||||
### Installing "Screenshot Scraper"
|
||||
|
||||
This group includes an OCR-based scraper that extracts text from screenshots of websites.
|
||||
```bash
|
||||
pip install scrapegraphai[screenshot_scraper]
|
||||
```
|
||||
|
||||
## 💻 Usage
|
||||
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
|
||||
|
||||
|
||||
BIN
examples/extras/Savedscreenshots/test_image.jpeg
Normal file
BIN
examples/extras/Savedscreenshots/test_image.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 174 KiB |
@ -1,5 +1,10 @@
|
||||
from scrapegraphai.utils.screenshot_scraping import take_screenshot, select_area_with_opencv, crop_image, detect_text
|
||||
"""
|
||||
example of scraping with screenshots
|
||||
"""
|
||||
import asyncio
|
||||
from scrapegraphai.utils.screenshot_scraping import (take_screenshot,
|
||||
select_area_with_opencv,
|
||||
crop_image, detect_text)
|
||||
|
||||
# STEP 1: Take a screenshot
|
||||
image = asyncio.run(take_screenshot(
|
||||
@ -13,14 +18,15 @@ LEFT, TOP, RIGHT, BOTTOM = select_area_with_opencv(image)
|
||||
print("LEFT: ", LEFT, " TOP: ", TOP, " RIGHT: ", RIGHT, " BOTTOM: ", BOTTOM)
|
||||
|
||||
# STEP 3 (Optional): Crop the image.
|
||||
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
|
||||
# Note: If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
|
||||
# it will be set to the corresponding edge of the image.
|
||||
cropped_image = crop_image(image, LEFT=LEFT, RIGHT=RIGHT,TOP=TOP,BOTTOM=BOTTOM)
|
||||
|
||||
# STEP 4: Detect text
|
||||
text = detect_text(
|
||||
TEXT = detect_text(
|
||||
cropped_image, # The image to detect text from
|
||||
languages = ["en"] # The languages to detect text in
|
||||
)
|
||||
|
||||
print("DETECTED TEXT: ")
|
||||
print(text)
|
||||
print(TEXT)
|
||||
|
||||
@ -90,7 +90,7 @@ more-browser-options = [
|
||||
|
||||
# Group 4: Surya Library
|
||||
screenshot_scraper = [
|
||||
"surya-ocr>=0.4.5",
|
||||
"surya-ocr>=0.5.0",
|
||||
"matplotlib>=3.7.2",
|
||||
"ipywidgets>=8.1.0"
|
||||
]
|
||||
|
||||
@ -176,10 +176,6 @@ idna==3.7
|
||||
# via yarl
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==8.2.0
|
||||
# via sphinx
|
||||
importlib-resources==6.4.0
|
||||
# via matplotlib
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
ipython==8.18.1
|
||||
@ -525,16 +521,13 @@ typing-extensions==4.12.2
|
||||
# via fastapi-pagination
|
||||
# via google-generativeai
|
||||
# via huggingface-hub
|
||||
# via ipython
|
||||
# via langchain-core
|
||||
# via openai
|
||||
# via pydantic
|
||||
# via pydantic-core
|
||||
# via pyee
|
||||
# via pylint
|
||||
# via sf-hamilton
|
||||
# via sqlalchemy
|
||||
# via starlette
|
||||
# via streamlit
|
||||
# via torch
|
||||
# via typing-inspect
|
||||
@ -560,6 +553,3 @@ widgetsnbextension==4.0.13
|
||||
# via ipywidgets
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
zipp==3.20.0
|
||||
# via importlib-metadata
|
||||
# via importlib-resources
|
||||
|
||||
@ -132,8 +132,6 @@ idna==3.7
|
||||
# via httpx
|
||||
# via requests
|
||||
# via yarl
|
||||
importlib-resources==6.4.4
|
||||
# via matplotlib
|
||||
ipython==8.18.1
|
||||
# via ipywidgets
|
||||
ipywidgets==8.1.5
|
||||
@ -372,7 +370,6 @@ typing-extensions==4.12.2
|
||||
# via anyio
|
||||
# via google-generativeai
|
||||
# via huggingface-hub
|
||||
# via ipython
|
||||
# via langchain-core
|
||||
# via openai
|
||||
# via pydantic
|
||||
@ -399,5 +396,3 @@ widgetsnbextension==4.0.13
|
||||
# via ipywidgets
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
zipp==3.20.1
|
||||
# via importlib-resources
|
||||
|
||||
2
scrapegraphai/utils/screenshot_scraping/__init__.py
Normal file
2
scrapegraphai/utils/screenshot_scraping/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
from .screenshot_preparation import take_screenshot, select_area_with_opencv, select_area_with_ipywidget, crop_image
|
||||
from .text_detection import detect_text
|
||||
@ -1,9 +1,13 @@
|
||||
"""
|
||||
screenshot_preparation module
|
||||
"""
|
||||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from io import BytesIO
|
||||
from PIL import Image, ImageGrab
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
from io import BytesIO
|
||||
|
||||
async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
|
||||
"""
|
||||
@ -20,23 +24,24 @@ async def take_screenshot(url: str, save_path: str = None, quality: int = 100):
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
page = await browser.new_page()
|
||||
await page.goto(url)
|
||||
image_bytes = await page.screenshot(path=save_path, type="jpeg", full_page=True, quality=quality)
|
||||
image_bytes = await page.screenshot(path=save_path,
|
||||
type="jpeg",
|
||||
full_page=True,
|
||||
quality=quality)
|
||||
await browser.close()
|
||||
return Image.open(BytesIO(image_bytes))
|
||||
|
||||
|
||||
def select_area_with_opencv(image):
|
||||
"""
|
||||
Allows you to manually select an image area using OpenCV. It is recommended to use this function if your project is on your computer, otherwise use select_area_with_ipywidget().
|
||||
Allows you to manually select an image area using OpenCV.
|
||||
It is recommended to use this function if your project is on your computer,
|
||||
otherwise use select_area_with_ipywidget().
|
||||
Parameters:
|
||||
image (PIL.Image): The image from which to select an area.
|
||||
Returns:
|
||||
A tuple containing the LEFT, TOP, RIGHT, and BOTTOM coordinates of the selected area.
|
||||
"""
|
||||
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
|
||||
fullscreen_screenshot = ImageGrab.grab()
|
||||
dw, dh = fullscreen_screenshot.size
|
||||
|
||||
@ -100,7 +105,9 @@ def select_area_with_opencv(image):
|
||||
|
||||
def select_area_with_ipywidget(image):
|
||||
"""
|
||||
Allows you to manually select an image area using ipywidgets. It is recommended to use this function if your project is in Google Colab, Kaggle or other similar platform, otherwise use select_area_with_opencv().
|
||||
Allows you to manually select an image area using ipywidgets.
|
||||
It is recommended to use this function if your project is in Google Colab,
|
||||
Kaggle or other similar platform, otherwise use select_area_with_opencv().
|
||||
Parameters:
|
||||
image (PIL Image): The input image.
|
||||
Returns:
|
||||
@ -183,13 +190,15 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path:
|
||||
image (PIL.Image): The image to be cropped.
|
||||
LEFT (int, optional): The x-coordinate of the left edge of the crop area. Defaults to None.
|
||||
TOP (int, optional): The y-coordinate of the top edge of the crop area. Defaults to None.
|
||||
RIGHT (int, optional): The x-coordinate of the right edge of the crop area. Defaults to None.
|
||||
RIGHT (int, optional): The x-coordinate of
|
||||
the right edge of the crop area. Defaults to None.
|
||||
BOTTOM (int, optional): The y-coordinate of the bottom edge of the crop area. Defaults to None.
|
||||
save_path (str, optional): The path to save the cropped image. Defaults to None.
|
||||
Returns:
|
||||
PIL.Image: The cropped image.
|
||||
Notes:
|
||||
If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None, it will be set to the corresponding edge of the image.
|
||||
If any of the coordinates (LEFT, TOP, RIGHT, BOTTOM) is None,
|
||||
it will be set to the corresponding edge of the image.
|
||||
If save_path is specified, the cropped image will be saved as a JPEG file at the specified path.
|
||||
"""
|
||||
|
||||
@ -208,5 +217,3 @@ def crop_image(image, LEFT=None, TOP=None, RIGHT=None, BOTTOM=None, save_path:
|
||||
croped_image.save(save_path, "JPEG")
|
||||
|
||||
return image.crop((LEFT, TOP, RIGHT, BOTTOM))
|
||||
|
||||
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
"""
|
||||
text_detection_module
|
||||
"""
|
||||
from surya.ocr import run_ocr
|
||||
import numpy as np
|
||||
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
|
||||
from surya.model.detection.model import (load_model as load_det_model,
|
||||
load_processor as load_det_processor)
|
||||
from surya.model.recognition.model import load_model as load_rec_model
|
||||
from surya.model.recognition.processor import load_processor as load_rec_processor
|
||||
|
||||
@ -22,8 +25,5 @@ def detect_text(image, languages: list = ["en"]):
|
||||
rec_model, rec_processor = load_rec_model(), load_rec_processor()
|
||||
predictions = run_ocr([image], [langs], det_model,
|
||||
det_processor, rec_model, rec_processor)
|
||||
|
||||
text = "\n".join([line.text for line in predictions[0].text_lines])
|
||||
return text
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user