Scrapegraph-ai/extract_data.py
2024-09-24 21:42:45 +02:00

27 lines
1000 B
Python

def extract_data(html: str) -> dict:
    """Extract project titles and descriptions from an HTML document.

    Every ``<div class="grid-item">`` is treated as a candidate project
    entry. The title is read from the first ``<h4 class="card-title">``
    inside the entry and the description from the first
    ``<p class="card-text">``; both are whitespace-stripped.

    Entries without a title element are skipped, and a missing description
    element yields an empty string, so one malformed card does not abort
    the whole extraction.

    Args:
        html: Raw HTML markup to parse.

    Returns:
        A dictionary of the form
        ``{'projects': [{'title': str, 'description': str}, ...]}``,
        with entries in document order. The list is empty when no
        matching entries are found.
    """
    # Imported locally, as in the original, so the module itself can be
    # imported without BeautifulSoup being loaded up front.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')

    projects = []
    for entry in soup.find_all('div', class_='grid-item'):
        title_tag = entry.find('h4', class_='card-title')
        if title_tag is None:
            # find() returns None when nothing matches; calling get_text()
            # on it would raise AttributeError, so skip untitled entries.
            continue

        description_tag = entry.find('p', class_='card-text')
        if description_tag is None:
            description = ''
        else:
            description = description_tag.get_text(strip=True)

        projects.append({
            'title': title_tag.get_text(strip=True),
            'description': description,
        })

    return {'projects': projects}