Merge pull request #698 from vedovati-matteo/pre/beta

Code Generator functionalities and graph
This commit is contained in:
Marco Vinciguerra 2024-09-25 10:45:29 +02:00 committed by GitHub
commit 8553d89bca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 2364 additions and 18 deletions

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

anthropic_key = os.getenv("ANTHROPIC_API_KEY")

# Settings for the language model.
llm_settings = {
    "api_key": anthropic_key,
    "model": "anthropic/claude-3-haiku-20240307",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,58 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the language model; os.environ[...] raises KeyError
# if AZURE_OPENAI_KEY is not set.
llm_settings = {
    "api_key": os.environ["AZURE_OPENAI_KEY"],
    "model": "azure_openai/gpt-3.5-turbo",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the language model ("client_name" is a placeholder value).
llm_settings = {
    "client": "client_name",
    "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    "temperature": 0.0,
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

deepseek_key = os.getenv("DEEPSEEK_APIKEY")

# Settings for the language model.
llm_settings = {
    "model": "deepseek/deepseek-chat",
    "api_key": deepseek_key,
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,62 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the language model. The client id/secret values are
# placeholders to be replaced with real Ernie credentials. (The previous
# unused lookup of OPENAI_APIKEY was removed: this example never used it.)
llm_settings = {
    "model": "ernie/ernie-bot-turbo",
    "ernie_client_id": "<ernie_client_id>",
    "ernie_client_secret": "<ernie_client_secret>",
    "temperature": 0.1,
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

fireworks_api_key = os.getenv("FIREWORKS_APIKEY")

# Settings for the language model.
llm_settings = {
    "api_key": fireworks_api_key,
    "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

gemini_key = os.getenv("GOOGLE_APIKEY")

# Settings for the language model.
llm_settings = {
    "api_key": gemini_key,
    "model": "google_genai/gemini-pro",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

gemini_key = os.getenv("GOOGLE_APIKEY")

# Settings for the language model.
llm_settings = {
    "api_key": gemini_key,
    "model": "google_vertexai/gemini-1.5-pro",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,61 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

groq_key = os.getenv("GROQ_APIKEY")

# Settings for the language model.
llm_settings = {
    "model": "groq/gemma-7b-it",
    "api_key": groq_key,
    "temperature": 0,
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,71 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Pre-built LLM client handed to the graph through "model_instance".
# The embeddings client that used to be constructed here was never placed
# in the configuration, so it has been dropped.
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    token=HUGGINGFACEHUB_API_TOKEN,
)

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": {
        "model_instance": llm_model_instance,
    },
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,61 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the language model, served from the local base_url below.
llm_settings = {
    "model": "ollama/llama3",
    "temperature": 0,
    "format": "json",
    "base_url": "http://localhost:11434",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

mistral_key = os.getenv("MISTRAL_API_KEY")

# Settings for the language model.
llm_settings = {
    "api_key": mistral_key,
    "model": "mistralai/open-mistral-nemo",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,67 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.chat_models.moonshot import MoonshotChat
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Keyword arguments used to build the MoonshotChat client.
llm_instance_config = {
    "model": "moonshot-v1-8k",
    "base_url": "https://api.moonshot.cn/v1",
    "moonshot_api_key": os.getenv("MOONLIGHT_API_KEY"),
}

llm_model_instance = MoonshotChat(**llm_instance_config)

# The pre-built client is passed as "model_instance" together with an
# explicit "model_tokens" value.
llm_settings = {
    "model_instance": llm_model_instance,
    "model_tokens": 10000,
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,58 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the language model.
llm_settings = {
    "api_key": os.getenv("NEMOTRON_APIKEY"),
    "model": "nvidia/meta/llama3-70b-instruct",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,61 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

# Settings for the language model. The api_key value is a masked
# placeholder to be replaced with a real key. (The previous unused lookup
# of OPENAI_APIKEY was removed: this example never used it.)
llm_settings = {
    "api_key": "***************************",
    "model": "oneapi/qwen-turbo",
    "base_url": "http://127.0.0.1:3000/v1",  # set the OneAPI URL
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

View File

@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""
import os, json
from typing import List
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph
load_dotenv()
# ************************************************
# Define the output schema for the graph
# ************************************************
class Project(BaseModel):
    # One scraped project: its title and its description.
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")


class Projects(BaseModel):
    # Top-level output schema: the list of scraped projects.
    projects: List[Project]


# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

# Settings for the language model.
llm_settings = {
    "api_key": openai_key,
    "model": "openai/gpt-4o-mini",
}

# Iteration caps handed to the graph under "max_iterations".
iteration_limits = {
    "overall": 10,
    "syntax": 3,
    "execution": 3,
    "validation": 3,
    "semantic": 3,
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": iteration_limits,
    "output_file_name": "extracted_data.py",
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config,
)

result = code_generator_graph.run()
print(result)

27
extract_data.py Normal file
View File

@ -0,0 +1,27 @@
def extract_data(html: str) -> dict:
    """Extract project titles and descriptions from an HTML page.

    Every ``div.grid-item`` element is treated as one project entry. The
    title is read from its ``h4.card-title`` child and the description from
    its ``p.card-text`` child.

    Args:
        html: The raw HTML document to parse.

    Returns:
        A dictionary of the form
        ``{'projects': [{'title': str, 'description': str}, ...]}``.
        Entries without a title element are skipped; an entry without a
        description element gets an empty description.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')

    projects = []
    for entry in soup.find_all('div', class_='grid-item'):
        # find() returns None when nothing matches, so guard before calling
        # get_text() -- otherwise one malformed card aborts the whole run.
        title_tag = entry.find('h4', class_='card-title')
        if title_tag is None:
            continue

        description_tag = entry.find('p', class_='card-text')
        if description_tag is not None:
            description = description_tag.get_text(strip=True)
        else:
            description = ''

        projects.append({
            'title': title_tag.get_text(strip=True),
            'description': description,
        })

    # Shape matches the {'projects': [...]} schema used by the examples.
    return {'projects': projects}

View File

@ -6,6 +6,8 @@
# features: []
# all-features: false
# with-sources: false
# generate-hashes: false
# universal: false
-e file:.
aiofiles==24.1.0
@ -129,6 +131,7 @@ graphviz==0.20.3
# via burr
greenlet==3.0.3
# via playwright
# via sqlalchemy
grpcio==1.65.4
# via google-api-core
# via grpcio-status
@ -498,5 +501,7 @@ urllib3==1.26.19
# via requests
uvicorn==0.30.5
# via burr
watchdog==4.0.2
# via streamlit
yarl==1.9.4
# via aiohttp

View File

@ -6,6 +6,8 @@
# features: []
# all-features: false
# with-sources: false
# generate-hashes: false
# universal: false
-e file:.
aiohttp==3.9.5
@ -84,6 +86,7 @@ googleapis-common-protos==1.63.2
# via grpcio-status
greenlet==3.0.3
# via playwright
# via sqlalchemy
grpcio==1.65.1
# via google-api-core
# via grpcio-status

View File

@ -26,3 +26,4 @@ from .markdown_scraper_multi_graph import MDScraperMultiGraph
from .search_link_graph import SearchLinkGraph
from .screenshot_scraper_graph import ScreenshotScraperGraph
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
from .code_generator_graph import CodeGeneratorGraph

View File

@ -0,0 +1,188 @@
"""
CodeGeneratorGraph Module
"""
from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
FetchNode,
ParseNode,
GenerateAnswerNode,
PromptRefinerNode,
HtmlAnalyzerNode,
GenerateCodeNode,
)
class CodeGeneratorGraph(AbstractGraph):
"""
CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for
extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
It requires a user prompt, a source URL, and an output schema.
Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.
library (str): The library used for web scraping (beautiful soup).
Args:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (BaseModel): The schema for the graph output.
Example:
>>> code_gen = CodeGeneratorGraph(
... "List me all the attractions in Chioggia.",
... "https://en.wikipedia.org/wiki/Chioggia",
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
... )
>>> result = code_gen.run()
)
"""
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[BaseModel] = None):
super().__init__(prompt, config, source, schema)
self.input_key = "url" if source.startswith("http") else "local_dir"
def _create_graph(self) -> BaseGraph:
"""
Creates the graph of nodes representing the workflow for web scraping.
Returns:
BaseGraph: A graph instance representing the web scraping workflow.
"""
if self.schema is None:
raise KeyError("The schema is required for CodeGeneratorGraph")
fetch_node = FetchNode(
input="url| local_dir",
output=["doc"],
node_config={
"llm_model": self.llm_model,
"force": self.config.get("force", False),
"cut": self.config.get("cut", True),
"loader_kwargs": self.config.get("loader_kwargs", {}),
"browser_base": self.config.get("browser_base"),
"scrape_do": self.config.get("scrape_do")
}
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"llm_model": self.llm_model,
"chunk_size": self.model_token
}
)
generate_validation_answer_node = GenerateAnswerNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm_model": self.llm_model,
"additional_info": self.config.get("additional_info"),
"schema": self.schema,
}
)
prompt_refier_node = PromptRefinerNode(
input="user_prompt",
output=["refined_prompt"],
node_config={
"llm_model": self.llm_model,
"chunk_size": self.model_token,
"schema": self.schema
}
)
html_analyzer_node = HtmlAnalyzerNode(
input="refined_prompt & original_html",
output=["html_info", "reduced_html"],
node_config={
"llm_model": self.llm_model,
"additional_info": self.config.get("additional_info"),
"schema": self.schema,
"reduction": self.config.get("reduction", 0)
}
)
generate_code_node = GenerateCodeNode(
input="user_prompt & refined_prompt & html_info & reduced_html & answer",
output=["generated_code"],
node_config={
"llm_model": self.llm_model,
"additional_info": self.config.get("additional_info"),
"schema": self.schema,
"max_iterations": self.config.get("max_iterations", {
"overall": 10,
"syntax": 3,
"execution": 3,
"validation": 3,
"semantic": 3
}),
}
)
return BaseGraph(
nodes=[
fetch_node,
parse_node,
generate_validation_answer_node,
prompt_refier_node,
html_analyzer_node,
generate_code_node,
],
edges=[
(fetch_node, parse_node),
(parse_node, generate_validation_answer_node),
(generate_validation_answer_node, prompt_refier_node),
(prompt_refier_node, html_analyzer_node),
(html_analyzer_node, generate_code_node)
],
entry_point=fetch_node,
graph_name=self.__class__.__name__
)
def run(self) -> str:
"""
Executes the scraping process and returns the generated code.
Returns:
str: The generated code.
"""
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)
generated_code = self.final_state.get("generated_code", "No code created.")
if self.config.get("filename") is None:
filename = "extracted_data.py"
elif ".py" not in self.config.get("filename"):
filename += ".py"
else:
filename = self.config.get("filename")
self.save_code_to_file(generated_code, filename)
return generated_code
def save_code_to_file(self, code: str, filename:str) -> None:
    """
    Saves the generated code to a Python file.

    The file is created (or truncated) and written as UTF-8 so that the
    result does not depend on the platform's default encoding; generated
    code may contain non-ASCII text copied from the scraped page.

    Args:
        code (str): The generated code to be saved.
        filename (str): name of the output file
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.write(code)

View File

@ -23,3 +23,6 @@ from .merge_generated_scripts import MergeGeneratedScriptsNode
from .fetch_screen_node import FetchScreenNode
from .generate_answer_from_image_node import GenerateAnswerFromImageNode
from .concat_answers_node import ConcatAnswersNode
from .prompt_refiner_node import PromptRefinerNode
from .html_analyzer_node import HtmlAnalyzerNode
from .generate_code_node import GenerateCodeNode

View File

@ -316,21 +316,7 @@ class FetchNode(BaseNode):
compressed_document = [
Document(page_content=parsed_content, metadata={"source": "html file"})
]
return self.update_state(state, compressed_document)
def update_state(self, state, compressed_document):
"""
Updates the state with the output data from the node.
Args:
state (dict): The current state of the graph.
compressed_document (List[Document]): The compressed document content fetched
by the node.
Returns:
dict: The updated state with the output data.
"""
# NOTE(review): `document` is neither a parameter nor a local of this method as
# shown here. This excerpt looks like a diff hunk whose +/- markers were lost, so
# the three statements below may belong to the calling method instead -- confirm
# where `document` is bound before relying on this assignment.
state["original_html"] = document
# Store the compressed document under the node's first output key.
state.update({self.output[0]: compressed_document,})
return state

View File

@ -0,0 +1,329 @@
"""
GenerateCodeNode Module
"""
from typing import Any, Dict, List, Optional
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_core.utils.pydantic import is_basemodel_subclass
from langchain_community.chat_models import ChatOllama
import ast
import sys
from io import StringIO
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from .base_node import BaseNode
from pydantic import ValidationError
from ..utils import (transform_schema,
extract_code,
syntax_focused_analysis, syntax_focused_code_generation,
execution_focused_analysis, execution_focused_code_generation,
validation_focused_analysis, validation_focused_code_generation,
semantic_focused_analysis, semantic_focused_code_generation,
are_content_equal)
from jsonschema import validate, ValidationError
import json
from ..prompts import (
TEMPLATE_INIT_CODE_GENERATION, TEMPLATE_SEMANTIC_COMPARISON
)
class GenerateCodeNode(BaseNode):
    """
    A node that generates Python code for a function that extracts data from
    HTML based on an output schema.

    The node asks the LLM for an initial ``extract_data`` function and then
    refines it through four nested checks, run in this order on every overall
    iteration: syntax, execution, output-schema validation and semantic
    comparison against a reference answer.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): A flag indicating whether to log the iteration counter.
        additional_info: Optional extra context taken from the node configuration.
        max_iterations (dict): Iteration budgets with the keys "overall",
            "syntax", "execution", "validation" and "semantic".
        output_schema: The schema class taken from the "schema" configuration key.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. It must
            contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting to "GenerateCode".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GenerateCode",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # "llm_model" is mandatory: this lookup fails when node_config is None
        # or lacks the key, so every later access can assume a dict.
        self.llm_model = node_config["llm_model"]

        if isinstance(node_config["llm_model"], ChatOllama):
            self.llm_model.format = "json"

        self.verbose = node_config.get("verbose", False)
        self.force = node_config.get("force", False)
        self.script_creator = node_config.get("script_creator", False)
        self.is_md_scraper = node_config.get("is_md_scraper", False)

        self.additional_info = node_config.get("additional_info")

        self.max_iterations = node_config.get("max_iterations", {
            "overall": 10,
            "syntax": 3,
            "execution": 3,
            "validation": 3,
            "semantic": 3
        })

        self.output_schema = node_config.get("schema")

    def execute(self, state: dict) -> dict:
        """
        Generates Python code for a function that extracts data from HTML based on a output schema.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data from the state. The "original_html"
                entry is read directly as well.

        Returns:
            dict: The updated state with the output key containing the generated code.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                that the necessary information for generating the code is missing.
            RuntimeError: If the maximum number of iterations is reached without obtaining the desired code.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        input_keys = self.get_input_keys(state)
        input_data = [state[key] for key in input_keys]

        # The positions follow the order of the node's input expression.
        user_prompt = input_data[0]
        refined_prompt = input_data[1]
        html_info = input_data[2]
        reduced_html = input_data[3]
        answer = input_data[4]

        # The generated function is executed against the full page, not the
        # reduced HTML that is shown to the LLM.
        self.raw_html = state['original_html'][0].page_content

        simplefied_schema = str(transform_schema(self.output_schema.schema()))

        reasoning_state = {
            "user_input": user_prompt,
            "json_schema": simplefied_schema,
            "initial_analysis": refined_prompt,
            "html_code": reduced_html,
            "html_analysis": html_info,
            "generated_code": "",
            "execution_result": None,
            "reference_answer": answer,
            "errors": {
                "syntax": [],
                "execution": [],
                "validation": [],
                "semantic": []
            },
            "iteration": 0
        }

        final_state = self.overall_reasoning_loop(reasoning_state)

        state.update({self.output[0]: final_state["generated_code"]})
        return state

    def overall_reasoning_loop(self, state: dict) -> dict:
        """
        Generates the initial code and drives the four checks until all pass.

        Args:
            state (dict): The reasoning state built by ``execute``.

        Returns:
            dict: The reasoning state holding the final generated code.

        Raises:
            RuntimeError: If the overall iteration budget is used up while any
                error list is still non-empty.
        """
        self.logger.info("--- (Generating Code) ---")
        state["generated_code"] = self.generate_initial_code(state)
        state["generated_code"] = extract_code(state["generated_code"])

        while state["iteration"] < self.max_iterations["overall"]:
            state["iteration"] += 1
            if self.verbose:
                self.logger.info(f"--- Iteration {state['iteration']} ---")

            # Each stage leaves its error list non-empty on failure; in that
            # case the whole pipeline restarts from the syntax check.
            self.logger.info("--- (Checking Code Syntax) ---")
            state = self.syntax_reasoning_loop(state)
            if state["errors"]["syntax"]:
                continue

            self.logger.info("--- (Executing the Generated Code) ---")
            state = self.execution_reasoning_loop(state)
            if state["errors"]["execution"]:
                continue

            self.logger.info("--- (Validate the Code Output Schema) ---")
            state = self.validation_reasoning_loop(state)
            if state["errors"]["validation"]:
                continue

            self.logger.info("--- (Checking if the informations exctrcated are the ones Requested) ---")
            state = self.semantic_comparison_loop(state)
            if state["errors"]["semantic"]:
                continue
            break

        errors = state["errors"]
        has_errors = (
            errors["syntax"]
            or errors["execution"]
            or errors["validation"]
            or errors["semantic"]
        )
        if state["iteration"] == self.max_iterations["overall"] and has_errors:
            raise RuntimeError("Max iterations reached without obtaining the desired code.")

        self.logger.info("--- (Code Generated Correctly) ---")

        return state

    def syntax_reasoning_loop(self, state: dict) -> dict:
        """
        Checks the syntax of the generated code and asks the LLM to repair it.

        Args:
            state (dict): The reasoning state.

        Returns:
            dict: The state; ``errors["syntax"]`` is empty when a check passed.
        """
        for _ in range(self.max_iterations["syntax"]):
            syntax_valid, syntax_message = self.syntax_check(state["generated_code"])
            if syntax_valid:
                state["errors"]["syntax"] = []
                return state

            state["errors"]["syntax"] = [syntax_message]
            self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
            analysis = syntax_focused_analysis(state, self.llm_model)
            self.logger.info("--- (Regenerating Code to fix the Error) ---")
            state["generated_code"] = syntax_focused_code_generation(state, analysis, self.llm_model)
            state["generated_code"] = extract_code(state["generated_code"])
        return state

    def execution_reasoning_loop(self, state: dict) -> dict:
        """
        Runs the generated code and asks the LLM to repair execution errors.

        Args:
            state (dict): The reasoning state.

        Returns:
            dict: The state; on success ``execution_result`` holds the value
            returned by ``extract_data`` and ``errors["execution"]`` is empty.
        """
        for _ in range(self.max_iterations["execution"]):
            execution_success, execution_result = self.create_sandbox_and_execute(state["generated_code"])
            if execution_success:
                state["execution_result"] = execution_result
                state["errors"]["execution"] = []
                return state

            state["errors"]["execution"] = [execution_result]
            self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
            analysis = execution_focused_analysis(state, self.llm_model)
            self.logger.info("--- (Regenerating Code to fix the Error) ---")
            state["generated_code"] = execution_focused_code_generation(state, analysis, self.llm_model)
            state["generated_code"] = extract_code(state["generated_code"])
        return state

    def validation_reasoning_loop(self, state: dict) -> dict:
        """
        Validates the execution result against the output schema and asks the
        LLM to repair the code when it does not comply.

        Args:
            state (dict): The reasoning state.

        Returns:
            dict: The state; ``errors["validation"]`` is empty when the result complied.
        """
        # NOTE(review): the regenerated code is not executed inside this loop,
        # so every pass after the first validates the same
        # state["execution_result"]. The result is only refreshed by the next
        # overall iteration.
        for _ in range(self.max_iterations["validation"]):
            validation, errors = self.validate_dict(state["execution_result"], self.output_schema.schema())
            if validation:
                state["errors"]["validation"] = []
                return state

            state["errors"]["validation"] = errors
            self.logger.info("--- (Code Output not compliant to the deisred Output Schema) ---")
            analysis = validation_focused_analysis(state, self.llm_model)
            self.logger.info("--- (Regenerating Code to make the Output compliant to the deisred Output Schema) ---")
            state["generated_code"] = validation_focused_code_generation(state, analysis, self.llm_model)
            state["generated_code"] = extract_code(state["generated_code"])
        return state

    def semantic_comparison_loop(self, state: dict) -> dict:
        """
        Compares the execution result with the reference answer and asks the
        LLM to repair the code when they differ.

        Args:
            state (dict): The reasoning state.

        Returns:
            dict: The state; ``errors["semantic"]`` is empty when the results
            were judged equivalent.
        """
        # NOTE(review): as in the validation loop, state["execution_result"] is
        # not recomputed after the code is regenerated here.
        for _ in range(self.max_iterations["semantic"]):
            comparison_result = self.semantic_comparison(state["execution_result"], state["reference_answer"])
            if comparison_result["are_semantically_equivalent"]:
                state["errors"]["semantic"] = []
                return state

            state["errors"]["semantic"] = comparison_result["differences"]
            self.logger.info("--- (The informations exctrcated are not the all ones requested) ---")
            analysis = semantic_focused_analysis(state, comparison_result, self.llm_model)
            self.logger.info("--- (Regenerating Code to obtain all the infromation requested) ---")
            state["generated_code"] = semantic_focused_code_generation(state, analysis, self.llm_model)
            state["generated_code"] = extract_code(state["generated_code"])
        return state

    def generate_initial_code(self, state: dict) -> str:
        """
        Asks the LLM for the first version of the extraction function.

        Args:
            state (dict): The reasoning state; the prompt is filled from its
                "user_input", "json_schema", "initial_analysis", "html_code"
                and "html_analysis" entries.

        Returns:
            str: The raw LLM answer (code fences are stripped by the caller).
        """
        prompt = PromptTemplate(
            template=TEMPLATE_INIT_CODE_GENERATION,
            partial_variables={
                "user_input": state["user_input"],
                "json_schema": state["json_schema"],
                "initial_analysis": state["initial_analysis"],
                "html_code": state["html_code"],
                "html_analysis": state["html_analysis"]
            })

        output_parser = StrOutputParser()

        chain = prompt | self.llm_model | output_parser
        generated_code = chain.invoke({})
        return generated_code

    def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]:
        """
        Decides whether the generated result matches the reference result.

        The reference is first normalised through the output schema. When the
        two results have equal content no LLM call is made; otherwise the LLM
        is asked to compare their JSON dumps.

        Args:
            generated_result (Any): The value returned by the generated code.
            reference_result (Any): The reference answer, expanded as keyword
                arguments of the output schema.

        Returns:
            Dict[str, Any]: A mapping with the keys "are_semantically_equivalent",
            "differences" and "explanation".
        """
        reference_result_dict = self.output_schema(**reference_result).dict()

        # Check if generated result and reference result are actually equal
        if are_content_equal(generated_result, reference_result_dict):
            return {
                "are_semantically_equivalent": True,
                "differences": [],
                "explanation": "The generated result and reference result are exactly equal."
            }

        response_schemas = [
            ResponseSchema(name="are_semantically_equivalent", description="Boolean indicating if the results are semantically equivalent"),
            ResponseSchema(name="differences", description="List of semantic differences between the results, if any"),
            ResponseSchema(name="explanation", description="Detailed explanation of the comparison and reasoning")
        ]
        output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

        prompt = PromptTemplate(
            template=TEMPLATE_SEMANTIC_COMPARISON,
            input_variables=["generated_result", "reference_result"],
            partial_variables={"format_instructions": output_parser.get_format_instructions()}
        )

        chain = prompt | self.llm_model | output_parser
        return chain.invoke({
            "generated_result": json.dumps(generated_result, indent=2),
            "reference_result": json.dumps(reference_result_dict, indent=2)
        })

    def syntax_check(self, code):
        """
        Parses ``code`` to check that it is syntactically valid Python.

        Args:
            code (str): The source code to check.

        Returns:
            tuple: ``(True, "Syntax is correct.")`` or ``(False, message)``
            where the message describes the syntax error.
        """
        try:
            ast.parse(code)
            return True, "Syntax is correct."
        except SyntaxError as e:
            return False, f"Syntax error: {str(e)}"

    def create_sandbox_and_execute(self, function_code):
        """
        Executes the generated code and calls its ``extract_data`` function on
        the raw HTML stored by ``execute``.

        Args:
            function_code (str): Source code expected to define ``extract_data``.

        Returns:
            tuple: ``(True, result)`` on success, or ``(False, message)`` when
            the code raises or does not define ``extract_data``.
        """
        # NOTE(security): this is not an isolating sandbox. The LLM-generated
        # code runs through exec() in this process with the full builtins, so
        # it can import modules and touch the file system.
        sandbox_globals = {
            'BeautifulSoup': BeautifulSoup,
            're': re,
            '__builtins__': __builtins__,
        }

        # Swallow anything the generated code prints while it runs.
        old_stdout = sys.stdout
        sys.stdout = StringIO()

        try:
            exec(function_code, sandbox_globals)

            extract_data = sandbox_globals.get('extract_data')

            if not extract_data:
                raise NameError("Function 'extract_data' not found in the generated code.")

            result = extract_data(self.raw_html)
            return True, result
        except Exception as e:
            return False, f"Error during execution: {str(e)}"
        finally:
            sys.stdout = old_stdout

    def validate_dict(self, data: dict, schema):
        """
        Validates ``data`` against a JSON schema.

        Args:
            data (dict): The value to validate.
            schema: The JSON schema to validate against.

        Returns:
            tuple: ``(True, None)`` when the data complies, otherwise
            ``(False, errors)`` where ``errors`` is a list holding the
            validation error message.
        """
        try:
            validate(instance=data, schema=schema)
            return True, None
        except ValidationError as e:
            # `validate` comes from jsonschema, whose ValidationError exposes
            # the failure text as `.message`; it has no pydantic-style
            # `.errors()` method, so calling that raised AttributeError.
            errors = [e.message]
            return False, errors

View File

@ -0,0 +1,106 @@
"""
HtmlAnalyzerNode Module
"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_core.utils.pydantic import is_basemodel_subclass
from langchain_community.chat_models import ChatOllama
from tqdm import tqdm
from .base_node import BaseNode
from ..utils import reduce_html
from ..prompts import (
TEMPLATE_HTML_ANALYSIS, TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
)
class HtmlAnalyzerNode(BaseNode):
    """
    A node that asks the LLM to analyse the fetched HTML with respect to the
    information that has to be extracted, and that also stores a reduced
    version of that HTML in the state.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): Value of the "verbose" configuration key (False when absent).
        additional_info: Optional extra context taken from the node configuration.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. It must
            contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting to "HtmlAnalyzer".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "HtmlAnalyzer",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # The model lookup comes first, so node_config is known to be a dict
        # for every option read below.
        model = node_config["llm_model"]
        self.llm_model = model
        if isinstance(model, ChatOllama):
            self.llm_model.format = "json"

        self.verbose = node_config.get("verbose", False)
        self.force = node_config.get("force", False)
        self.script_creator = node_config.get("script_creator", False)
        self.is_md_scraper = node_config.get("is_md_scraper", False)
        self.additional_info = node_config.get("additional_info")

    def execute(self, state: dict) -> dict:
        """
        Produces the HTML analysis and the reduced HTML for the current state.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the refined prompt and the HTML document from the state.

        Returns:
            dict: The updated state; the first output key holds the analysis
            and the second one the reduced HTML.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                that the necessary information for the analysis is missing.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        values = [state[name] for name in self.get_input_keys(state)]
        refined_prompt, html = values[0], values[1]

        # Shrink the page according to the configured reduction level.
        reduction_level = self.node_config.get("reduction", 0)
        reduced_html = reduce_html(html[0].page_content, reduction_level)

        template_variables = {
            "initial_analysis": refined_prompt,
            "html_code": reduced_html,
        }
        if self.additional_info is None:
            template = TEMPLATE_HTML_ANALYSIS
        else:
            # The context-aware template takes one extra placeholder.
            template = TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
            template_variables["additional_context"] = self.additional_info

        prompt = PromptTemplate(template=template, partial_variables=template_variables)

        chain = prompt | self.llm_model | StrOutputParser()
        html_analysis = chain.invoke({})

        analysis_key, reduced_key = self.output[0], self.output[1]
        state.update({analysis_key: html_analysis, reduced_key: reduced_html})
        return state

View File

@ -0,0 +1,107 @@
"""
PromptRefinerNode Module
"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_core.utils.pydantic import is_basemodel_subclass
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_mistralai import ChatMistralAI
from langchain_community.chat_models import ChatOllama
from tqdm import tqdm
from .base_node import BaseNode
from ..utils import transform_schema
from ..prompts import (
TEMPLATE_REFINER, TEMPLATE_REFINER_WITH_CONTEXT
)
class PromptRefinerNode(BaseNode):
    """
    A node that refines the user prompt using the output schema and, when
    available, additional context. The refined prompt explicitly links the
    elements of the user's request to their counterparts in the JSON schema
    so that later steps can rely on it.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): Value of the "verbose" configuration key (False when absent).
        additional_info: Optional extra context taken from the node configuration.
        output_schema: The schema class taken from the "schema" configuration key.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. It must
            contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "PromptRefiner",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # The model lookup comes first, so node_config is known to be a dict
        # for every option read below.
        model = node_config["llm_model"]
        self.llm_model = model
        if isinstance(model, ChatOllama):
            self.llm_model.format = "json"

        self.verbose = node_config.get("verbose", False)
        self.force = node_config.get("force", False)
        self.script_creator = node_config.get("script_creator", False)
        self.is_md_scraper = node_config.get("is_md_scraper", False)
        self.additional_info = node_config.get("additional_info")
        self.output_schema = node_config.get("schema")

    def execute(self, state: dict) -> dict:
        """
        Generate a refined prompt using the user's prompt, the schema, and additional context.

        Args:
            state (dict): The current state of the graph. The "user_prompt"
                entry is read directly from it.

        Returns:
            dict: The updated state with the first output key containing the refined prompt.

        Raises:
            KeyError: If "user_prompt" is not found in the state.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        user_prompt = state['user_prompt']

        # Keep the simplified schema on the instance, as other code may read it.
        self.simplefied_schema = transform_schema(self.output_schema.schema())

        template_variables = {
            "user_input": user_prompt,
            "json_schema": str(self.simplefied_schema),
        }
        if self.additional_info is None:
            template = TEMPLATE_REFINER
        else:
            # The context-aware template takes one extra placeholder.
            template = TEMPLATE_REFINER_WITH_CONTEXT
            template_variables["additional_context"] = self.additional_info

        prompt = PromptTemplate(template=template, partial_variables=template_variables)

        chain = prompt | self.llm_model | StrOutputParser()
        refined_prompt = chain.invoke({})

        state.update({self.output[0]: refined_prompt})
        return state

View File

@ -11,3 +11,11 @@ from .robots_node_prompts import TEMPLATE_ROBOT
from .search_internet_node_prompts import TEMPLATE_SEARCH_INTERNET
from .search_link_node_prompts import TEMPLATE_RELEVANT_LINKS
from .search_node_with_context_prompts import TEMPLATE_SEARCH_WITH_CONTEXT_CHUNKS, TEMPLATE_SEARCH_WITH_CONTEXT_NO_CHUNKS
from .prompt_refiner_node_prompts import TEMPLATE_REFINER, TEMPLATE_REFINER_WITH_CONTEXT
from .html_analyzer_node_prompts import TEMPLATE_HTML_ANALYSIS, TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT
from .generate_code_node_prompts import (TEMPLATE_INIT_CODE_GENERATION,
TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_SYNTAX_CODE_GENERATION,
TEMPLATE_EXECUTION_ANALYSIS, TEMPLATE_EXECUTION_CODE_GENERATION,
TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_VALIDATION_CODE_GENERATION,
TEMPLATE_SEMANTIC_COMPARISON, TEMPLATE_SEMANTIC_ANALYSIS,
TEMPLATE_SEMANTIC_CODE_GENERATION)

View File

@ -0,0 +1,213 @@
"""
Generate code prompts helper
"""
# Prompt asking for the first version of the `extract_data` function.
# Placeholders: {user_input}, {json_schema}, {initial_analysis}, {html_code},
# {html_analysis}.
TEMPLATE_INIT_CODE_GENERATION = """
**Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema.
**User's Request**:
{user_input}
**Desired JSON Output Schema**:
```json
{json_schema}
```
**Initial Task Analysis**:
{initial_analysis}
**HTML Code**:
```html
{html_code}
```
**HTML Structure Analysis**:
{html_analysis}
Based on the above analyses, generate the `extract_data(html: str) -> dict()` function that:
1. Efficiently extracts the required data from the given HTML structure.
2. Processes and structures the data according to the specified JSON schema.
3. Returns the structured data as a dictionary.
Your code should be well-commented, explaining the reasoning behind key decisions and any potential areas for improvement or customization.
Use only the following pre-imported libraries:
- BeautifulSoup from bs4
- re
**Output ONLY the Python code of the extract_data function, WITHOUT ANY IMPORTS OR ADDITIONAL TEXT.**
In your code do not include backticks.
**Response**:
"""
# Prompt asking for a textual analysis (no code) of a syntax error.
# Placeholders: {generated_code}, {errors}.
TEMPLATE_SYNTAX_ANALYSIS = """
The current code has encountered a syntax error. Here are the details:
Current Code:
```python
{generated_code}
```
Syntax Error:
{errors}
Please analyze in detail the syntax error and suggest a fix. Focus only on correcting the syntax issue while ensuring the code still meets the original requirements.
Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
"""
# Prompt asking for corrected code based on a syntax-error analysis.
# Placeholders: {analysis}, {generated_code}.
TEMPLATE_SYNTAX_CODE_GENERATION = """
Based on the following analysis of a syntax error, please generate the corrected code, following the suggested fix.:
Error Analysis:
{analysis}
Original Code:
```python
{generated_code}
```
Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
"""
# Prompt asking for a textual analysis (no code) of an execution error.
# Placeholders: {generated_code}, {errors}, {html_code}, {html_analysis}.
TEMPLATE_EXECUTION_ANALYSIS = """
The current code has encountered an execution error. Here are the details:
**Current Code**:
```python
{generated_code}
```
**Execution Error**:
{errors}
**HTML Code**:
```html
{html_code}
```
**HTML Structure Analysis**:
{html_analysis}
Please analyze the execution error and suggest a fix. Focus only on correcting the execution issue while ensuring the code still meets the original requirements and maintains correct syntax.
The suggested fix should address the execution error and ensure the function can successfully extract the required data from the provided HTML structure. Be sure to be precise and specific in your analysis.
Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
"""
# Prompt asking for corrected code based on an execution-error analysis.
# Placeholders: {analysis}, {generated_code}.
TEMPLATE_EXECUTION_CODE_GENERATION = """
Based on the following analysis of an execution error, please generate the corrected code:
Error Analysis:
{analysis}
Original Code:
```python
{generated_code}
```
Generate the corrected code, applying the suggestions from the analysis. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
"""
# Prompt asking for a textual analysis (no code) of schema-validation errors.
# Placeholders: {generated_code}, {errors}, {json_schema}, {execution_result}.
TEMPLATE_VALIDATION_ANALYSIS = """
The current code's output does not match the required schema. Here are the details:
Current Code:
```python
{generated_code}
```
Validation Errors:
{errors}
Required Schema:
```json
{json_schema}
```
Current Output:
{execution_result}
Please analyze the validation errors and suggest fixes. Focus only on correcting the output to match the required schema while ensuring the code maintains correct syntax and execution.
Provide your analysis and suggestions for fixing the error. DO NOT generate any code in your response.
"""
# Prompt asking for corrected code based on a validation-error analysis.
# Placeholders: {analysis}, {generated_code}, {json_schema}.
TEMPLATE_VALIDATION_CODE_GENERATION = """
Based on the following analysis of a validation error, please generate the corrected code:
Error Analysis:
{analysis}
Original Code:
```python
{generated_code}
```
Required Schema:
```json
{json_schema}
```
Generate the corrected code, applying the suggestions from the analysis and ensuring the output matches the required schema. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
"""
# Prompt asking whether the generated result matches the reference result.
# Placeholders: {generated_result}, {reference_result}, {format_instructions}.
TEMPLATE_SEMANTIC_COMPARISON = """
Compare the Generated Result with the Reference Result and determine if they are semantically equivalent:
Generated Result:
{generated_result}
Reference Result (Correct Output):
{reference_result}
Analyze the content, structure, and meaning of both results. They should be considered semantically equivalent if they convey the same information, even if the exact wording or structure differs.
If they are not semantically equivalent, identify what are the key differences in the Generated Result. The Reference Result should be considered the correct output, you need to pinpoint the problems in the Generated Result.
{format_instructions}
Human: Are the generated result and reference result semantically equivalent? If not, what are the key differences?
Assistant: Let's analyze the two results carefully:
"""
# Prompt asking for a textual analysis (no code) of semantic differences.
# Placeholders: {generated_code}, {differences}, {explanation}.
TEMPLATE_SEMANTIC_ANALYSIS = """
The current code's output is semantically different from the reference answer. Here are the details:
Current Code:
```python
{generated_code}
```
Semantic Differences:
{differences}
Comparison Explanation:
{explanation}
Please analyze these semantic differences and suggest how to modify the code to produce a result that is semantically equivalent to the reference answer. Focus on addressing the key differences while maintaining the overall structure and functionality of the code.
Provide your analysis and suggestions for fixing the semantic differences. DO NOT generate any code in your response.
"""
# Prompt asking for corrected code based on a semantic-difference analysis.
# Placeholders: {analysis}, {generated_code}, {generated_result},
# {reference_result}.
TEMPLATE_SEMANTIC_CODE_GENERATION = """
Based on the following analysis of semantic differences, please generate the corrected code:
Semantic Analysis:
{analysis}
Original Code:
```python
{generated_code}
```
Generated Result:
{generated_result}
Reference Result:
{reference_result}
Generate the corrected code, applying the suggestions from the analysis to make the output semantically equivalent to the reference result. Output ONLY the corrected Python code, WITHOUT ANY ADDITIONAL TEXT.
"""

View File

@ -0,0 +1,71 @@
"""
HTML analysis prompts helper
"""
# Prompt asking for an analysis of the HTML with respect to the refined task.
# Placeholders: {initial_analysis}, {html_code}.
TEMPLATE_HTML_ANALYSIS = """
Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
**Initial Analysis**:
{initial_analysis}
**HTML Code**:
```html
{html_code}
```
**HTML Analysis Instructions**:
1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
5. Recommend the specific strategy to use for scraping the content.
**Important Notes**:
- The function that the code generator is going to implement will receive the HTML as a string parameter, not as a live webpage.
- No web scraping, automation, or handling of dynamic content is required.
- The analysis should focus solely on extracting data from the static HTML provided.
- Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
**HTML Analysis for Data Extraction**:
"""
# Same analysis prompt, extended with the user-provided additional context.
# Placeholders: {initial_analysis}, {html_code}, {additional_context}.
TEMPLATE_HTML_ANALYSIS_WITH_CONTEXT = """
Task: Your job is to analyze the provided HTML code in relation to the initial scraping task analysis and the additional context the user provided and provide all the necessary HTML information useful for implementing a function that extracts data from the given HTML string.
**Initial Analysis**:
{initial_analysis}
**HTML Code**:
```html
{html_code}
```
**Additional Context**:
{additional_context}
**HTML Analysis Instructions**:
1. Examine the HTML code and identify elements, classes, or IDs that correspond to each required data field mentioned in the Initial Analysis.
2. Look for patterns or repeated structures that could indicate multiple items (e.g., product listings).
3. Note any nested structures or relationships between elements that are relevant to the data extraction task.
4. Discuss any additional considerations based on the specific HTML layout that are crucial for accurate data extraction.
5. Recommend the specific strategy to use for scraping the content.
**Important Notes**:
- The function that the code generator is going to implement will receive the HTML as a string parameter, not as a live webpage.
- No web scraping, automation, or handling of dynamic content is required.
- The analysis should focus solely on extracting data from the static HTML provided.
- Be precise and specific in your analysis, as the code generator will, possibly, not have access to the full HTML context.
This HTML analysis will be used to guide the final code generation process for a function that extracts data from the given HTML string.
Please provide only the analysis with relevant, specific information based on this HTML code. Avoid vague statements and focus on exact details needed for accurate data extraction.
Focus on providing a concise, step-by-step analysis of the HTML structure and the key elements needed for data extraction. Do not include any code examples or implementation logic. Keep the response focused and avoid general statements.**
In your code do not include backticks.
**HTML Analysis for Data Extraction**:
"""

View File

@ -0,0 +1,63 @@
"""
Prompts refiner prompts helper
"""
# Prompt asking the LLM to map the user's request onto the JSON schema.
# Placeholders: {user_input}, {json_schema}.
# The string is not a raw string, so each "\n" below is an escape sequence that
# adds a newline on top of the physical line break.
TEMPLATE_REFINER = """
**Task**: Analyze the user's request and the provided JSON schema to clearly map the desired data extraction.\n
Break down the user's request into key components, and then explicitly connect these components to the
corresponding elements within the JSON schema.
**User's Request**:
{user_input}
**Desired JSON Output Schema**:
```json
{json_schema}
```
**Analysis Instructions**:
1. **Break Down User Request:**
* Clearly identify the core entities or data types the user is asking for.\n
* Highlight any specific attributes or relationships mentioned in the request.\n
2. **Map to JSON Schema**:
* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
* Explain how the schema structure accommodates the user's needs.
* If applicable, mention any schema elements that are not directly addressed in the user's request.\n
This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
Please generate only the analysis and no other text.
**Response**:
"""
# Same refinement prompt, extended with the user-provided additional context.
# Placeholders: {user_input}, {json_schema}, {additional_context}.
TEMPLATE_REFINER_WITH_CONTEXT = """
**Task**: Analyze the user's request, the provided JSON schema, and the additional context the user provided to clearly map the desired data extraction.\n
Break down the user's request into key components, and then explicitly connect these components to the corresponding elements within the JSON schema.\n
**User's Request**:
{user_input}
**Desired JSON Output Schema**:
```json
{json_schema}
```
**Additional Context**:
{additional_context}
**Analysis Instructions**:
1. **Break Down User Request:**
* Clearly identify the core entities or data types the user is asking for.\n
* Highlight any specific attributes or relationships mentioned in the request.\n
2. **Map to JSON Schema**:
* For each identified element in the user request, pinpoint its exact counterpart in the JSON schema.\n
* Explain how the schema structure accommodates the user's needs.\n
* If applicable, mention any schema elements that are not directly addressed in the user's request.\n
This analysis will be used to guide the HTML structure examination and ultimately inform the code generation process.\n
Please generate only the analysis and no other text.
**Response**:
"""

View File

@ -7,7 +7,7 @@ from .prettify_exec_info import prettify_exec_info
from .proxy_rotation import Proxy, parse_or_search_proxy, search_proxy_servers
from .save_audio_from_bytes import save_audio_from_bytes
from .sys_dynamic_import import dynamic_import, srcfile_import
from .cleanup_html import cleanup_html
from .cleanup_html import cleanup_html, reduce_html
from .logging import *
from .convert_to_md import convert_to_md
from .screenshot_scraping.screenshot_preparation import (take_screenshot,
@ -18,3 +18,10 @@ from .screenshot_scraping.text_detection import detect_text
from .tokenizer import num_tokens_calculus
from .split_text_into_chunks import split_text_into_chunks
from .llm_callback_manager import CustomLLMCallbackManager
from .schema_trasform import transform_schema
from .cleanup_code import extract_code
from .dict_content_compare import are_content_equal
from .code_error_analysis import (syntax_focused_analysis, execution_focused_analysis,
validation_focused_analysis, semantic_focused_analysis)
from .code_error_correction import (syntax_focused_code_generation, execution_focused_code_generation,
validation_focused_code_generation, semantic_focused_code_generation)

View File

@ -0,0 +1,11 @@
"""
This utility function extracts the code from a given string.
"""
import re
def extract_code(code: str) -> str:
    """
    Extract the body of the first Markdown code fence found in a string.

    The opening fence may be bare (```) or tagged as Python. The tag is
    matched case-insensitively, may be abbreviated to ``py``, may be
    surrounded by spaces/tabs, and may be followed by either ``\\n`` or
    ``\\r\\n``. Fences tagged with any other language are not unwrapped.

    Args:
        code (str): Text that may contain a fenced code block.

    Returns:
        str: The text between the opening fence line and the closing fence,
        or the input unchanged when no matching fence is present.
    """
    # Be lenient about how the fence header is written so that stray fence
    # markers are not handed on as if they were part of the code.
    pattern = r'```[ \t]*(?:python|py)?[ \t]*\r?\n(.*?)```'
    match = re.search(pattern, code, re.DOTALL | re.IGNORECASE)
    return match.group(1) if match else code

View File

@ -2,7 +2,8 @@
Module for minimizing the code
"""
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re
from bs4 import BeautifulSoup, Comment
from minify_html import minify
def cleanup_html(html_content: str, base_url: str) -> str:
@ -53,3 +54,82 @@ def cleanup_html(html_content: str, base_url: str) -> str:
else:
raise ValueError(f"""No HTML body content found, please try setting the 'headless'
flag to False in the graph configuration. HTML content: {html_content}""")
def minify_html(html):
    """
    Minify an HTML string with a handful of regular-expression passes.

    Args:
        html (str): The HTML content to minify.

    Returns:
        str: The HTML with comments removed and whitespace collapsed.
    """
    # Remove comments
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    # Remove whitespace between tags
    html = re.sub(r'>\s+<', '><', html)
    # Remove whitespace at the beginning and end of tags
    html = re.sub(r'\s+>', '>', html)
    html = re.sub(r'<\s+', '<', html)
    # Collapse multiple whitespace characters into a single space
    html = re.sub(r'\s+', ' ', html)
    # Remove spaces around equals signs in attributes. The substitution is
    # confined to the inside of tags so that visible text such as "a = b"
    # is left untouched.
    html = re.sub(
        r'<[^<>]+>',
        lambda tag: re.sub(r'\s*=\s*', '=', tag.group(0)),
        html,
    )
    return html.strip()
def reduce_html(html, reduction):
    """
    Shrink an HTML document according to a reduction level.

    Args:
        html (str): The HTML content to reduce.
        reduction (int): How aggressively to reduce the content.
            0: minification only,
            1: minification plus removal of comments, script/style contents
               and every attribute other than class, id, href and src,
            any other value: everything from level 1, then script/style tags
               are dropped, only the body is kept and each text node is
               whitespace-collapsed and cut to 20 characters.

    Returns:
        str: The reduced HTML. At the highest level, when the document has no
        body, the literal message "No <body> tag found in the HTML" is
        returned instead.
    """
    if reduction == 0:
        return minify_html(html)

    soup = BeautifulSoup(html, 'html.parser')

    # Drop every comment node from the tree.
    comment_nodes = soup.find_all(string=lambda node: isinstance(node, Comment))
    for node in comment_nodes:
        node.extract()

    # Empty <script>/<style> elements while leaving the tags themselves.
    for element in soup.find_all(['script', 'style']):
        element.string = ""

    # Strip every attribute that is not on the keep-list.
    kept_attrs = ('class', 'id', 'href', 'src')
    for element in soup.find_all(True):
        disposable = [name for name in element.attrs if name not in kept_attrs]
        for name in disposable:
            del element[name]

    if reduction == 1:
        return minify_html(str(soup))

    # From here on the script/style tags go away entirely.
    for element in soup.find_all(['script', 'style']):
        element.decompose()

    # Only the body is of interest at this level.
    body = soup.body
    if not body:
        return "No <body> tag found in the HTML"

    # Collapse whitespace in each text node and keep its first 20 characters.
    for text_node in body.find_all(string=True):
        if text_node.parent.name in ('script', 'style'):
            continue
        collapsed = re.sub(r'\s+', ' ', text_node.strip())
        text_node.replace_with(collapsed[:20])

    # Serialise the body and minify the result.
    return minify_html(str(body))

View File

@ -0,0 +1,48 @@
"""
This module contains the functions that are used to generate the prompts for the code error analysis.
"""
from typing import Any, Dict
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import json
from ..prompts import (
TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_EXECUTION_ANALYSIS,
TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_SEMANTIC_ANALYSIS
)
def syntax_focused_analysis(state: dict, llm_model) -> str:
    """
    Run the syntax-analysis prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]`` and
            ``state["errors"]["syntax"]``.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    analysis_prompt = PromptTemplate(
        template=TEMPLATE_SYNTAX_ANALYSIS,
        input_variables=["generated_code", "errors"],
    )
    pipeline = analysis_prompt | llm_model | StrOutputParser()
    payload = {
        "generated_code": state["generated_code"],
        "errors": state["errors"]["syntax"],
    }
    return pipeline.invoke(payload)
def execution_focused_analysis(state: dict, llm_model) -> str:
    """
    Run the execution-analysis prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]``,
            ``state["errors"]["execution"]``, ``state["html_code"]`` and
            ``state["html_analysis"]``.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    analysis_prompt = PromptTemplate(
        template=TEMPLATE_EXECUTION_ANALYSIS,
        input_variables=["generated_code", "errors", "html_code", "html_analysis"],
    )
    pipeline = analysis_prompt | llm_model | StrOutputParser()
    payload = {
        "generated_code": state["generated_code"],
        "errors": state["errors"]["execution"],
        "html_code": state["html_code"],
        "html_analysis": state["html_analysis"],
    }
    return pipeline.invoke(payload)
def validation_focused_analysis(state: dict, llm_model) -> str:
    """
    Run the validation-analysis prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]``,
            ``state["errors"]["validation"]``, ``state["json_schema"]`` and
            ``state["execution_result"]``.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    analysis_prompt = PromptTemplate(
        template=TEMPLATE_VALIDATION_ANALYSIS,
        input_variables=["generated_code", "errors", "json_schema", "execution_result"],
    )
    pipeline = analysis_prompt | llm_model | StrOutputParser()
    payload = {
        "generated_code": state["generated_code"],
        "errors": state["errors"]["validation"],
        "json_schema": state["json_schema"],
        "execution_result": state["execution_result"],
    }
    return pipeline.invoke(payload)
def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], llm_model) -> str:
    """
    Run the semantic-analysis prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]``.
        comparison_result (Dict[str, Any]): Reads the ``"differences"`` entry
            (serialised as indented JSON) and the ``"explanation"`` entry.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    analysis_prompt = PromptTemplate(
        template=TEMPLATE_SEMANTIC_ANALYSIS,
        input_variables=["generated_code", "differences", "explanation"],
    )
    pipeline = analysis_prompt | llm_model | StrOutputParser()
    payload = {
        "generated_code": state["generated_code"],
        "differences": json.dumps(comparison_result["differences"], indent=2),
        "explanation": comparison_result["explanation"],
    }
    return pipeline.invoke(payload)

View File

@ -0,0 +1,45 @@
"""
This module contains the code generation functions for correcting different types of errors.
"""
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import json
from ..prompts import (
TEMPLATE_SYNTAX_CODE_GENERATION, TEMPLATE_EXECUTION_CODE_GENERATION,
TEMPLATE_VALIDATION_CODE_GENERATION, TEMPLATE_SEMANTIC_CODE_GENERATION
)
def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
    """
    Run the syntax code-generation prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]``.
        analysis (str): Analysis text passed to the prompt.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    generation_prompt = PromptTemplate(
        template=TEMPLATE_SYNTAX_CODE_GENERATION,
        input_variables=["analysis", "generated_code"],
    )
    pipeline = generation_prompt | llm_model | StrOutputParser()
    payload = {
        "analysis": analysis,
        "generated_code": state["generated_code"],
    }
    return pipeline.invoke(payload)
def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
    """
    Run the execution code-generation prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]``.
        analysis (str): Analysis text passed to the prompt.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    generation_prompt = PromptTemplate(
        template=TEMPLATE_EXECUTION_CODE_GENERATION,
        input_variables=["analysis", "generated_code"],
    )
    pipeline = generation_prompt | llm_model | StrOutputParser()
    payload = {
        "analysis": analysis,
        "generated_code": state["generated_code"],
    }
    return pipeline.invoke(payload)
def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
    """
    Run the validation code-generation prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]`` and
            ``state["json_schema"]``.
        analysis (str): Analysis text passed to the prompt.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    generation_prompt = PromptTemplate(
        template=TEMPLATE_VALIDATION_CODE_GENERATION,
        input_variables=["analysis", "generated_code", "json_schema"],
    )
    pipeline = generation_prompt | llm_model | StrOutputParser()
    payload = {
        "analysis": analysis,
        "generated_code": state["generated_code"],
        "json_schema": state["json_schema"],
    }
    return pipeline.invoke(payload)
def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
    """
    Run the semantic code-generation prompt through the model.

    Args:
        state (dict): Reads ``state["generated_code"]``, plus
            ``state["execution_result"]`` and ``state["reference_answer"]``,
            both of which are serialised as indented JSON.
        analysis (str): Analysis text passed to the prompt.
        llm_model: Model piped after the prompt in the chain.

    Returns:
        str: The chain output after ``StrOutputParser``.
    """
    generation_prompt = PromptTemplate(
        template=TEMPLATE_SEMANTIC_CODE_GENERATION,
        input_variables=["analysis", "generated_code", "generated_result", "reference_result"],
    )
    pipeline = generation_prompt | llm_model | StrOutputParser()
    payload = {
        "analysis": analysis,
        "generated_code": state["generated_code"],
        "generated_result": json.dumps(state["execution_result"], indent=2),
        "reference_result": json.dumps(state["reference_answer"], indent=2),
    }
    return pipeline.invoke(payload)

View File

@ -0,0 +1,30 @@
"""
Utility functions for comparing the content of two dictionaries.
"""
from typing import Any, Dict, List
def normalize_dict(d: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a normalised copy of a dictionary.

    String values are lower-cased and stripped, nested dictionaries are
    normalised recursively, lists are handed to ``normalize_list`` and every
    other value is kept as is. Keys are left untouched.

    Args:
        d (Dict[str, Any]): The dictionary to normalise.

    Returns:
        Dict[str, Any]: A new dictionary with normalised values.
    """
    def _clean(value: Any) -> Any:
        if isinstance(value, str):
            return value.lower().strip()
        if isinstance(value, dict):
            return normalize_dict(value)
        if isinstance(value, list):
            return normalize_list(value)
        return value

    return {key: _clean(value) for key, value in d.items()}
def normalize_list(lst: List[Any]) -> List[Any]:
    """
    Build a normalised copy of a list.

    Dictionaries are handed to ``normalize_dict``, nested lists are
    normalised recursively, strings are lower-cased and stripped, and every
    other item is kept as is. Item order is preserved.

    Args:
        lst (List[Any]): The list to normalise.

    Returns:
        List[Any]: A new list with normalised items.
    """
    cleaned = []
    for entry in lst:
        if isinstance(entry, dict):
            cleaned.append(normalize_dict(entry))
        elif isinstance(entry, list):
            cleaned.append(normalize_list(entry))
        elif isinstance(entry, str):
            cleaned.append(entry.lower().strip())
        else:
            cleaned.append(entry)
    return cleaned
def are_content_equal(generated_result: Dict[str, Any], reference_result: Dict[str, Any]) -> bool:
    """
    Tell whether two dictionaries are equal once both are normalised.

    Args:
        generated_result (Dict[str, Any]): The first dictionary.
        reference_result (Dict[str, Any]): The second dictionary.

    Returns:
        bool: True when ``normalize_dict`` yields equal results for both.
    """
    normalized_generated = normalize_dict(generated_result)
    normalized_reference = normalize_dict(reference_result)
    return normalized_generated == normalized_reference

View File

@ -0,0 +1,36 @@
"""
This utility function transforms the pydantic schema into a more comprehensible schema.
"""
def transform_schema(pydantic_schema):
    """
    Transform the pydantic schema into a more comprehensible JSON schema.

    Typed scalar fields become ``{"type": ..., "description": ...}``, arrays
    become a one-element list holding either the item type name or the
    expanded referenced model, and ``$ref`` fields are expanded in place.
    Properties that carry neither a ``type`` nor a ``$ref`` are omitted.

    Args:
        pydantic_schema (dict): The pydantic schema. Referenced models may be
            stored under ``definitions`` or under ``$defs``.

    Returns:
        dict: The transformed JSON schema.

    Raises:
        KeyError: If a ``$ref`` points at a model that is not present in the
            schema's definitions.
    """
    # Pydantic v1 emits referenced models under "definitions", while
    # Pydantic v2 emits them under "$defs"; accept either spelling.
    definitions = (
        pydantic_schema.get('definitions')
        or pydantic_schema.get('$defs')
        or {}
    )

    def resolve_ref(ref):
        # Only the last path segment of the reference names the model.
        ref_key = ref.split('/')[-1]
        return definitions[ref_key]['properties']

    def process_properties(properties):
        result = {}
        for key, value in properties.items():
            if 'type' in value:
                if value['type'] == 'array':
                    items = value['items']
                    if '$ref' in items:
                        result[key] = [process_properties(resolve_ref(items['$ref']))]
                    else:
                        result[key] = [items['type']]
                else:
                    result[key] = {
                        "type": value['type'],
                        "description": value.get('description', '')
                    }
            elif '$ref' in value:
                result[key] = process_properties(resolve_ref(value['$ref']))
        return result

    return process_properties(pydantic_schema['properties'])